searchhut/crawler/crawler.go
Umar Getagazov: Put domain labels minus eTLD into the text index
Before, only the full hostname (say, harelang.org) was indexed, so no
results appeared for a "harelang" query. Now all domain labels minus
the eTLD are indexed separately (for example, "docs" and "harelang" for
"docs.harelang.org"), and such queries work. The eTLD is stripped using
data from Mozilla's Public Suffix List (https://publicsuffix.org).
2022-07-11 17:48:46 +02:00
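
For illustration, the getDomainLabels helper at the bottom of this file
behaves like this (example inputs chosen here, not from the commit):

	getDomainLabels("docs.harelang.org") // ["docs", "harelang"]
	getDomainLabels("www.example.co.uk") // ["example"] (a leading "www." is also dropped)
	getDomainLabels("harelang.org")      // ["harelang"]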

package crawler

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/lib/pq"
	"github.com/temoto/robotstxt"
	"golang.org/x/net/html"
	"golang.org/x/net/publicsuffix"

	"git.sr.ht/~sircmpwn/searchhut/database"
)
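
// Crawler crawls a single domain over HTTPS, honoring its robots.txt,
// the domain's exclusion patterns, and a delay between requests.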
type Crawler struct {
	Client     *http.Client
	Domain     string
	DomainID   int
	Exclude    []*regexp.Regexp
	Delay      time.Duration
	RetryAfter time.Duration
	Robots     *robotstxt.Group
	UserAgent  string
	Start      time.Time

	db       *sql.DB
	seen     map[string]struct{}
	schedule []*url.URL
	labels   string
}
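
// NewCrawler looks up the domain's ID and exclusion patterns in the
// database and returns a crawler with a default five-second delay. It
// terminates the process if the domain is unknown or a pattern does not
// compile.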
func NewCrawler(ua string, db *sql.DB, domain string) *Crawler {
	var (
		domainID    int
		exclPatsStr pq.StringArray
		exclPats    = []*regexp.Regexp{}
	)
	row := db.QueryRow(`SELECT id, exclusion_patterns FROM domain WHERE hostname = $1`, domain)
	if err := row.Scan(&domainID, &exclPatsStr); err != nil {
		log.Fatal(err)
	}
	for _, patStr := range exclPatsStr {
		pat, err := regexp.Compile(patStr)
		if err != nil {
			log.Fatal(err)
		}
		exclPats = append(exclPats, pat)
	}

	client := &http.Client{
		Timeout: 10 * time.Second,
	}
	return &Crawler{
		Client:   client,
		Domain:   domain,
		DomainID: domainID,
		Exclude:  exclPats,
		// TODO: Dynamic crawl delay based on remote performance
		Delay:     5 * time.Second,
		Robots:    nil,
		UserAgent: ua,
		db:        db,
		seen:      make(map[string]struct{}),
		labels:    strings.Join(getDomainLabels(domain), " "),
	}
}
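
// Crawl fetches robots.txt, then walks the domain breadth-first from its
// root page, indexing each scheduled URL. When the queue is exhausted,
// it records the crawl date and duration in the domain table.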
func (c *Crawler) Crawl() {
	c.Start = time.Now().UTC()
	log.Printf("Indexing %s (domain %d)", c.Domain, c.DomainID)
	ctx := database.Context(context.Background(), c.db)

	// Fetch robots.txt and honor its rules and Crawl-delay, if present.
	url, err := url.Parse(fmt.Sprintf("https://%s/robots.txt", c.Domain))
	if err != nil {
		log.Fatal(err)
	}
	resp, err := c.Get(ctx, url)
	if err != nil {
		log.Fatal(err)
	}
	robots, err := robotstxt.FromResponse(resp)
	resp.Body.Close()
	if err == nil {
		log.Println("Found applicable robots.txt")
		c.Robots = robots.FindGroup(c.UserAgent)
	}
	if c.Robots != nil && c.Robots.CrawlDelay != 0 {
		c.Delay = c.Robots.CrawlDelay
	}

	// Crawl breadth-first from the root page.
	url, err = url.Parse(fmt.Sprintf("https://%s", c.Domain))
	if err != nil {
		log.Fatal(err)
	}
	c.Schedule(url)
	for len(c.schedule) != 0 {
		next := c.schedule[0]
		c.schedule = c.schedule[1:]
		if err := c.Index(ctx, next); err != nil {
			log.Println(err)
		}
		time.Sleep(c.Delay)
		// RetryAfter may be set during indexing; sleep it off once, then reset.
		time.Sleep(c.RetryAfter)
		c.RetryAfter = 0
	}

	// Record when this crawl ran and how long it took.
	duration := time.Now().UTC().Sub(c.Start)
	if err := database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
			UPDATE domain
			SET last_index_date = $2, crawl_duration = $3::interval
			WHERE id = $1;
		`, c.DomainID, c.Start,
			fmt.Sprintf("%d seconds", int(duration.Seconds())))
		return err
	}); err != nil {
		log.Fatal(err)
	}
}
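
// Get performs an HTTP GET request for url with the crawler's User-Agent.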
func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, "GET", url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}
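
// Head performs an HTTP HEAD request for url with the crawler's User-Agent.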
func (c *Crawler) Head(ctx context.Context, url *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, "HEAD", url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}
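
// Schedule queues url for indexing unless it falls outside the crawled
// domain, is disallowed by robots.txt or an exclusion pattern, or has
// already been seen. Query strings and fragments are stripped so that
// variants of the same page are crawled only once.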
func (c *Crawler) Schedule(url *url.URL) {
	if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
		return
	}
	if c.Robots != nil && !c.Robots.Test(url.Path) {
		return
	}
	for _, pat := range c.Exclude {
		if pat.MatchString(url.Path) {
			return
		}
	}
	trimmed := *url
	trimmed.RawQuery = ""
	trimmed.Fragment = ""
	if _, seen := c.seen[trimmed.String()]; seen {
		return
	}
	c.seen[trimmed.String()] = struct{}{}
	c.schedule = append(c.schedule, &trimmed)
}
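
// ScheduleLinks walks the parsed HTML document rooted at node and
// schedules the target of every <a href> it finds, resolved against the
// URL of the page it was found on.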
func (c *Crawler) ScheduleLinks(from *url.URL, node *html.Node) {
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, attr := range node.Attr {
			if attr.Key == "href" {
				url, err := url.Parse(attr.Val)
				if err == nil {
					c.Schedule(from.ResolveReference(url))
				}
				break
			}
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		c.ScheduleLinks(from, child)
	}
}
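
// getDomainLabels splits a hostname into its labels, dropping the
// effective TLD (per Mozilla's Public Suffix List) and any leading
// "www.", so that "docs.harelang.org" yields ["docs", "harelang"].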
func getDomainLabels(domain string) []string {
	etld, _ := publicsuffix.PublicSuffix(domain)
	if len(domain) > len(etld)+1 {
		domain = domain[:len(domain)-len(etld)-1]
	}
	domain = strings.TrimPrefix(domain, "www.")
	return strings.Split(domain, ".")
}
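
// A minimal usage sketch (not part of the package; assumes a reachable
// PostgreSQL database with a populated domain table, and a hypothetical
// connString and user agent):
//
//	db, err := sql.Open("postgres", connString)
//	if err != nil {
//		log.Fatal(err)
//	}
//	c := NewCrawler("searchhut", db, "harelang.org")
//	c.Crawl()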