searchhut/crawler/crawler.go

package crawler

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/lib/pq"
	"github.com/temoto/robotstxt"
	"golang.org/x/net/html"
	"golang.org/x/net/publicsuffix"

	"git.sr.ht/~sircmpwn/searchhut/database"
)
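
// Crawler holds the state for indexing a single domain: the HTTP client, the
// domain's configuration and robots.txt rules, and the queue of URLs
// scheduled for indexing.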
type Crawler struct {
	Client        *http.Client
	Domain        string
	DomainID      int
	Authoritative bool
	Exclude       []*regexp.Regexp
	Delay         time.Duration
	RetryAfter    time.Duration
	Robots        *robotstxt.Group
	UserAgent     string
	Start         time.Time

	db       *sql.DB
	seen     map[string]struct{}
	schedule []*url.URL
	labels   string
}
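
// NewCrawler loads the domain's ID, authoritative flag, and exclusion
// patterns from the database and returns a crawler with a ten-second HTTP
// timeout and a default five-second crawl delay.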
func NewCrawler(ua string, db *sql.DB, domain string) *Crawler {
	var (
		domainID      int
		authoritative bool
		exclPatsStr   pq.StringArray
		exclPats      = []*regexp.Regexp{}
	)
	row := db.QueryRow(`SELECT id, authoritative, exclusion_patterns FROM domain WHERE hostname = $1`, domain)
	if err := row.Scan(&domainID, &authoritative, &exclPatsStr); err != nil {
		log.Fatal(err)
	}

	for _, patStr := range exclPatsStr {
		pat, err := regexp.Compile(patStr)
		if err != nil {
			log.Fatal(err)
		}
		exclPats = append(exclPats, pat)
	}

	client := &http.Client{
		Timeout: 10 * time.Second,
	}

	return &Crawler{
		Client:        client,
		Domain:        domain,
		DomainID:      domainID,
		Authoritative: authoritative,
		Exclude:       exclPats,
		// TODO: Dynamic crawl delay based on remote performance
		Delay:     5 * time.Second,
		Robots:    nil,
		UserAgent: ua,
		db:        db,
		seen:      make(map[string]struct{}),
		labels:    strings.Join(getDomainLabels(domain), " "),
	}
}
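
// Crawl fetches the domain's robots.txt, seeds the schedule with the site
// root when nothing was scheduled in advance, indexes each scheduled URL in
// turn while observing the crawl delay, and finally records the index date
// and crawl duration for the domain.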
func (c *Crawler) Crawl() {
	c.Start = time.Now().UTC()
	log.Printf("Indexing %s (domain %d)", c.Domain, c.DomainID)
	ctx := database.Context(context.Background(), c.db)

	url, err := url.Parse(fmt.Sprintf("https://%s/robots.txt", c.Domain))
	if err != nil {
		log.Fatal(err)
	}
	resp, err := c.Get(ctx, url)
	if err != nil {
		log.Fatal(err)
	}
	robots, err := robotstxt.FromResponse(resp)
	resp.Body.Close()
	if err == nil {
		log.Println("Found applicable robots.txt")
		c.Robots = robots.FindGroup(c.UserAgent)
	}
	if c.Robots != nil && c.Robots.CrawlDelay != 0 {
		c.Delay = c.Robots.CrawlDelay
	}

	// If no URLs were scheduled ahead of time, fall back to crawling from
	// the site root.
	if len(c.schedule) == 0 {
		url, err = url.Parse(fmt.Sprintf("https://%s", c.Domain))
		if err != nil {
			log.Fatal(err)
		}
		c.Schedule(url)
	} else {
		log.Println("Not indexing from root")
	}

	for len(c.schedule) != 0 {
		next := c.schedule[0]
		c.schedule = c.schedule[1:]
		if err := c.Index(ctx, next); err != nil {
			log.Println(err)
		}
		// Observe the crawl delay plus any additional retry delay, then
		// reset the latter.
		time.Sleep(c.Delay)
		time.Sleep(c.RetryAfter)
		c.RetryAfter = 0
	}

	duration := time.Now().UTC().Sub(c.Start)
	if err := database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
			UPDATE domain
			SET last_index_date = $2, crawl_duration = $3::interval
			WHERE id = $1;
		`, c.DomainID, c.Start,
			fmt.Sprintf("%d seconds", int(duration.Seconds())))
		return err
	}); err != nil {
		log.Fatal(err)
	}
}
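
// Get issues a GET request for the given URL using the crawler's User-Agent.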
func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, "GET", url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}
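
// Head issues a HEAD request for the given URL using the crawler's User-Agent.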
func (c *Crawler) Head(ctx context.Context, url *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, "HEAD", url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}
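
// Schedule queues a URL for indexing if it is an https URL on this domain
// without userinfo, is permitted by robots.txt and the exclusion patterns,
// and has not already been seen.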
func (c *Crawler) Schedule(url *url.URL) {
	if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
		return
	}
	if c.Robots != nil && !c.Robots.Test(url.Path) {
		return
	}
	for _, pat := range c.Exclude {
		if pat.MatchString(url.Path) {
			return
		}
	}

	// Normalize by dropping the query string and fragment, and only schedule
	// each normalized URL once.
	trimmed := *url
	trimmed.RawQuery = ""
	trimmed.Fragment = ""
	if _, seen := c.seen[trimmed.String()]; seen {
		return
	}
	c.seen[trimmed.String()] = struct{}{}
	c.schedule = append(c.schedule, &trimmed)
}
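
// ScheduleLinks walks the parsed HTML document and schedules the target of
// every anchor's href attribute, resolved against the page it was found on.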
func (c *Crawler) ScheduleLinks(from *url.URL, node *html.Node) {
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, attr := range node.Attr {
			if attr.Key == "href" {
				url, err := url.Parse(attr.Val)
				if err == nil {
					c.Schedule(from.ResolveReference(url))
				}
				break
			}
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		c.ScheduleLinks(from, child)
	}
}
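
// getDomainLabels splits the hostname into its labels, dropping the public
// suffix and any leading "www.", e.g. "docs.example.org" yields
// ["docs", "example"].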
func getDomainLabels(domain string) []string {
	etld, _ := publicsuffix.PublicSuffix(domain)
	if len(domain) > len(etld)+1 {
		domain = domain[:len(domain)-len(etld)-1]
	}
	domain = strings.TrimPrefix(domain, "www.")
	return strings.Split(domain, ".")
}