// Package crawler implements a per-domain web crawler that honors
// robots.txt and applies a delay between requests.
package crawler

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"time"

	"github.com/temoto/robotstxt"
	"golang.org/x/net/html"

	"git.sr.ht/~sircmpwn/searchhut/database"
)

// Crawler crawls a single domain: it fetches robots.txt first, then indexes
// pages breadth-first from a FIFO schedule, sleeping for Delay between
// requests.
type Crawler struct {
	Client    *http.Client
	Domain    string
	DomainID  int
	Delay     time.Duration
	Robots    *robotstxt.Group
	UserAgent string

	db       *sql.DB
	seen     map[string]struct{}
	schedule []*url.URL
}

// NewCrawler looks up the domain's ID in the database and returns a Crawler
// with a default 5-second crawl delay.
func NewCrawler(ua string, db *sql.DB, domain string) *Crawler {
	var domainID int
	row := db.QueryRow(`SELECT id FROM domain WHERE hostname = $1`, domain)
	if err := row.Scan(&domainID); err != nil {
		log.Fatal(err)
	}

	client := &http.Client{
		Timeout: 5 * time.Second,
	}

	return &Crawler{
		Client:   client,
		Domain:   domain,
		DomainID: domainID,
		// TODO: Dynamic crawl delay based on remote performance
		Delay:     5 * time.Second,
		Robots:    nil,
		UserAgent: ua,

		db:   db,
		seen: make(map[string]struct{}),
	}
}

// Crawl fetches robots.txt for the domain, schedules the site root, and then
// drains the schedule, indexing each page and sleeping between requests.
func (c *Crawler) Crawl() {
	log.Printf("Indexing %s (domain %d)", c.Domain, c.DomainID)
	ctx := database.Context(context.Background(), c.db)

	url, err := url.Parse(fmt.Sprintf("https://%s/robots.txt", c.Domain))
	if err != nil {
		log.Fatal(err)
	}

	// robots.txt is optional: only consult it if the request and parse
	// both succeed.
	resp, err := c.Get(ctx, url)
	if err == nil {
		robots, err := robotstxt.FromResponse(resp)
		resp.Body.Close()
		if err == nil {
			log.Println("Found applicable robots.txt")
			c.Robots = robots.FindGroup(c.UserAgent)
		}
	}
	if c.Robots != nil && c.Robots.CrawlDelay != 0 {
		c.Delay = c.Robots.CrawlDelay
	}

	url, err = url.Parse(fmt.Sprintf("https://%s", c.Domain))
	if err != nil {
		log.Fatal(err)
	}

	c.Schedule(url)

	for len(c.schedule) != 0 {
		next := c.schedule[0]
		c.schedule = c.schedule[1:]
		if err := c.Index(ctx, next); err != nil {
			log.Println(err)
		}
		time.Sleep(c.Delay)
	}
}

// Get issues a GET request for the given URL with the crawler's User-Agent.
func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, "GET", url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}

// Head issues a HEAD request for the given URL with the crawler's User-Agent.
func (c *Crawler) Head(ctx context.Context, url *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, "HEAD", url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}

// Schedule queues a URL for indexing if it is an https URL on the crawled
// domain, carries no user info, is allowed by robots.txt, and has not been
// seen before. The query string and fragment are stripped before
// deduplication.
func (c *Crawler) Schedule(url *url.URL) {
	if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
		return
	}
	if c.Robots != nil && !c.Robots.Test(url.Path) {
		return
	}
	trimmed := *url
	trimmed.RawQuery = ""
	trimmed.Fragment = ""
	if _, seen := c.seen[trimmed.String()]; seen {
		return
	}
	c.seen[trimmed.String()] = struct{}{}
	c.schedule = append(c.schedule, &trimmed)
}

// ScheduleLinks walks the parsed HTML tree and schedules the target of each
// anchor's href attribute, resolved against the page it was found on.
func (c *Crawler) ScheduleLinks(from *url.URL, node *html.Node) {
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, attr := range node.Attr {
			if attr.Key == "href" {
				url, err := url.Parse(attr.Val)
				if err == nil {
					c.Schedule(from.ResolveReference(url))
				}
				break
			}
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		c.ScheduleLinks(from, child)
	}
}
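// Usage sketch (hypothetical; the user agent string, connection string, and
// driver are assumptions, not part of this package): the caller opens the
// database, ensures the target hostname exists in the domain table, then
// runs the crawl loop. Index is defined elsewhere in this package.
//
//	db, err := sql.Open("postgres", connString)
//	if err != nil {
//		log.Fatal(err)
//	}
//	c := NewCrawler("examplebot/0.1", db, "example.org")
//	c.Crawl()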