// Package crawler walks the HTTPS pages of a single domain, following the
// links it discovers and handing each page to the indexer.
package crawler

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"time"

	//"github.com/temoto/robotstxt" // TODO
	"golang.org/x/net/html"

	"git.sr.ht/~sircmpwn/searchhut/database"
)

// Crawler holds the state of a crawl restricted to one domain.
type Crawler struct {
	// Client performs every HTTP request issued through Get.
	Client *http.Client
	// Domain is the only host whose URLs are accepted by Schedule.
	Domain string
	// DomainID is the id of the domain table row matching Domain.
	DomainID int
	// Delay is how long Crawl sleeps after indexing each URL.
	Delay time.Duration

	db *sql.DB
	// seen records the string form of every URL that has been queued, so
	// that each one is queued at most once.
	seen map[string]struct{}
	// schedule is the FIFO queue of URLs waiting to be indexed.
	schedule []*url.URL
}

// NewCrawler returns a Crawler for domain. The domain's id is read from the
// domain table by hostname; if that lookup fails the error is passed to
// log.Fatal, which terminates the process.
func NewCrawler(db *sql.DB, domain string) *Crawler {
	var id int
	err := db.QueryRow(`SELECT id FROM domain WHERE hostname = $1`, domain).Scan(&id)
	if err != nil {
		log.Fatal(err)
	}

	return &Crawler{
		Client:   &http.Client{Timeout: 5 * time.Second},
		Domain:   domain,
		DomainID: id,
		Delay:    1 * time.Second, // TODO: Increase me

		db:   db,
		seen: make(map[string]struct{}),
	}
}

// Crawl queues https://<Domain> and then indexes queued URLs in FIFO order
// until the queue is empty, sleeping for Delay after each one. Index is
// defined outside this file; any error it returns is passed to log.Fatal,
// which terminates the process.
func (c *Crawler) Crawl() {
	log.Printf("Indexing %s (domain %d)", c.Domain, c.DomainID)
	// NOTE(review): database.Context presumably attaches c.db to the context
	// for use by Index; confirm against the database package.
	ctx := database.Context(context.Background(), c.db)

	root, err := url.Parse(fmt.Sprintf("https://%s", c.Domain))
	if err != nil {
		panic(err)
	}
	c.Schedule(root)

	for len(c.schedule) > 0 {
		// Pop the head before indexing: anything scheduled while this URL
		// is being indexed must land behind the remaining entries.
		head, rest := c.schedule[0], c.schedule[1:]
		c.schedule = rest

		if err := c.Index(ctx, head); err != nil {
			log.Fatal(err)
		}
		time.Sleep(c.Delay)
	}
}

// Get issues a body-less GET request for url using c.Client, identifying the
// crawler through the User-Agent header. The response is returned exactly as
// the client produced it; closing its body is left to the caller.
func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error) {
	const userAgent = "SearchHut Bot 0.0; https://sr.ht/~sircmpwn/searchhut"

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", userAgent)
	return c.Client.Do(req)
}

// Schedule appends url to the crawl queue unless it is ineligible or has
// already been queued. A URL is eligible only when it carries no user info,
// its host equals c.Domain and its scheme is "https". The query string and
// fragment are dropped from a copy before de-duplication and queueing; the
// caller's URL is left untouched.
func (c *Crawler) Schedule(url *url.URL) {
	eligible := url.User == nil && url.Host == c.Domain && url.Scheme == "https"
	if !eligible {
		return
	}

	target := *url
	target.RawQuery = ""
	target.Fragment = ""

	key := target.String()
	if _, dup := c.seen[key]; dup {
		return
	}
	c.seen[key] = struct{}{}
	c.schedule = append(c.schedule, &target)
}

// ScheduleLinks walks the HTML tree rooted at node depth-first, in document
// order, and passes the target of every <a> element to Schedule after
// resolving it against from. Only the first href attribute of an element is
// considered, and an href that fails to parse is skipped.
func (c *Crawler) ScheduleLinks(from *url.URL, node *html.Node) {
	if node.Type == html.ElementNode && node.Data == "a" {
		for i := range node.Attr {
			if node.Attr[i].Key != "href" {
				continue
			}
			if ref, err := url.Parse(node.Attr[i].Val); err == nil {
				c.Schedule(from.ResolveReference(ref))
			}
			break
		}
	}

	for kid := node.FirstChild; kid != nil; kid = kid.NextSibling {
		c.ScheduleLinks(from, kid)
	}
}