Previously, only the full hostname (say, harelang.org) was indexed, so a query for "harelang" returned no results. Now all domain labels except the eTLD are indexed separately (for example, "docs" and "harelang" for docs.harelang.org), and such queries work. The eTLD is identified using data from Mozilla's Public Suffix List (https://publicsuffix.org).
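
A quick way to see what the list considers the eTLD of a given hostname is a minimal standalone sketch against the same golang.org/x/net/publicsuffix package the crawler imports (the hostnames here are illustrative inputs, not anything the crawler hardcodes):

package main

import (
	"fmt"

	"golang.org/x/net/publicsuffix"
)

func main() {
	// PublicSuffix reports the effective TLD of a hostname and whether
	// the suffix comes from the ICANN section of the Public Suffix List.
	for _, host := range []string{"harelang.org", "docs.harelang.org"} {
		etld, icann := publicsuffix.PublicSuffix(host)
		fmt.Printf("%s -> eTLD %q (ICANN: %v)\n", host, etld, icann)
	}
}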

196 lines · 4.4 KiB · Go

package crawler

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/lib/pq"
	"github.com/temoto/robotstxt"
	"golang.org/x/net/html"
	"golang.org/x/net/publicsuffix"

	"git.sr.ht/~sircmpwn/searchhut/database"
)
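
// A Crawler indexes a single domain. The exported fields describe the
// crawl configuration; the unexported fields track crawl state: the set
// of URLs already seen, the queue of URLs still to fetch, and the
// indexable labels derived from the hostname.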
type Crawler struct {
	Client     *http.Client
	Domain     string
	DomainID   int
	Exclude    []*regexp.Regexp
	Delay      time.Duration
	RetryAfter time.Duration
	Robots     *robotstxt.Group
	UserAgent  string
	Start      time.Time

	db       *sql.DB
	seen     map[string]struct{}
	schedule []*url.URL
	labels   string
}
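
// NewCrawler looks up the domain's ID and exclusion patterns in the
// database, compiles the patterns, and returns a crawler with a 10 second
// HTTP timeout and a default 5 second crawl delay.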
func NewCrawler(ua string, db *sql.DB, domain string) *Crawler {
	var (
		domainID    int
		exclPatsStr pq.StringArray
		exclPats    = []*regexp.Regexp{}
	)
	row := db.QueryRow(`SELECT id, exclusion_patterns FROM domain WHERE hostname = $1`, domain)
	if err := row.Scan(&domainID, &exclPatsStr); err != nil {
		log.Fatal(err)
	}

	for _, patStr := range exclPatsStr {
		pat, err := regexp.Compile(patStr)
		if err != nil {
			log.Fatal(err)
		}
		exclPats = append(exclPats, pat)
	}

	client := &http.Client{
		Timeout: 10 * time.Second,
	}

	return &Crawler{
		Client:   client,
		Domain:   domain,
		DomainID: domainID,
		Exclude:  exclPats,
		// TODO: Dynamic crawl delay based on remote performance
		Delay:     5 * time.Second,
		Robots:    nil,
		UserAgent: ua,

		db:     db,
		seen:   make(map[string]struct{}),
		labels: strings.Join(getDomainLabels(domain), " "),
	}
}
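
// Crawl fetches and parses the domain's robots.txt, honors any declared
// crawl delay, seeds the queue with the site root, indexes pages
// breadth-first, and finally records the crawl date and duration in the
// domain table.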
func (c *Crawler) Crawl() {
	c.Start = time.Now().UTC()
	log.Printf("Indexing %s (domain %d)", c.Domain, c.DomainID)
	ctx := database.Context(context.Background(), c.db)

	url, err := url.Parse(fmt.Sprintf("https://%s/robots.txt", c.Domain))
	if err != nil {
		log.Fatal(err)
	}
	resp, err := c.Get(ctx, url)
	if err != nil {
		log.Fatal(err)
	}
	robots, err := robotstxt.FromResponse(resp)
	resp.Body.Close()
	if err == nil {
		log.Println("Found applicable robots.txt")
		c.Robots = robots.FindGroup(c.UserAgent)
	}
	if c.Robots != nil && c.Robots.CrawlDelay != 0 {
		c.Delay = c.Robots.CrawlDelay
	}

	url, err = url.Parse(fmt.Sprintf("https://%s", c.Domain))
	if err != nil {
		log.Fatal(err)
	}
	c.Schedule(url)

	for len(c.schedule) != 0 {
		next := c.schedule[0]
		c.schedule = c.schedule[1:]
		if err := c.Index(ctx, next); err != nil {
			log.Println(err)
		}
		time.Sleep(c.Delay)
		time.Sleep(c.RetryAfter)
		c.RetryAfter = 0
	}

	duration := time.Now().UTC().Sub(c.Start)
	if err := database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
			UPDATE domain
			SET last_index_date = $2, crawl_duration = $3::interval
			WHERE id = $1;
		`, c.DomainID, c.Start,
			fmt.Sprintf("%d seconds", int(duration.Seconds())))
		return err
	}); err != nil {
		log.Fatal(err)
	}
}
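
// Get issues a GET request for url with the crawler's User-Agent.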
func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, "GET", url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}
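
// Head issues a HEAD request for url with the crawler's User-Agent.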
func (c *Crawler) Head(ctx context.Context, url *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, "HEAD", url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}
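
// Schedule queues url for indexing unless it is off-domain, non-HTTPS,
// carries credentials, is disallowed by robots.txt or an exclusion
// pattern, or has already been seen. The query string and fragment are
// stripped before the URL is queued.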
func (c *Crawler) Schedule(url *url.URL) {
	if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
		return
	}
	if c.Robots != nil && !c.Robots.Test(url.Path) {
		return
	}
	for _, pat := range c.Exclude {
		if pat.MatchString(url.Path) {
			return
		}
	}
	trimmed := *url
	trimmed.RawQuery = ""
	trimmed.Fragment = ""
	if _, seen := c.seen[trimmed.String()]; seen {
		return
	}
	c.seen[trimmed.String()] = struct{}{}
	c.schedule = append(c.schedule, &trimmed)
}
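
// ScheduleLinks walks the parsed HTML document and schedules the target
// of every <a href> link, resolved against the page it was found on.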
func (c *Crawler) ScheduleLinks(from *url.URL, node *html.Node) {
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, attr := range node.Attr {
			if attr.Key == "href" {
				url, err := url.Parse(attr.Val)
				if err == nil {
					c.Schedule(from.ResolveReference(url))
				}
				break
			}
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		c.ScheduleLinks(from, child)
	}
}
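
// getDomainLabels strips the eTLD (per the Public Suffix List) and any
// leading "www." from domain, then splits what remains into its
// dot-separated labels: "docs.harelang.org" yields ["docs", "harelang"].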
func getDomainLabels(domain string) []string {
	etld, _ := publicsuffix.PublicSuffix(domain)
	if len(domain) > len(etld)+1 {
		domain = domain[:len(domain)-len(etld)-1]
	}
	domain = strings.TrimPrefix(domain, "www.")
	return strings.Split(domain, ".")
}
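
For reference, here is what getDomainLabels produces for a few representative hostnames, written as a table-driven test. This is a sketch for illustration: the test cases are mine, not part of the commit.

package crawler

import (
	"reflect"
	"testing"
)

// TestGetDomainLabels spot-checks the label extraction above.
func TestGetDomainLabels(t *testing.T) {
	cases := map[string][]string{
		"harelang.org":      {"harelang"},
		"docs.harelang.org": {"docs", "harelang"},
		"www.example.com":   {"example"},
	}
	for domain, want := range cases {
		if got := getDomainLabels(domain); !reflect.DeepEqual(got, want) {
			t.Errorf("getDomainLabels(%q) = %v, want %v", domain, got, want)
		}
	}
}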