crawler: respect robots.txt

Drew DeVault 2022-07-08 20:30:09 +02:00
parent eb6769c904
commit d6bc032d24
2 changed files with 40 additions and 9 deletions
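The diff below wires the crawler up to github.com/temoto/robotstxt. As a rough, self-contained sketch of how that library is typically used (the domain and path here are placeholders, not part of the commit), fetching and consulting a site's robots.txt looks something like this:

package main

import (
    "fmt"
    "log"
    "net/http"

    "github.com/temoto/robotstxt"
)

func main() {
    // Fetch the site's robots.txt (placeholder domain).
    resp, err := http.Get("https://example.org/robots.txt")
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    // Parse it; FromResponse also takes the HTTP status into account.
    robots, err := robotstxt.FromResponse(resp)
    if err != nil {
        log.Fatal(err)
    }

    // Select the rule group matching this crawler's user agent.
    group := robots.FindGroup("SearchHut Bot 0.0; https://sr.ht/~sircmpwn/searchhut")

    // Ask whether a path may be crawled, and read any Crawl-delay directive.
    fmt.Println("allowed:", group.Test("/some/path"))
    fmt.Println("crawl delay:", group.CrawlDelay)
}

The crawler stores the *robotstxt.Group returned by FindGroup and consults Test and CrawlDelay before scheduling each URL, as the hunks below show.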

@@ -9,17 +9,20 @@ import (
"net/url"
"time"
//"github.com/temoto/robotstxt" // TODO
"github.com/temoto/robotstxt"
"golang.org/x/net/html"
"git.sr.ht/~sircmpwn/searchhut/database"
)
+const USER_AGENT = "SearchHut Bot 0.0; https://sr.ht/~sircmpwn/searchhut"
type Crawler struct {
Client *http.Client
Domain string
DomainID int
Delay time.Duration
+Robots *robotstxt.Group
db *sql.DB
seen map[string]struct{}
@@ -41,7 +44,9 @@ func NewCrawler(db *sql.DB, domain string) *Crawler {
Client: client,
Domain: domain,
DomainID: domainID,
-Delay: 1 * time.Second, // TODO: Increase me
+// TODO: Dynamic crawl delay based on remote performance
+Delay: 2 * time.Second,
+Robots: nil,
db: db,
seen: make(map[string]struct{}),
@@ -51,29 +56,44 @@ func NewCrawler(db *sql.DB, domain string) *Crawler {
func (c *Crawler) Crawl() {
log.Printf("Indexing %s (domain %d)", c.Domain, c.DomainID)
ctx := database.Context(context.Background(), c.db)
-url, err := url.Parse(fmt.Sprintf("https://%s", c.Domain))
+url, err := url.Parse(fmt.Sprintf("https://%s/robots.txt", c.Domain))
if err != nil {
-panic(err)
+log.Fatal(err)
}
+resp, err := c.Get(ctx, url)
+robots, err := robotstxt.FromResponse(resp)
+resp.Body.Close()
+if err == nil {
+log.Println("Found applicable robots.txt")
+c.Robots = robots.FindGroup(USER_AGENT)
+}
+if c.Robots != nil && c.Robots.CrawlDelay != 0 {
+c.Delay = c.Robots.CrawlDelay
+}
+url, err = url.Parse(fmt.Sprintf("https://%s", c.Domain))
+if err != nil {
+log.Fatal(err)
+}
c.Schedule(url)
for len(c.schedule) != 0 {
next := c.schedule[0]
c.schedule = c.schedule[1:]
if err := c.Index(ctx, next); err != nil {
-log.Fatal(err)
+log.Println(err)
}
time.Sleep(c.Delay)
}
}
func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error) {
-req, err := http.NewRequestWithContext(ctx,
-"GET", url.String(), http.NoBody)
+req, err := http.NewRequestWithContext(ctx, "GET", url.String(), http.NoBody)
if err != nil {
return nil, err
}
ua := "SearchHut Bot 0.0; https://sr.ht/~sircmpwn/searchhut"
req.Header.Add("User-Agent", ua)
req.Header.Add("User-Agent", USER_AGENT)
return c.Client.Do(req)
}
@@ -81,6 +101,9 @@ func (c *Crawler) Schedule(url *url.URL) {
if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
return
}
+if c.Robots != nil && !c.Robots.Test(url.Path) {
+return
+}
trimmed := *url
trimmed.RawQuery = ""
trimmed.Fragment = ""

@@ -17,7 +17,15 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
log.Println(url.String())
resp, err := c.Get(ctx, url)
if err != nil {
return err
}
defer resp.Body.Close()
+if resp.StatusCode != 200 {
+log.Printf("Unexpected status code %d", resp.StatusCode)
+return nil
+}
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
return nil