2022-07-08 19:46:11 +02:00
|
|
|
package crawler
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"database/sql"
|
|
|
|
"fmt"
|
|
|
|
"log"
|
|
|
|
"net/http"
|
|
|
|
"net/url"
|
|
|
|
"time"
|
|
|
|
|
2022-07-08 20:30:09 +02:00
|
|
|
"github.com/temoto/robotstxt"
|
2022-07-08 19:46:11 +02:00
|
|
|
"golang.org/x/net/html"
|
|
|
|
|
|
|
|
"git.sr.ht/~sircmpwn/searchhut/database"
|
|
|
|
)
|
|
|
|
|
2022-07-08 20:30:09 +02:00
|
|
|
// USER_AGENT identifies the crawler to remote servers; it is sent on
// every request and matched against robots.txt user-agent groups.
const USER_AGENT = "SearchHut Bot 0.0; https://sr.ht/~sircmpwn/searchhut"
|
|
|
|
|
2022-07-08 19:46:11 +02:00
|
|
|
// Crawler indexes a single domain, fetching pages over HTTPS while
// respecting the site's robots.txt rules and a per-request crawl delay.
type Crawler struct {
	// Client is the HTTP client used for every fetch (configured with a
	// 5-second timeout by NewCrawler).
	Client *http.Client
	// Domain is the hostname being crawled, e.g. "example.org".
	Domain string
	// DomainID is the primary key of this domain's row in the database.
	DomainID int
	// Delay is the pause between successive fetches; it may be raised by
	// a crawl-delay directive found in the site's robots.txt.
	Delay time.Duration
	// Robots holds the robots.txt rule group matching USER_AGENT, or nil
	// when no applicable robots.txt was found.
	Robots *robotstxt.Group

	// db is the database handle used for index storage and lookups.
	db *sql.DB
	// seen records canonicalized URLs (query and fragment stripped) that
	// have already been scheduled, so each page is queued at most once.
	seen map[string]struct{}
	// schedule is the FIFO queue of URLs awaiting indexing.
	schedule []*url.URL
}
|
|
|
|
|
|
|
|
func NewCrawler(db *sql.DB, domain string) *Crawler {
|
|
|
|
var domainID int
|
|
|
|
row := db.QueryRow(`SELECT id FROM domain WHERE hostname = $1`, domain)
|
|
|
|
if err := row.Scan(&domainID); err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
client := &http.Client{
|
|
|
|
Timeout: 5 * time.Second,
|
|
|
|
}
|
|
|
|
|
|
|
|
return &Crawler{
|
|
|
|
Client: client,
|
|
|
|
Domain: domain,
|
|
|
|
DomainID: domainID,
|
2022-07-08 20:30:09 +02:00
|
|
|
// TODO: Dynamic crawl delay based on remote performance
|
|
|
|
Delay: 2 * time.Second,
|
|
|
|
Robots: nil,
|
2022-07-08 19:46:11 +02:00
|
|
|
|
|
|
|
db: db,
|
|
|
|
seen: make(map[string]struct{}),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *Crawler) Crawl() {
|
|
|
|
log.Printf("Indexing %s (domain %d)", c.Domain, c.DomainID)
|
|
|
|
ctx := database.Context(context.Background(), c.db)
|
2022-07-08 20:30:09 +02:00
|
|
|
|
|
|
|
url, err := url.Parse(fmt.Sprintf("https://%s/robots.txt", c.Domain))
|
|
|
|
if err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
|
|
|
resp, err := c.Get(ctx, url)
|
|
|
|
robots, err := robotstxt.FromResponse(resp)
|
|
|
|
resp.Body.Close()
|
|
|
|
if err == nil {
|
|
|
|
log.Println("Found applicable robots.txt")
|
|
|
|
c.Robots = robots.FindGroup(USER_AGENT)
|
|
|
|
}
|
|
|
|
if c.Robots != nil && c.Robots.CrawlDelay != 0 {
|
|
|
|
c.Delay = c.Robots.CrawlDelay
|
|
|
|
}
|
|
|
|
|
|
|
|
url, err = url.Parse(fmt.Sprintf("https://%s", c.Domain))
|
2022-07-08 19:46:11 +02:00
|
|
|
if err != nil {
|
2022-07-08 20:30:09 +02:00
|
|
|
log.Fatal(err)
|
2022-07-08 19:46:11 +02:00
|
|
|
}
|
|
|
|
c.Schedule(url)
|
2022-07-08 20:30:09 +02:00
|
|
|
|
2022-07-08 19:46:11 +02:00
|
|
|
for len(c.schedule) != 0 {
|
|
|
|
next := c.schedule[0]
|
|
|
|
c.schedule = c.schedule[1:]
|
|
|
|
if err := c.Index(ctx, next); err != nil {
|
2022-07-08 20:30:09 +02:00
|
|
|
log.Println(err)
|
2022-07-08 19:46:11 +02:00
|
|
|
}
|
|
|
|
time.Sleep(c.Delay)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error) {
|
2022-07-08 20:30:09 +02:00
|
|
|
req, err := http.NewRequestWithContext(ctx, "GET", url.String(), http.NoBody)
|
2022-07-08 19:46:11 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2022-07-08 20:30:09 +02:00
|
|
|
req.Header.Add("User-Agent", USER_AGENT)
|
2022-07-08 19:46:11 +02:00
|
|
|
return c.Client.Do(req)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *Crawler) Schedule(url *url.URL) {
|
|
|
|
if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
|
|
|
|
return
|
|
|
|
}
|
2022-07-08 20:30:09 +02:00
|
|
|
if c.Robots != nil && !c.Robots.Test(url.Path) {
|
|
|
|
return
|
|
|
|
}
|
2022-07-08 19:46:11 +02:00
|
|
|
trimmed := *url
|
|
|
|
trimmed.RawQuery = ""
|
2022-07-08 20:04:37 +02:00
|
|
|
trimmed.Fragment = ""
|
2022-07-08 19:46:11 +02:00
|
|
|
if _, seen := c.seen[trimmed.String()]; seen {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
c.seen[trimmed.String()] = struct{}{}
|
|
|
|
c.schedule = append(c.schedule, &trimmed)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *Crawler) ScheduleLinks(from *url.URL, node *html.Node) {
|
|
|
|
if node.Type == html.ElementNode && node.Data == "a" {
|
|
|
|
for _, attr := range node.Attr {
|
|
|
|
if attr.Key == "href" {
|
|
|
|
url, err := url.Parse(attr.Val)
|
|
|
|
if err == nil {
|
|
|
|
c.Schedule(from.ResolveReference(url))
|
|
|
|
}
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
|
|
|
c.ScheduleLinks(from, child)
|
|
|
|
}
|
|
|
|
}
|