package crawler

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"mime"
	"net/url"

	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"

	"git.sr.ht/~sircmpwn/searchhut/database"
)

// Index fetches url, and when the response is an HTML document, schedules
// the links found in it and upserts a row for it in the page table.
//
// The page is skipped (nil is returned and nothing is written) when the
// response status is not 200, when the Content-Type header is missing or
// cannot be parsed, or when the media type is anything other than
// "text/html".
//
// A non-nil error is returned when the fetch, the HTML parse, the
// readability extraction, or the database transaction fails. The error
// wraps the underlying cause, so errors.Is and errors.As still work.
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
	// Inside this function the parameter url shadows the net/url package.
	target := url.String()
	log.Println(target)

	resp, err := c.Get(ctx, url)
	if err != nil {
		return fmt.Errorf("fetching %s: %w", target, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		// Not treated as a failure: the status is logged and the page skipped.
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return nil
	}

	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		return nil
	}
	// A malformed Content-Type is handled the same way as a non-HTML one:
	// the page is skipped silently rather than reported as an error.
	mediaType, _, err := mime.ParseMediaType(contentType)
	if err != nil || mediaType != "text/html" {
		return nil
	}

	node, err := html.Parse(resp.Body)
	if err != nil {
		return fmt.Errorf("parsing HTML from %s: %w", target, err)
	}

	// Links are scheduled before extraction, so they are queued even when
	// readability fails on this document.
	c.ScheduleLinks(url, node)

	article, err := readability.FromDocument(node, url)
	if err != nil {
		return fmt.Errorf("extracting article from %s: %w", target, err)
	}

	err = database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		// TODO: add more stuff
		//
		// Placeholders: $1 domain ID, $2 URL, $3 checksum, $4 title,
		// $5 author, $6 excerpt, $7 text content (used only for fts_vector).
		// The checksum is currently always an empty byte slice.
		_, err := tx.ExecContext(ctx, `
			INSERT INTO page (
				domain_id,
				last_index_date,
				weight,
				crawl_priority,
				crawl_delay,
				url,
				checksum,
				title,
				author,
				excerpt,
				fts_vector
			) VALUES (
				$1, now(), 0, 0, '0s'::interval,
				$2, $3, $4, $5, $6,
				setweight(to_tsvector(coalesce($4, '')), 'A') ||
				setweight(to_tsvector(coalesce($5, '')), 'A') ||
				setweight(to_tsvector(coalesce($7, '')), 'B')
			)
			ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
				last_index_date = now(),
				checksum = $3,
				title = $4,
				author = $5,
				excerpt = $6,
				-- TODO: Maybe move this to a sub-query
				fts_vector =
					setweight(to_tsvector(coalesce($4, '')), 'A') ||
					setweight(to_tsvector(coalesce($5, '')), 'A') ||
					setweight(to_tsvector(coalesce($7, '')), 'B');`,
			c.DomainID, target, []byte{},
			article.Title, article.Byline, article.Excerpt, article.TextContent)
		return err
	})
	if err != nil {
		return fmt.Errorf("storing page %s: %w", target, err)
	}
	return nil
}