crawler: trim excerpt

Fixes: https://todo.sr.ht/~sircmpwn/searchhut/38
This commit is contained in:
Drew DeVault 2022-07-13 10:26:22 +02:00
parent 9473f3b49b
commit 731950a326
3 changed files with 9 additions and 1 deletion

View file

@@ -14,6 +14,7 @@ import (
"time"
"github.com/go-shiori/go-readability"
"github.com/mattn/go-runewidth"
"golang.org/x/net/html"
"git.sr.ht/~sircmpwn/searchhut/database"
@@ -115,6 +116,7 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
weight += 1
}
excerpt := runewidth.Truncate(article.Excerpt, 512, "…")
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
_, err := tx.ExecContext(ctx, `
INSERT INTO page (
@@ -161,7 +163,7 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
setweight(to_tsvector(coalesce($11, '')), $13) ||
setweight(to_tsvector(coalesce($12, '')), 'D');`,
c.DomainID, url.String(), counter.Length, hash.Sum([]byte{}),
-meta.Title, meta.Author, meta.Description, article.Excerpt,
+meta.Title, meta.Author, meta.Description, excerpt,
meta.JavaScript, url.Host, c.labels, article.TextContent,
weights[weight])
return err

2
go.mod
View file

@@ -10,6 +10,7 @@ require (
github.com/go-chi/chi v1.5.4
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b
github.com/lib/pq v1.10.6
github.com/mattn/go-runewidth v0.0.13
github.com/temoto/robotstxt v1.1.2
github.com/vaughan0/go-ini v0.0.0-20130923145212-a98ad7ee00ec
github.com/vektah/gqlparser/v2 v2.4.6
@@ -27,6 +28,7 @@ require (
github.com/hashicorp/golang-lru v0.5.4 // indirect
github.com/matryer/moq v0.2.7 // indirect
github.com/mitchellh/mapstructure v1.3.1 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sirupsen/logrus v1.8.1 // indirect
github.com/urfave/cli/v2 v2.8.1 // indirect

4
go.sum
View file

@@ -98,6 +98,8 @@ github.com/matryer/moq v0.2.7 h1:RtpiPUM8L7ZSCbSwK+QcZH/E9tgqAkFjKQxsRs25b4w=
github.com/matryer/moq v0.2.7/go.mod h1:kITsx543GOENm48TUAQyJ9+SAvFSr7iGQXPoth/VUBk=
github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU=
github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
@@ -118,6 +120,8 @@ github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=