2022-07-08 19:46:11 +02:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"database/sql"
|
|
|
|
"log"
|
2022-07-11 20:55:39 +02:00
|
|
|
"net/url"
|
2022-07-08 19:46:11 +02:00
|
|
|
"os"
|
|
|
|
|
2022-07-11 20:55:39 +02:00
|
|
|
"git.sr.ht/~sircmpwn/getopt"
|
2022-07-08 19:46:11 +02:00
|
|
|
_ "github.com/lib/pq"
|
|
|
|
|
2022-07-09 15:31:16 +02:00
|
|
|
"git.sr.ht/~sircmpwn/searchhut/config"
|
2022-07-08 19:46:11 +02:00
|
|
|
"git.sr.ht/~sircmpwn/searchhut/crawler"
|
|
|
|
)
|
|
|
|
|
|
|
|
func main() {
|
2022-07-11 20:55:39 +02:00
|
|
|
var urls []*url.URL
|
|
|
|
opts, optind, err := getopt.Getopts(os.Args, "u:")
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
for _, opt := range opts {
|
|
|
|
switch opt.Option {
|
|
|
|
case 'u':
|
|
|
|
url, err := url.Parse(opt.Value)
|
|
|
|
if err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
|
|
|
urls = append(urls, url)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
args := os.Args[optind:]
|
|
|
|
domain := args[0]
|
2022-07-09 15:31:16 +02:00
|
|
|
|
|
|
|
conf := config.Load()
|
|
|
|
connstr, ok := conf.Get("searchhut", "connection-string")
|
|
|
|
if !ok {
|
|
|
|
log.Fatal("Configuration missing connection string")
|
|
|
|
}
|
|
|
|
|
2022-07-08 19:46:11 +02:00
|
|
|
db, err := sql.Open("postgres", connstr)
|
|
|
|
if err != nil {
|
|
|
|
log.Fatal(err)
|
|
|
|
}
|
2022-07-09 15:31:16 +02:00
|
|
|
|
2022-07-09 18:14:06 +02:00
|
|
|
ua, ok := conf.Get("searchhut", "user-agent")
|
|
|
|
if !ok {
|
|
|
|
log.Fatal("Configuration missing user agent")
|
|
|
|
}
|
|
|
|
|
|
|
|
crawler := crawler.NewCrawler(ua, db, domain)
|
2022-07-11 20:55:39 +02:00
|
|
|
|
|
|
|
for _, url := range urls {
|
|
|
|
log.Printf("Manually scheduling %s", url.String())
|
|
|
|
crawler.Schedule(url)
|
|
|
|
}
|
|
|
|
|
2022-07-08 19:46:11 +02:00
|
|
|
crawler.Crawl()
|
|
|
|
}
|