searchhut/import/xkcd/main.py
2022-07-13 14:54:40 +02:00

95 lines
3.1 KiB
Python

import configparser
import hashlib
import json
import mwparserfromhell as mw
import psycopg2
import re
import requests
import time
# TODO:
# - Update domain last_index_date and crawl_duration
# - Index main page
conf = configparser.ConfigParser()
conf.read("config.ini") # TODO: Search multiple paths
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)
with conn.cursor() as cur:
cur.execute("SELECT id FROM domain WHERE hostname = %s", ("xkcd.com",))
domain_id = cur.fetchone()[0]
latest = requests.get("https://xkcd.com/info.0.json").json()
redir_re = re.compile(r"\[\[(.*)\]\]")
for n in range(1, latest["num"] + 1):
info = requests.get(f"https://xkcd.com/{n}/info.0.json").json()
comic = requests.get(f"https://xkcd.com/{n}")
redir = requests.get(f"https://explainxkcd.com/wiki/index.php?title={n}&action=raw").text
explain_page = redir_re.search(redir).groups(1)[0]
explain_page = explain_page.replace(" ", "_")
explain = requests.get(f"https://explainxkcd.com/wiki/index.php?title={explain_page}&action=raw").text
page = mw.parse(explain)
transcript = None
for sec in page.get_sections():
if sec.startswith("==Transcript=="):
transcript = sec.strip_code()
break
url = f"https://xkcd.com/{n}/"
print(url)
page_size = -1
title = "xkcd: " + info["title"]
sha = hashlib.sha512()
sha.update(comic.text.encode())
checksum = sha.digest()
try:
with conn.cursor() as cur:
cur.execute("""
INSERT INTO page (
domain_id,
last_index_date,
source,
url,
page_size,
checksum,
title,
javascript,
fts_vector
-- text_content
-- hostname
) VALUES (
%(domain_id)s, now(), 'import/xkcd',
%(url)s, %(page_size)s, %(checksum)s,
%(title)s, false,
setweight(to_tsvector(%(title)s), 'A') ||
setweight(to_tsvector('xkcd.com'), 'A') ||
setweight(to_tsvector('xkcd'), 'A') ||
setweight(to_tsvector(%(content)s), 'D')
)
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
last_index_date = now(),
page_size = %(page_size)s,
checksum = %(checksum)s,
title = %(title)s,
-- TODO: Maybe move this to a sub-query
fts_vector =
setweight(to_tsvector(%(title)s), 'A') ||
setweight(to_tsvector('xkcd.com'), 'A') ||
setweight(to_tsvector('xkcd'), 'A') ||
setweight(to_tsvector(%(content)s), 'D');
""", {
"domain_id": domain_id,
"url": url,
"page_size": len(comic.text),
"checksum": checksum,
"title": title,
"content": transcript or title,
})
conn.commit()
except psycopg2.errors.UniqueViolation:
conn.rollback()
time.sleep(3)