import/xkcd: new importer
This commit is contained in:
parent
cc0c144528
commit
c364bc316f
1 changed files with 95 additions and 0 deletions
95
import/xkcd/main.py
Normal file
95
import/xkcd/main.py
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
import configparser
import hashlib
import json
import mwparserfromhell as mw
import psycopg2
import re
import requests
import time

# TODO:
# - Update domain last_index_date and crawl_duration
# - Index main page

# Load the searchhut configuration; fail loudly if the file is absent
# instead of dying later with an opaque KeyError.
conf = configparser.ConfigParser()
if not conf.read("config.ini"):  # TODO: Search multiple paths
    raise SystemExit("import/xkcd: config.ini not found")
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)

# Look up the pre-registered domain row for xkcd.com; the importer
# cannot proceed without it, so report clearly when it is missing.
with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("xkcd.com",))
    row = cur.fetchone()
    if row is None:
        raise SystemExit("import/xkcd: no 'xkcd.com' row in the domain table")
    domain_id = row[0]

# info.0.json of the latest comic; its "num" field bounds the crawl loop.
latest = requests.get("https://xkcd.com/info.0.json").json()
# Extracts the [[Target page]] link from an explainxkcd redirect stub.
redir_re = re.compile(r"\[\[(.*)\]\]")
|
||||||
|
|
||||||
|
for n in range(1, latest["num"] + 1):
    # Comic 404 deliberately does not exist (xkcd.com/404 is a real HTTP
    # 404 with no info.0.json), so requesting it would raise on .json().
    if n == 404:
        continue

    info = requests.get(f"https://xkcd.com/{n}/info.0.json").json()
    comic = requests.get(f"https://xkcd.com/{n}")

    # explainxkcd numbers its pages with a redirect stub like
    # "#REDIRECT [[1234: Title]]"; follow it to the real wiki page.
    redir = requests.get(f"https://explainxkcd.com/wiki/index.php?title={n}&action=raw").text
    redir_match = redir_re.search(redir)
    if redir_match is not None:
        explain_page = redir_match.group(1).replace(" ", "_")
        explain = requests.get(f"https://explainxkcd.com/wiki/index.php?title={explain_page}&action=raw").text
    else:
        # No redirect link found; index the comic from its title alone.
        explain = ""
    page = mw.parse(explain)

    # Pull the plain-text transcript section (if present) for indexing.
    transcript = None
    for sec in page.get_sections():
        if sec.startswith("==Transcript=="):
            transcript = sec.strip_code()
            break

    url = f"https://xkcd.com/{n}/"
    print(url)

    title = "xkcd: " + info["title"]
    sha = hashlib.sha512()
    sha.update(comic.text.encode())
    checksum = sha.digest()
    try:
        with conn.cursor() as cur:
            cur.execute("""
                INSERT INTO page (
                    domain_id,
                    last_index_date,
                    source,
                    url,
                    page_size,
                    checksum,
                    title,
                    javascript,
                    fts_vector
                    -- text_content
                    -- hostname
                ) VALUES (
                    %(domain_id)s, now(), 'import/xkcd',
                    %(url)s, %(page_size)s, %(checksum)s,
                    %(title)s, false,
                    setweight(to_tsvector(%(title)s), 'A') ||
                    setweight(to_tsvector('xkcd.com'), 'A') ||
                    setweight(to_tsvector('xkcd'), 'A') ||
                    setweight(to_tsvector(%(content)s), 'D')
                )
                ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
                    last_index_date = now(),
                    page_size = %(page_size)s,
                    checksum = %(checksum)s,
                    title = %(title)s,
                    -- TODO: Maybe move this to a sub-query
                    fts_vector =
                        setweight(to_tsvector(%(title)s), 'A') ||
                        setweight(to_tsvector('xkcd.com'), 'A') ||
                        setweight(to_tsvector('xkcd'), 'A') ||
                        setweight(to_tsvector(%(content)s), 'D');
            """, {
                "domain_id": domain_id,
                "url": url,
                "page_size": len(comic.text),
                "checksum": checksum,
                "title": title,
                # Fall back to the title when explainxkcd has no transcript.
                "content": transcript or title,
            })
        conn.commit()
    except psycopg2.errors.UniqueViolation:
        conn.rollback()

    # Be polite to xkcd.com and explainxkcd.com.
    time.sleep(3)
|
Loading…
Reference in a new issue