"""Import every xkcd comic into the searchhut page index.

For each comic number the script fetches the comic's JSON metadata and HTML
page from xkcd.com, looks up the transcript on explainxkcd.com, and upserts a
row into the ``page`` table whose full-text vector is built from the title
(weight A) and the transcript (weight D).
"""

import configparser
import hashlib
import json
import re
import time

import mwparserfromhell as mw
import psycopg2
import requests

# TODO:
# - Update domain last_index_date and crawl_duration
# - Index main page

# Seconds to wait for a single HTTP response, so that one stalled connection
# cannot hang the whole import.
HTTP_TIMEOUT = 30

# Seconds to pause between comics.
CRAWL_DELAY = 3

EXPLAIN_INDEX_URL = "https://explainxkcd.com/wiki/index.php"

# Captures the target of the first [[wiki link]] on a line.  The match is
# non-greedy so that a line holding several links yields only the first one.
redir_re = re.compile(r"\[\[(.*?)\]\]")

UPSERT_PAGE_SQL = """
    INSERT INTO page (
        domain_id,
        last_index_date,
        source,
        url,
        page_size,
        checksum,
        title,
        javascript,
        fts_vector
        -- text_content
        -- hostname
    ) VALUES (
        %(domain_id)s, now(), 'import/xkcd',
        %(url)s, %(page_size)s, %(checksum)s,
        %(title)s, false,
        setweight(to_tsvector(%(title)s), 'A') ||
        setweight(to_tsvector('xkcd.com'), 'A') ||
        setweight(to_tsvector('xkcd'), 'A') ||
        setweight(to_tsvector(%(content)s), 'D')
    )
    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
        last_index_date = now(),
        page_size = %(page_size)s,
        checksum = %(checksum)s,
        title = %(title)s,
        -- TODO: Maybe move this to a sub-query
        fts_vector =
            setweight(to_tsvector(%(title)s), 'A') ||
            setweight(to_tsvector('xkcd.com'), 'A') ||
            setweight(to_tsvector('xkcd'), 'A') ||
            setweight(to_tsvector(%(content)s), 'D');
"""


def fetch(url, params=None):
    """GET *url* and return the response.

    Raises ``requests.RequestException`` on network failure, timeout, or an
    HTTP error status, so error pages are never mistaken for real content.
    """
    resp = requests.get(url, params=params, timeout=HTTP_TIMEOUT)
    resp.raise_for_status()
    return resp


def fetch_explain_raw(page_title):
    """Return the raw wikitext of the explainxkcd page named *page_title*.

    The title is passed through ``params`` so that requests URL-encodes it;
    titles containing ``&``, ``?``, ``#`` or ``+`` would otherwise corrupt
    the query string.
    """
    query = {"title": page_title, "action": "raw"}
    return fetch(EXPLAIN_INDEX_URL, params=query).text


def extract_transcript(wikitext):
    """Return the plain text of the ``==Transcript==`` section, or None."""
    for sec in mw.parse(wikitext).get_sections():
        if sec.startswith("==Transcript=="):
            return sec.strip_code()
    return None


def fetch_transcript(n):
    """Return the transcript of comic *n* from explainxkcd, or None.

    The wiki page titled with the bare comic number is normally a redirect to
    the full article, so the redirect target is followed when there is one.
    The transcript is optional content: any failure to retrieve it is reported
    and treated as "no transcript" instead of aborting the import.
    """
    try:
        wikitext = fetch_explain_raw(str(n))
        # Only follow a link when the page really is a redirect; an ordinary
        # page may contain unrelated [[links]] that must not be followed.
        if wikitext.lstrip().upper().startswith("#REDIRECT"):
            match = redir_re.search(wikitext)
            if match is not None:
                target = match.group(1).replace(" ", "_")
                wikitext = fetch_explain_raw(target)
    except requests.RequestException as err:
        print(f"warning: no explanation for comic {n}: {err}")
        return None
    return extract_transcript(wikitext)


def index_comic(conn, domain_id, n):
    """Fetch comic number *n* and upsert it into the ``page`` table.

    Comics that xkcd.com reports as missing (HTTP 404) are skipped.
    """
    info_resp = requests.get(
        f"https://xkcd.com/{n}/info.0.json", timeout=HTTP_TIMEOUT
    )
    if info_resp.status_code == 404:
        # xkcd has no comic #404 (that number serves a real 404 page), and
        # decoding the error body as JSON would abort the whole import.
        print(f"skipping comic {n}: not found")
        return
    info_resp.raise_for_status()
    info = info_resp.json()

    comic = fetch(f"https://xkcd.com/{n}")
    transcript = fetch_transcript(n)

    url = f"https://xkcd.com/{n}/"
    print(url)

    title = "xkcd: " + info["title"]
    checksum = hashlib.sha512(comic.text.encode()).digest()
    try:
        with conn.cursor() as cur:
            cur.execute(UPSERT_PAGE_SQL, {
                "domain_id": domain_id,
                "url": url,
                # Length of the decoded page text (characters, not bytes).
                "page_size": len(comic.text),
                "checksum": checksum,
                "title": title,
                # Fall back to the title when no transcript is available.
                "content": transcript or title,
            })
        conn.commit()
    except psycopg2.errors.UniqueViolation:
        # Leave the connection usable for the next comic.
        conn.rollback()


def main():
    """Read the configuration, then index every comic from 1 to the latest."""
    conf = configparser.ConfigParser()
    conf.read("config.ini")  # TODO: Search multiple paths
    cs = conf["searchhut"]["connection-string"]
    conn = psycopg2.connect(cs)
    try:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT id FROM domain WHERE hostname = %s", ("xkcd.com",)
            )
            row = cur.fetchone()
        if row is None:
            raise SystemExit("domain 'xkcd.com' is not in the domain table")
        domain_id = row[0]

        latest = fetch("https://xkcd.com/info.0.json").json()

        for n in range(1, latest["num"] + 1):
            index_comic(conn, domain_id, n)
            time.sleep(CRAWL_DELAY)
    finally:
        conn.close()


if __name__ == "__main__":
    main()