"""Import every xkcd comic into the search index's ``page`` table.

For each comic number the script downloads the comic's JSON metadata and
HTML page from xkcd.com, looks up the matching transcript on explainxkcd.com,
and upserts one row into ``page`` whose full-text vector is built from the
title and the transcript (falling back to the title when no transcript is
available).
"""

import configparser
import hashlib
import json
import mwparserfromhell as mw
import psycopg2
import re
import requests
import time

# TODO:
# - Update domain last_index_date and crawl_duration
# - Index main page

# Seconds to wait for any single HTTP response before giving up; without a
# timeout a stalled connection would hang the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Seconds to pause after each comic so the crawl stays polite.
CRAWL_DELAY = 3

EXPLAIN_URL = "https://explainxkcd.com/wiki/index.php"

UPSERT_PAGE_SQL = """
    INSERT INTO page (
        domain_id,
        last_index_date,
        source,
        url,
        page_size,
        checksum,
        title,
        javascript,
        fts_vector
        -- text_content
        -- hostname
    ) VALUES (
        %(domain_id)s,
        now(),
        'import/xkcd',
        %(url)s,
        %(page_size)s,
        %(checksum)s,
        %(title)s,
        false,
        setweight(to_tsvector(%(title)s), 'A') ||
        setweight(to_tsvector('xkcd.com'), 'A') ||
        setweight(to_tsvector('xkcd'), 'A') ||
        setweight(to_tsvector(%(content)s), 'D')
    )
    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
        last_index_date = now(),
        page_size = %(page_size)s,
        checksum = %(checksum)s,
        title = %(title)s,
        -- TODO: Maybe move this to a sub-query
        fts_vector =
            setweight(to_tsvector(%(title)s), 'A') ||
            setweight(to_tsvector('xkcd.com'), 'A') ||
            setweight(to_tsvector('xkcd'), 'A') ||
            setweight(to_tsvector(%(content)s), 'D');
"""

# Captures the link target of a wiki link such as "#REDIRECT [[Some Page]]".
redir_re = re.compile(r"\[\[(.*)\]\]")

conf = configparser.ConfigParser()
conf.read("config.ini")  # TODO: Search multiple paths
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)

with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("xkcd.com",))
    domain_row = cur.fetchone()
if domain_row is None:
    # Fail with a readable message instead of "NoneType is not subscriptable".
    raise SystemExit("no row for hostname 'xkcd.com' in the domain table")
domain_id = domain_row[0]


def fetch_explain_raw(title):
    """Return the raw wikitext of explainxkcd page *title*.

    Returns an empty string when the wiki answers 404 (no such page); any
    other HTTP error raises ``requests.HTTPError``.
    """
    # Passing the title through ``params`` lets requests percent-encode it,
    # so titles containing "&", "#", "?" or "+" do not corrupt the query.
    resp = requests.get(
        EXPLAIN_URL,
        params={"title": title, "action": "raw"},
        timeout=REQUEST_TIMEOUT,
    )
    if resp.status_code == 404:
        return ""
    resp.raise_for_status()
    return resp.text


def fetch_transcript(n):
    """Return the plain-text transcript of comic *n*, or None if unavailable.

    The explainxkcd page named after the comic number is expected to be a
    redirect to the real article; the article's "Transcript" section is
    returned with wiki markup stripped.
    """
    match = redir_re.search(fetch_explain_raw(str(n)))
    if match is None:
        # Not a redirect (or the page does not exist): nothing to follow.
        return None
    explain_page = match.group(1).replace(" ", "_")
    page = mw.parse(fetch_explain_raw(explain_page))
    for sec in page.get_sections():
        if sec.startswith("==Transcript=="):
            return sec.strip_code()
    return None


def index_comic(n):
    """Fetch comic *n* and upsert it into the ``page`` table.

    A comic whose metadata endpoint answers 404 is skipped. Other HTTP
    failures raise ``requests.RequestException`` and leave the database
    untouched for this comic.
    """
    url = f"https://xkcd.com/{n}/"

    info_resp = requests.get(
        f"https://xkcd.com/{n}/info.0.json", timeout=REQUEST_TIMEOUT)
    if info_resp.status_code == 404:
        # xkcd deliberately has no comic 404, so its metadata endpoint
        # answers with an HTTP 404 page that is not JSON.
        print(f"{url} not found, skipping")
        return
    info_resp.raise_for_status()
    info = info_resp.json()

    comic = requests.get(f"https://xkcd.com/{n}", timeout=REQUEST_TIMEOUT)
    # Never hash and index an error page as if it were the comic.
    comic.raise_for_status()

    transcript = fetch_transcript(n)

    print(url)
    title = "xkcd: " + info["title"]
    checksum = hashlib.sha512(comic.text.encode()).digest()

    try:
        with conn.cursor() as cur:
            cur.execute(UPSERT_PAGE_SQL, {
                "domain_id": domain_id,
                "url": url,
                "page_size": len(comic.text),
                "checksum": checksum,
                "title": title,
                # Index the title as body text when there is no transcript.
                "content": transcript or title,
            })
        conn.commit()
    except psycopg2.errors.UniqueViolation:
        # Discard the failed statement so the connection stays usable.
        conn.rollback()


latest_resp = requests.get(
    "https://xkcd.com/info.0.json", timeout=REQUEST_TIMEOUT)
latest_resp.raise_for_status()
latest = latest_resp.json()

for n in range(1, latest["num"] + 1):
    try:
        index_comic(n)
    except requests.RequestException as err:
        # One unreachable comic should not abort a crawl of thousands.
        print(f"https://xkcd.com/{n}/ failed: {err}")
    time.sleep(CRAWL_DELAY)