From 21c6e11228a271809647f9b0606b5f3c676ecf8e Mon Sep 17 00:00:00 2001
From: xenofem
Date: Wed, 10 Apr 2024 16:59:32 -0400
Subject: [PATCH] optimizations and extra debug info for generate mode

---
 dlibrary/dlibrary.py | 71 ++++++++++++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 19 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index df518ff..38b0cfb 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -14,6 +14,7 @@ import shutil
 import sqlite3
 import stat
 import textwrap
+import time
 import unicodedata
 from urllib.parse import urlparse
 import zipfile
@@ -105,7 +106,7 @@ READONLY_DIR = READONLY_FILE | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
 debug_mode = False
 def debug(s):
     if debug_mode:
-        print(s)
+        print(f'{time.strftime("%Y-%m-%d %H:%M:%S")} - {s}')
 
 def open_zipfile_with_encoding(path):
     for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
@@ -1046,8 +1047,7 @@ def copy_recursive(src, dest):
 
 memoized_similarities = {}
 
-
-def similarity(a, b):
+def similarity(a, b, cache_cur=None):
     if len(a) < len(b) or (len(a) == len(b) and a < b):
         shorter = a
         longer = b
@@ -1060,13 +1060,22 @@ def similarity(a, b):
     if (shorter, longer) in memoized_similarities:
         return memoized_similarities[(shorter, longer)]
 
-    options = [similarity(shorter[1:], longer)]
-    for i in range(1, len(shorter)+1):
-        match_idx = longer.find(shorter[:i])
-        if match_idx == -1:
-            break
-        options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
-    result = max(options)
+    if cache_cur and (cached := cache_cur.execute("SELECT similarity FROM similarities WHERE shorter = ? AND longer = ?", (shorter, longer)).fetchone()) is not None:
+        result = cached[0]
+    else:
+        options = [similarity(shorter[1:], longer)]
+        for i in range(1, len(shorter)+1):
+            match_idx = longer.find(shorter[:i])
+            if match_idx == -1:
+                break
+            options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
+        result = max(options)
+
+        if cache_cur:
+            cache_cur.execute(
+                "INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
+                (shorter, longer, result),
+            )
 
     memoized_similarities[(shorter, longer)] = result
     return result
@@ -1090,6 +1099,7 @@ def top(items, n, key, overflow=0):
     return [item for (item, score) in winners[:n+overflow]]
 
 def generate(args):
+    debug('loading templates')
     jenv = Environment(
         loader=PackageLoader("dlibrary"),
         autoescape=select_autoescape()
@@ -1099,16 +1109,29 @@ def generate(args):
     viewer_template = jenv.get_template("viewer.html")
     categorization_template = jenv.get_template("categorization.html")
     work_template = jenv.get_template("work.html")
     index_template = jenv.get_template("index.html")
+    debug('templates loaded')
 
+    debug('opening main database')
     con = sqlite3.connect(args.destdir / 'meta.db')
     cur = con.cursor()
+    debug('main database open')
+
+    debug('opening cache database')
+    cache_con = sqlite3.connect(args.destdir / 'cache.db')
+    cache_cur = cache_con.cursor()
+    cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")
+    debug('cache database open')
 
     site_dir = args.destdir / 'site'
     collated_work_ids = {p.name for p in (site_dir / 'images').iterdir()}
 
     works = []
-    for (work_id, title, circle, date, description, series) in cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall():
+    debug('checking thumbnail files')
+    thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()}
+    debug(f'{len(thumbnail_files)} thumbnail files found')
+    debug('running database query for works')
+    for (idx, (work_id, title, circle, date, description, series)) in enumerate(cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall()):
         if work_id not in collated_work_ids:
             continue
         authors = [author for (author,) in cur.execute('SELECT author FROM authors WHERE work = ?', (work_id,))]
@@ -1117,12 +1140,7 @@ def generate(args):
         images = [path.name for path in (site_dir / 'images' / work_id).iterdir()]
         images.sort()
 
-        try:
-            thumbnail_path = relpath(next(
-                f for f in (site_dir / 'thumbnails').iterdir() if f.stem == work_id
-            ), site_dir)
-        except StopIteration:
-            thumbnail_path = f'images/{work_id}/{images[0]}'
+        thumbnail_path = relpath(thumbnail_files.get(work_id, site_dir / 'images' / work_id / images[0]), site_dir)
         work = {
             'id': work_id,
             'title': title,
@@ -1137,13 +1155,16 @@ def generate(args):
         }
         works.append(work)
+        print(f'\x1b[2K\r{idx+1} database entries read...', end='')
+    print()
+
 
     for (idx, work) in enumerate(works):
         def suggestion_priority(other_work):
             if other_work is work:
                 return -2
             if work['series'] and work['series'] == other_work['series']:
                 return -1
-            return similarity(work['title'], other_work['title'])
+            return similarity(work['title'], other_work['title'], cache_cur)
 
         suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)
         work_dir = site_dir / 'works' / work['id']
@@ -1155,6 +1176,7 @@ def generate(args):
             f.write(viewer_template.render(depth=3, work=work, title=work['title']))
         print(f'\x1b[2K\r{idx+1}/{len(works)} works processed...', end=('' if idx+1 < len(works) else '\n'))
 
+    cache_con.commit()
     uca = pyuca.Collator().sort_key
 
     def make_categorization(categorization, query, work_filter, work_style_cards=False):
@@ -1162,7 +1184,7 @@ def generate(args):
         cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
 
         cat_samples = {}
-        for cat in cats:
+        for (idx, cat) in enumerate(cats):
             cat_works = list(filter(work_filter(cat), works))
             cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None
 
@@ -1176,6 +1198,7 @@ def generate(args):
                     title=cat,
                     categorization=categorization,
                 ))
+            print(f'\x1b[2K\r{idx+1}/{len(cats)} {categorization} processed...', end=('' if idx+1 < len(cats) else '\n'))
 
         categorization_dir.mkdir(parents=True, exist_ok=True)
         with open(categorization_dir / 'index.html', 'w') as f:
@@ -1209,13 +1232,23 @@ def generate(args):
         work_style_cards=True,
     )
 
+    debug('copying static files')
     with resources.as_file(resources.files("dlibrary")) as r:
         copy_recursive(r / 'static', site_dir / 'static')
+    debug('static files copied')
 
+    debug('writing index page')
     with open(site_dir / 'index.html', 'w') as f:
         f.write(index_template.render(depth=0, works=works))
+    debug('index page written')
 
+    debug('closing cache database')
+    cache_con.close()
+    debug('cache database closed')
+
+    debug('closing main database')
     con.close()
+    debug('main database closed')
 
 
 argparser = argparse.ArgumentParser(
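
Annotation (not part of the patch): the core optimization is a two-tier cache
for similarity(). Results are still memoized in the in-process
memoized_similarities dict, but scores are now also persisted to a SQLite
similarities table, so they survive across runs. The recursive calls inside
similarity() do not pass cache_cur, so only the pairs queried from
suggestion_priority reach the database, and the single cache_con.commit()
after the suggestion loop batches every insert into one transaction. A minimal
standalone sketch of the same pattern (the names memo, score, expensive_score,
and scores.db are illustrative, not from the patch):

    import sqlite3

    memo = {}

    def expensive_score(a, b):
        # Stand-in for the real recursive similarity computation.
        return sum(1 for (x, y) in zip(a, b) if x == y)

    def score(a, b, cache_cur=None):
        if (a, b) in memo:            # tier 1: in-process memo
            return memo[(a, b)]
        cached = None
        if cache_cur:                 # tier 2: persistent cache
            cached = cache_cur.execute(
                "SELECT score FROM scores WHERE a = ? AND b = ?", (a, b)
            ).fetchone()
        if cached is not None:
            result = cached[0]
        else:
            result = expensive_score(a, b)
            if cache_cur:
                # Plain INSERT is safe here, as in the patch: the memo check
                # above prevents re-inserting a pair within one run, and the
                # SELECT branch handles pairs already stored by earlier runs.
                cache_cur.execute(
                    "INSERT INTO scores(a, b, score) VALUES(?, ?, ?)",
                    (a, b, result),
                )
        memo[(a, b)] = result
        return result

    con = sqlite3.connect("scores.db")
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS scores(a TEXT, b TEXT, score INT, PRIMARY KEY(a, b))")
    print(score("abcd", "abed", cur))  # first run computes and inserts
    con.commit()                       # cf. cache_con.commit() in the patch
    con.close()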
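
Annotation (not part of the patch): the thumbnail change swaps a per-work
directory scan (the old next(f for f in ... if f.stem == work_id) lookup) for
one upfront stem-to-path index, so n works cost one directory listing instead
of n. The same pattern in isolation (build_stem_index is an illustrative name,
not from the patch):

    from pathlib import Path

    def build_stem_index(directory: Path) -> dict:
        # One iterdir() pass; each later lookup is an O(1) dict hit
        # instead of an O(n) scan of the directory per item.
        return {f.stem: f for f in directory.iterdir()}

    # Usage, mirroring thumbnail_files in the patch:
    #   thumbnails = build_stem_index(site_dir / 'thumbnails')
    #   path = thumbnails.get(work_id, fallback_path)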