optimizations and extra debug info for generate mode

2024-04-10 16:59:32 -04:00 · 2024-04-10 16:59:32 -04:00 · 21c6e11228
commit 21c6e11228
parent a822a504e8
1 changed file with 52 additions and 19 deletions
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -14,6 +14,7 @@ import shutil
 import sqlite3
 import stat
 import textwrap
+import time
 import unicodedata
 from urllib.parse import urlparse
 import zipfile
@@ -105,7 +106,7 @@ READONLY_DIR = READONLY_FILE | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
 debug_mode = False
 def debug(s):
    if debug_mode:
-        print(s)
+        print(f'{time.strftime("%Y-%m-%d %H:%M:%S")} - {s}')

 def open_zipfile_with_encoding(path):
    for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
@@ -1046,8 +1047,7 @@ def copy_recursive(src, dest):


 memoized_similarities = {}
-
-def similarity(a, b):
+def similarity(a, b, cache_cur=None):
    if len(a) < len(b) or (len(a) == len(b) and a < b):
        shorter = a
        longer = b
@@ -1060,13 +1060,22 @@ def similarity(a, b):
    if (shorter, longer) in memoized_similarities:
        return memoized_similarities[(shorter, longer)]

-    options = [similarity(shorter[1:], longer)]
-    for i in range(1, len(shorter)+1):
-        match_idx = longer.find(shorter[:i])
-        if match_idx == -1:
-            break
-        options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
-    result = max(options)
+    if cache_cur and (cached := cache_cur.execute("SELECT similarity FROM similarities WHERE shorter = ? AND longer = ?", (shorter, longer)).fetchone()) is not None:
+        result = cached[0]
+    else:
+        options = [similarity(shorter[1:], longer)]
+        for i in range(1, len(shorter)+1):
+            match_idx = longer.find(shorter[:i])
+            if match_idx == -1:
+                break
+            options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
+        result = max(options)
+
+        if cache_cur:
+            cache_cur.execute(
+                "INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
+                (shorter, longer, result),
+            )

    memoized_similarities[(shorter, longer)] = result
    return result
@@ -1090,6 +1099,7 @@ def top(items, n, key, overflow=0):
    return [item for (item, score) in winners[:n+overflow]]

 def generate(args):
+    debug('loading templates')
    jenv = Environment(
        loader=PackageLoader("dlibrary"),
        autoescape=select_autoescape()
@@ -1099,16 +1109,29 @@ def generate(args):
    categorization_template = jenv.get_template("categorization.html")
    work_template = jenv.get_template("work.html")
    index_template = jenv.get_template("index.html")
+    debug('templates loaded')

+    debug('opening main database')
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()
+    debug('main database open')
+
+    debug('opening cache database')
+    cache_con = sqlite3.connect(args.destdir / 'cache.db')
+    cache_cur = cache_con.cursor()
+    cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")
+    debug('cache database open')

    site_dir = args.destdir / 'site'

    collated_work_ids = {p.name for p in (site_dir / 'images').iterdir()}

    works = []
-    for (work_id, title, circle, date, description, series) in cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall():
+    debug('checking thumbnail files')
+    thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()}
+    debug(f'{len(thumbnail_files)} thumbnail files found')
+    debug('running database query for works')
+    for (idx, (work_id, title, circle, date, description, series)) in enumerate(cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall()):
        if work_id not in collated_work_ids:
            continue
        authors = [author for (author,) in cur.execute('SELECT author FROM authors WHERE work = ?', (work_id,))]
@@ -1117,12 +1140,7 @@ def generate(args):
        images = [path.name for path in (site_dir / 'images' / work_id).iterdir()]
        images.sort()

-        try:
-            thumbnail_path = relpath(next(
-                f for f in (site_dir / 'thumbnails').iterdir() if f.stem == work_id
-            ), site_dir)
-        except StopIteration:
-            thumbnail_path = f'images/{work_id}/{images[0]}'
+        thumbnail_path = relpath(thumbnail_files.get(work_id, site_dir / 'images' / work_id / images[0]), site_dir)
        work = {
            'id': work_id,
            'title': title,
@@ -1137,13 +1155,16 @@ def generate(args):
        }
        works.append(work)

+        print(f'\x1b[2K\r{idx+1} database entries read...', end='')
+    print()
+
    for (idx, work) in enumerate(works):
        def suggestion_priority(other_work):
            if other_work is work:
                return -2
            if work['series'] and work['series'] == other_work['series']:
                return -1
-            return similarity(work['title'], other_work['title'])
+            return similarity(work['title'], other_work['title'], cache_cur)
        suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)

        work_dir = site_dir / 'works' / work['id']
@@ -1155,6 +1176,7 @@ def generate(args):
            f.write(viewer_template.render(depth=3, work=work, title=work['title']))

        print(f'\x1b[2K\r{idx+1}/{len(works)} works processed...', end=('' if idx+1 < len(works) else '\n'))
+    cache_con.commit()

    uca = pyuca.Collator().sort_key
    def make_categorization(categorization, query, work_filter, work_style_cards=False):
@@ -1162,7 +1184,7 @@ def generate(args):

        cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
        cat_samples = {}
-        for cat in cats:
+        for (idx, cat) in enumerate(cats):
            cat_works = list(filter(work_filter(cat), works))
            cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None

@@ -1176,6 +1198,7 @@ def generate(args):
                    title=cat,
                    categorization=categorization,
                ))
+            print(f'\x1b[2K\r{idx+1}/{len(cats)} {categorization} processed...', end=('' if idx+1 < len(cats) else '\n'))

        categorization_dir.mkdir(parents=True, exist_ok=True)
        with open(categorization_dir / 'index.html', 'w') as f:
@@ -1209,13 +1232,23 @@ def generate(args):
        work_style_cards=True,
    )

+    debug('copying static files')
    with resources.as_file(resources.files("dlibrary")) as r:
        copy_recursive(r / 'static', site_dir / 'static')
+    debug('static files copied')

+    debug('writing index page')
    with open(site_dir / 'index.html', 'w') as f:
        f.write(index_template.render(depth=0, works=works))
+    debug('index page written')

+    debug('closing cache database')
+    cache_con.close()
+    debug('cache database closed')
+
+    debug('closing main database')
    con.close()
+    debug('main database closed')


 argparser = argparse.ArgumentParser(