optimizations and extra debug info for generate mode

xenofem 2024-04-10 16:59:32 -04:00
parent a822a504e8
commit 21c6e11228


@@ -14,6 +14,7 @@ import shutil
 import sqlite3
 import stat
 import textwrap
+import time
 import unicodedata
 from urllib.parse import urlparse
 import zipfile
@@ -105,7 +106,7 @@ READONLY_DIR = READONLY_FILE | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
 debug_mode = False
 def debug(s):
     if debug_mode:
-        print(s)
+        print(f'{time.strftime("%Y-%m-%d %H:%M:%S")} - {s}')
 
 def open_zipfile_with_encoding(path):
     for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
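The debug() change prepends a local timestamp to every message, so the per-stage debug lines added further down double as a coarse profile of generate mode. A minimal sketch of the new behavior (debug_mode forced on here; the sample output is illustrative):

    import time

    debug_mode = True

    def debug(s):
        if debug_mode:
            # strftime with no time argument formats the current local time
            print(f'{time.strftime("%Y-%m-%d %H:%M:%S")} - {s}')

    debug('loading templates')  # e.g. "2024-04-10 16:59:32 - loading templates"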
@@ -1046,8 +1047,7 @@ def copy_recursive(src, dest):
 memoized_similarities = {}
 
-def similarity(a, b):
+def similarity(a, b, cache_cur=None):
     if len(a) < len(b) or (len(a) == len(b) and a < b):
         shorter = a
         longer = b
@@ -1060,13 +1060,22 @@ def similarity(a, b):
     if (shorter, longer) in memoized_similarities:
         return memoized_similarities[(shorter, longer)]
 
-    options = [similarity(shorter[1:], longer)]
-    for i in range(1, len(shorter)+1):
-        match_idx = longer.find(shorter[:i])
-        if match_idx == -1:
-            break
-        options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
-    result = max(options)
+    if cache_cur and (cached := cache_cur.execute("SELECT similarity FROM similarities WHERE shorter = ? AND longer = ?", (shorter, longer)).fetchone()) is not None:
+        result = cached[0]
+    else:
+        options = [similarity(shorter[1:], longer)]
+        for i in range(1, len(shorter)+1):
+            match_idx = longer.find(shorter[:i])
+            if match_idx == -1:
+                break
+            options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
+        result = max(options)
+        if cache_cur:
+            cache_cur.execute(
+                "INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
+                (shorter, longer, result),
+            )
 
     memoized_similarities[(shorter, longer)] = result
     return result
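Taken together, the cache_cur plumbing gives similarity() a two-level memo: the in-process dict is consulted first, then the similarities table, and freshly computed scores are written back for later runs. Note that the recursive calls do not pass cache_cur, so only the pair asked about at the top level touches SQLite while the recursion relies on the dict. A self-contained sketch of the same pattern against an in-memory database (compute() is a hypothetical stand-in for the real scoring recursion):

    import sqlite3

    def compute(shorter, longer):
        # hypothetical stand-in for the real similarity computation
        return sum(1 for x, y in zip(shorter, longer) if x == y)

    con = sqlite3.connect(":memory:")  # the real code opens destdir / 'cache.db'
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")

    memo = {}

    def cached_similarity(shorter, longer, cache_cur=None):
        if (shorter, longer) in memo:                        # level 1: in-process dict
            return memo[(shorter, longer)]
        if cache_cur and (row := cache_cur.execute(
                "SELECT similarity FROM similarities WHERE shorter = ? AND longer = ?",
                (shorter, longer)).fetchone()) is not None:  # level 2: SQLite
            result = row[0]
        else:
            result = compute(shorter, longer)
            if cache_cur:                                    # write back for later runs
                cache_cur.execute(
                    "INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
                    (shorter, longer, result),
                )
        memo[(shorter, longer)] = result
        return result

    print(cached_similarity("abc", "abcd", cur))  # computed, then cached
    print(cached_similarity("abc", "abcd", cur))  # served from the dict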
@ -1090,6 +1099,7 @@ def top(items, n, key, overflow=0):
return [item for (item, score) in winners[:n+overflow]] return [item for (item, score) in winners[:n+overflow]]
def generate(args): def generate(args):
debug('loading templates')
jenv = Environment( jenv = Environment(
loader=PackageLoader("dlibrary"), loader=PackageLoader("dlibrary"),
autoescape=select_autoescape() autoescape=select_autoescape()
@@ -1099,16 +1109,29 @@ def generate(args):
     categorization_template = jenv.get_template("categorization.html")
     work_template = jenv.get_template("work.html")
     index_template = jenv.get_template("index.html")
+    debug('templates loaded')
 
+    debug('opening main database')
     con = sqlite3.connect(args.destdir / 'meta.db')
     cur = con.cursor()
+    debug('main database open')
+
+    debug('opening cache database')
+    cache_con = sqlite3.connect(args.destdir / 'cache.db')
+    cache_cur = cache_con.cursor()
+    cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")
+    debug('cache database open')
 
     site_dir = args.destdir / 'site'
 
     collated_work_ids = {p.name for p in (site_dir / 'images').iterdir()}
 
     works = []
-    for (work_id, title, circle, date, description, series) in cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall():
+    debug('checking thumbnail files')
+    thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()}
+    debug(f'{len(thumbnail_files)} thumbnail files found')
+    debug('running database query for works')
+    for (idx, (work_id, title, circle, date, description, series)) in enumerate(cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall()):
         if work_id not in collated_work_ids:
             continue
         authors = [author for (author,) in cur.execute('SELECT author FROM authors WHERE work = ?', (work_id,))]
@@ -1117,12 +1140,7 @@ def generate(args):
         images = [path.name for path in (site_dir / 'images' / work_id).iterdir()]
         images.sort()
 
-        try:
-            thumbnail_path = relpath(next(
-                f for f in (site_dir / 'thumbnails').iterdir() if f.stem == work_id
-            ), site_dir)
-        except StopIteration:
-            thumbnail_path = f'images/{work_id}/{images[0]}'
+        thumbnail_path = relpath(thumbnail_files.get(work_id, site_dir / 'images' / work_id / images[0]), site_dir)
         work = {
             'id': work_id,
             'title': title,
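The thumbnail rewrite replaces a scan of site/thumbnails per work (the next(...) generator plus StopIteration fallback) with the single up-front scan into thumbnail_files above, so each work becomes an O(1) dict.get with the work's first image as the default. It also routes both branches through relpath, which yields the same relative string the old f-string fallback produced. A minimal sketch of the pattern, assuming the site layout from the diff (the work ID and filename are hypothetical):

    from os.path import relpath
    from pathlib import Path

    site_dir = Path('site')
    (site_dir / 'thumbnails').mkdir(parents=True, exist_ok=True)
    (site_dir / 'images' / 'RJ123456').mkdir(parents=True, exist_ok=True)

    # one directory scan up front instead of one scan per work
    thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()}

    work_id = 'RJ123456'
    fallback = site_dir / 'images' / work_id / 'page-001.jpg'  # stands in for images[0]
    print(relpath(thumbnail_files.get(work_id, fallback), site_dir))
    # -> images/RJ123456/page-001.jpg when no thumbnail file exists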
@@ -1137,13 +1155,16 @@ def generate(args):
         }
         works.append(work)
+        print(f'\x1b[2K\r{idx+1} database entries read...', end='')
+    print()
 
     for (idx, work) in enumerate(works):
         def suggestion_priority(other_work):
             if other_work is work:
                 return -2
             if work['series'] and work['series'] == other_work['series']:
                 return -1
-            return similarity(work['title'], other_work['title'])
+            return similarity(work['title'], other_work['title'], cache_cur)
 
         suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)
         work_dir = site_dir / 'works' / work['id']
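The new progress lines lean on two terminal conventions: \x1b[2K erases the current line and \r returns the cursor to column 0, so each print() with end='' overwrites the previous count in place, and a bare print() (or, as in the loop above, a conditional end='\n' on the last iteration) finishes the line. A minimal sketch:

    import time

    total = 5
    for idx in range(total):
        # \x1b[2K clears the line, \r rewinds the cursor; end='' holds the newline
        print(f'\x1b[2K\r{idx+1}/{total} works processed...', end='')
        time.sleep(0.1)
    print()  # emit the final newline once the loop is done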
@@ -1155,6 +1176,7 @@ def generate(args):
             f.write(viewer_template.render(depth=3, work=work, title=work['title']))
         print(f'\x1b[2K\r{idx+1}/{len(works)} works processed...', end=('' if idx+1 < len(works) else '\n'))
+    cache_con.commit()
 
     uca = pyuca.Collator().sort_key
 
     def make_categorization(categorization, query, work_filter, work_style_cards=False):
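Deferring cache_con.commit() to after the works loop batches every cached INSERT into a single transaction instead of paying per-statement commit overhead. A self-contained sketch of the same batching (the scores are hypothetical):

    import sqlite3

    con = sqlite3.connect(":memory:")
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")

    for shorter, longer, score in [("ab", "abc", 4), ("abc", "abcd", 9)]:
        cur.execute(
            "INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
            (shorter, longer, score),
        )
    con.commit()  # one commit for the whole batch, as generate() does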
@@ -1162,7 +1184,7 @@ def generate(args):
        cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
        cat_samples = {}
 
-        for cat in cats:
+        for (idx, cat) in enumerate(cats):
            cat_works = list(filter(work_filter(cat), works))
            cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None
 
@@ -1176,6 +1198,7 @@ def generate(args):
                    title=cat,
                    categorization=categorization,
                ))
+            print(f'\x1b[2K\r{idx+1}/{len(cats)} {categorization} processed...', end=('' if idx+1 < len(cats) else '\n'))
 
        categorization_dir.mkdir(parents=True, exist_ok=True)
        with open(categorization_dir / 'index.html', 'w') as f:
@@ -1209,13 +1232,23 @@ def generate(args):
        work_style_cards=True,
    )
 
+    debug('copying static files')
    with resources.as_file(resources.files("dlibrary")) as r:
        copy_recursive(r / 'static', site_dir / 'static')
+    debug('static files copied')
 
+    debug('writing index page')
    with open(site_dir / 'index.html', 'w') as f:
        f.write(index_template.render(depth=0, works=works))
+    debug('index page written')
 
+    debug('closing cache database')
+    cache_con.close()
+    debug('cache database closed')
+    debug('closing main database')
    con.close()
+    debug('main database closed')
 
 argparser = argparse.ArgumentParser(