optimizations and extra debug info for generate mode

This commit is contained in:
xenofem 2024-04-10 16:59:32 -04:00
parent a822a504e8
commit 21c6e11228

View file

@ -14,6 +14,7 @@ import shutil
import sqlite3
import stat
import textwrap
import time
import unicodedata
from urllib.parse import urlparse
import zipfile
@ -105,7 +106,7 @@ READONLY_DIR = READONLY_FILE | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
debug_mode = False


def debug(s):
    """Print a timestamped debug message when debug mode is enabled.

    Does nothing unless the module-level ``debug_mode`` flag is truthy.
    Each message is written to stdout as ``YYYY-MM-DD HH:MM:SS - <s>``,
    using the local time at the moment of the call, so gaps between
    consecutive messages show how long each step took.

    Args:
        s: The message to print; it is formatted with ``str()`` semantics
            via an f-string.
    """
    if debug_mode:
        # Only the timestamped form is emitted; the bare ``print(s)`` was the
        # superseded line and would have duplicated every message.
        print(f'{time.strftime("%Y-%m-%d %H:%M:%S")} - {s}')
def open_zipfile_with_encoding(path):
for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
@ -1046,8 +1047,7 @@ def copy_recursive(src, dest):
# In-process memo of similarity scores, keyed by the (shorter, longer) pair.
memoized_similarities = {}
def similarity(a, b):
def similarity(a, b, cache_cur=None):
# Scores how much two strings have in common; higher means more similar.
# `cache_cur` is an optional database cursor used as a persistent cache: it is
# queried/updated through a `similarities(shorter, longer, similarity)` table.
# With the default of None only the in-process memo above is used.
#
# Normalize argument order so (a, b) and (b, a) share one cache key: `a`
# becomes `shorter` when it has fewer characters, or the same number and sorts
# first.
if len(a) < len(b) or (len(a) == len(b) and a < b):
shorter = a
longer = b
# NOTE(review): the diff view elides the lines between the previous hunk and
# this one (presumably the opposite assignment and a base case) -- confirm
# against the full file.
@ -1060,13 +1060,22 @@ def similarity(a, b):
# The in-process memo is consulted first, before any database access.
if (shorter, longer) in memoized_similarities:
return memoized_similarities[(shorter, longer)]
# The next seven lines repeat the scoring logic that also appears further down
# without the persistent-cache check; this looks like the pre-change version
# shown by the diff view.
options = [similarity(shorter[1:], longer)]
for i in range(1, len(shorter)+1):
match_idx = longer.find(shorter[:i])
if match_idx == -1:
break
options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
result = max(options)
# Persistent cache lookup: only attempted when a cursor was supplied; a stored
# row short-circuits the recursive computation.
if cache_cur and (cached := cache_cur.execute("SELECT similarity FROM similarities WHERE shorter = ? AND longer = ?", (shorter, longer)).fetchone()) is not None:
result = cached[0]
else:
# Candidate scores. First option: drop the first character of `shorter`.
# The recursive calls do not pass `cache_cur`, so sub-problems only use the
# in-process memo and are not written to the database.
options = [similarity(shorter[1:], longer)]
# Then, for each prefix length i of `shorter` that occurs somewhere in
# `longer`: score i*i for the match plus the similarity of what remains of
# both strings after it. Stop at the first prefix that is not found, since
# `find` returned -1 for it.
for i in range(1, len(shorter)+1):
match_idx = longer.find(shorter[:i])
if match_idx == -1:
break
options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
result = max(options)
# Store the freshly computed score. No commit is issued here.
# NOTE(review): this insert presumably sits inside the `else` branch
# (indentation is lost in this view); after a cache hit it would collide with
# the table's (shorter, longer) primary key -- confirm.
if cache_cur:
cache_cur.execute(
"INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
(shorter, longer, result),
)
# Remember the result in-process whichever path produced it.
memoized_similarities[(shorter, longer)] = result
return result
@ -1090,6 +1099,7 @@ def top(items, n, key, overflow=0):
return [item for (item, score) in winners[:n+overflow]]
# Generates the site under `args.destdir / 'site'`: reads work metadata from
# `meta.db`, renders pages from the "dlibrary" package templates, copies static
# files and writes the index page. Progress is reported through `debug()`
# messages and in-place progress lines on stdout.
# NOTE: this is a diff view; several spans of the function are elided at the
# `@ ... @` hunk headers, and some superseded lines appear next to their
# replacements.
def generate(args):
debug('loading templates')
jenv = Environment(
loader=PackageLoader("dlibrary"),
autoescape=select_autoescape()
@ -1099,16 +1109,29 @@ def generate(args):
categorization_template = jenv.get_template("categorization.html")
work_template = jenv.get_template("work.html")
index_template = jenv.get_template("index.html")
debug('templates loaded')
debug('opening main database')
con = sqlite3.connect(args.destdir / 'meta.db')
cur = con.cursor()
debug('main database open')
# Separate database file holding persisted title-similarity scores; the table
# is created on first use.
debug('opening cache database')
cache_con = sqlite3.connect(args.destdir / 'cache.db')
cache_cur = cache_con.cursor()
cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")
debug('cache database open')
site_dir = args.destdir / 'site'
# Names of the entries under site/images; works without one are skipped below.
collated_work_ids = {p.name for p in (site_dir / 'images').iterdir()}
works = []
# Superseded loop header (no index); the enumerate() form below replaces it.
for (work_id, title, circle, date, description, series) in cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall():
# List the thumbnails directory once and index it by file stem, so each work
# needs only a dict lookup instead of a directory scan.
debug('checking thumbnail files')
thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()}
debug(f'{len(thumbnail_files)} thumbnail files found')
debug('running database query for works')
# fetchall() materializes the rows up front; `cur` is reused for further
# queries inside the loop body.
for (idx, (work_id, title, circle, date, description, series)) in enumerate(cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall()):
if work_id not in collated_work_ids:
continue
authors = [author for (author,) in cur.execute('SELECT author FROM authors WHERE work = ?', (work_id,))]
@ -1117,12 +1140,7 @@ def generate(args):
images = [path.name for path in (site_dir / 'images' / work_id).iterdir()]
images.sort()
# Superseded thumbnail lookup: scanned the thumbnails directory for every work
# and fell back to the first image only when no thumbnail matched.
try:
thumbnail_path = relpath(next(
f for f in (site_dir / 'thumbnails').iterdir() if f.stem == work_id
), site_dir)
except StopIteration:
thumbnail_path = f'images/{work_id}/{images[0]}'
# Replacement lookup via the prebuilt `thumbnail_files` map.
# NOTE(review): the default passed to .get() is evaluated before the lookup,
# so `images[0]` raises IndexError for a work whose image directory is empty
# even when a thumbnail exists; the try/except form above only touched
# `images[0]` on the fallback path.
thumbnail_path = relpath(thumbnail_files.get(work_id, site_dir / 'images' / work_id / images[0]), site_dir)
work = {
'id': work_id,
'title': title,
@ -1137,13 +1155,16 @@ def generate(args):
}
works.append(work)
# "\x1b[2K\r" erases the current terminal line and returns to column 0, so the
# counter is redrawn in place; the bare print() afterwards ends that line.
print(f'\x1b[2K\r{idx+1} database entries read...', end='')
print()
for (idx, work) in enumerate(works):
# Ranking key for suggestions relative to `work`: -2 for the work itself, -1
# for another work in the same (non-empty) series, otherwise the title
# similarity score.
def suggestion_priority(other_work):
if other_work is work:
return -2
if work['series'] and work['series'] == other_work['series']:
return -1
# Superseded call without the persistent cache, followed by its replacement,
# which passes the cache cursor.
return similarity(work['title'], other_work['title'])
return similarity(work['title'], other_work['title'], cache_cur)
suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)
work_dir = site_dir / 'works' / work['id']
@ -1155,6 +1176,7 @@ def generate(args):
f.write(viewer_template.render(depth=3, work=work, title=work['title']))
print(f'\x1b[2K\r{idx+1}/{len(works)} works processed...', end=('' if idx+1 < len(works) else '\n'))
# Persist the similarity rows inserted through `cache_cur`.
# NOTE(review): indentation is lost here, so whether this runs once after the
# loop or once per work cannot be told from this view -- confirm.
cache_con.commit()
uca = pyuca.Collator().sort_key
def make_categorization(categorization, query, work_filter, work_style_cards=False):
@ -1162,7 +1184,7 @@ def generate(args):
cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
cat_samples = {}
# Superseded loop header, followed by the enumerate() form whose index feeds
# the progress line below.
for cat in cats:
for (idx, cat) in enumerate(cats):
cat_works = list(filter(work_filter(cat), works))
# First matching work is kept as the category's sample, or None when the
# category has no works.
cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None
@ -1176,6 +1198,7 @@ def generate(args):
title=cat,
categorization=categorization,
))
print(f'\x1b[2K\r{idx+1}/{len(cats)} {categorization} processed...', end=('' if idx+1 < len(cats) else '\n'))
categorization_dir.mkdir(parents=True, exist_ok=True)
with open(categorization_dir / 'index.html', 'w') as f:
@ -1209,13 +1232,23 @@ def generate(args):
work_style_cards=True,
)
debug('copying static files')
with resources.as_file(resources.files("dlibrary")) as r:
copy_recursive(r / 'static', site_dir / 'static')
debug('static files copied')
debug('writing index page')
with open(site_dir / 'index.html', 'w') as f:
f.write(index_template.render(depth=0, works=works))
debug('index page written')
# Both connections are closed explicitly at the end.
# NOTE(review): no try/finally or context manager is visible around them, so
# an exception earlier in the function would skip these calls -- confirm.
debug('closing cache database')
cache_con.close()
debug('cache database closed')
debug('closing main database')
con.close()
debug('main database closed')
argparser = argparse.ArgumentParser(