From 8e32c7cbca44aad2fc5676ffa1bd4d447b88ae77 Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Sat, 8 Jun 2024 06:40:17 -0400
Subject: [PATCH 1/2] Add more debug info for collation failures

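Collation failures previously just returned None with nothing in the
log to explain why. Add debug output tracing the image-ordering,
regex-splitting, and language-selection paths, and log which source
file types (images, PDFs, directories, unknown files) were present
when no collation strategy applies.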
---
 dlibrary/dlibrary.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 386230b..cb6cb0e 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -531,6 +531,7 @@ class Collator:
                 return split_attempt
 
         if all(src.is_file() and is_image(src) for src in srcs):
+            debug('Attempting to detect ordering for image files')
             ordering = complete_prefix_number_ordering(srcs)
             if not ordering and self.args.sort:
                 ordering = srcs.copy()
@@ -542,6 +543,11 @@ class Collator:
             else:
                 return None
 
+        debug('Unable to collate available file types:')
+        debug(f'Images: {[src for src in srcs if src.is_file() and is_image(src)]}')
+        debug(f'PDFs: {[src for src in srcs if src.is_file() and is_pdf(src)]}')
+        debug(f'Directories: {[src for src in srcs if src.is_dir()]}')
+        debug(f'Unknown files: {[src for src in srcs if src.is_file() and not is_image(src) and not is_pdf(src)]}')
         return None
 
     def link_pdf(self, src):
@@ -590,6 +596,8 @@ class Collator:
         if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
             return False
 
+        debug(f'Splitting sources based on regex: {[early_srcs, middle_srcs, late_srcs]}')
+
         early_page_collation = self.collate_from_paths(early_srcs)
         if early_page_collation is None:
             return None
@@ -671,6 +679,7 @@ class Collator:
         if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs):
             return False
 
+        debug('Detected multiple language options, selecting preferred language')
         srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.args.locale].search(nname(src))]
         if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0:
             return False

From 06d782e77aa38b4dc007ea4d21351f582a31fadf Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Sat, 8 Jun 2024 07:20:44 -0400
Subject: [PATCH 2/2] Cache suggestions much more efficiently

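The old cache stored one similarity row for every pair of work titles
compared, so it grew quadratically with the number of works. Cache the
final suggestions instead: one row per (work, suggested) pair, capped
at SUGGESTED_WORKS_COUNT per work (plus any ties at the cutoff). On
regeneration, works that already have cached suggestions are skipped
entirely; each new work has its title similarity computed once against
every other work, and the other work's cached list is updated
symmetrically when the new work displaces one of its suggestions.

The top-N bookkeeping moves from the top() helper into a TopScoreList
class that keeps the highest-scoring items seen so far, retaining ties
at the cutoff. insert() returns whatever fell out of the list, or
[item] itself when the item doesn't make the cut, which the caller
uses to decide which cache rows to delete. Roughly:

    scores = TopScoreList(2)
    scores.insert('a', 5)   # -> []
    scores.insert('b', 3)   # -> []
    scores.insert('c', 4)   # -> ['b']  ('b' fell below the cutoff)
    scores.insert('d', 1)   # -> ['d']  (rejected outright)
    list(scores)            # -> ['a', 'c']

Iterating yields at most limit items, shuffled within each score tier
so that tied works are suggested in a random order.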
---
 dlibrary/dlibrary.py | 129 +++++++++++++++++++++++++++----------------
 1 file changed, 82 insertions(+), 47 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index cb6cb0e..c963ee1 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -1150,7 +1150,7 @@ def copy_recursive(src, dest):
 
 
 memoized_similarities = {}
-def similarity(a, b, cache_cur=None):
+def string_similarity(a, b):
     if len(a) < len(b) or (len(a) == len(b) and a < b):
         shorter = a
         longer = b
@@ -1163,43 +1163,49 @@ def similarity(a, b, cache_cur=None):
     if (shorter, longer) in memoized_similarities:
         return memoized_similarities[(shorter, longer)]
 
-    if cache_cur and (cached := cache_cur.execute("SELECT similarity FROM similarities WHERE shorter = ? AND longer = ?", (shorter, longer)).fetchone()) is not None:
-        result = cached[0]
-    else:
-        options = [similarity(shorter[1:], longer)]
-        for i in range(1, len(shorter)+1):
-            match_idx = longer.find(shorter[:i])
-            if match_idx == -1:
-                break
-            options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
-        result = max(options)
-
-        if cache_cur:
-            cache_cur.execute(
-                "INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
-                (shorter, longer, result),
-            )
+    options = [string_similarity(shorter[1:], longer)]
+    for i in range(1, len(shorter)+1):
+        match_idx = longer.find(shorter[:i])
+        if match_idx == -1:
+            break
+        options.append(i*i + string_similarity(shorter[i:], longer[match_idx+i:]))
+    result = max(options)
 
     memoized_similarities[(shorter, longer)] = result
     return result
 
-def top(items, n, key, overflow=0):
-    winners = []
-    for item in items:
-        score = key(item)
-        if len(winners) < n or score >= winners[-1][1]:
-            for i in range(len(winners) + 1):
-                if i == len(winners) or score >= winners[i][1]:
-                    winners.insert(i, (item, score))
-                    break
-            while len(winners) > n and winners[-1][1] < winners[n-1][1]:
-                winners.pop()
+class TopScoreList:
+    def __init__(self, limit):
+        self.limit = limit
+        self.items_with_scores = []
+        self.randomized = True
 
-    # shuffle followed by stable sort to randomly shuffle within each score tier
-    random.shuffle(winners)
-    winners.sort(key=lambda w: w[1], reverse=True)
+    def insert(self, item, score):
+        if len(self.items_with_scores) >= self.limit and score < self.items_with_scores[-1][1]:
+            return [item]
 
-    return [item for (item, score) in winners[:n+overflow]]
+        self.randomized = False
+        for i in range(len(self.items_with_scores) + 1):
+            if i == len(self.items_with_scores) or score >= self.items_with_scores[i][1]:
+                self.items_with_scores.insert(i, (item, score))
+                break
+        removed_items = []
+        while len(self.items_with_scores) > self.limit and self.items_with_scores[-1][1] < self.items_with_scores[self.limit-1][1]:
+            removed_items.append(self.items_with_scores.pop()[0])
+        return removed_items
+
+    def _randomize(self):
+        if self.randomized:
+            return
+
+        # shuffle followed by stable sort to randomly shuffle within each score tier
+        random.shuffle(self.items_with_scores)
+        self.items_with_scores.sort(key=lambda i: i[1], reverse=True)
+        self.randomized = True
+
+    def __iter__(self):
+        self._randomize()
+        return (item for (item, _) in self.items_with_scores[:self.limit])
 
 def generate(args):
     debug('loading templates')
@@ -1219,17 +1225,21 @@ def generate(args):
     cur = con.cursor()
     debug('main database open')
 
-    debug('opening cache database')
+    debug('opening suggestion cache database')
     cache_con = sqlite3.connect(args.destdir / 'cache.db')
     cache_cur = cache_con.cursor()
-    cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")
-    debug('cache database open')
+    cache_cur.execute("CREATE TABLE IF NOT EXISTS suggestions(work TEXT, suggested TEXT, similarity INT, PRIMARY KEY(work, suggested))")
+    debug('suggestion cache database open')
+    cached_suggestions = {}
+    for (work, suggested, similarity) in cache_cur.execute('SELECT work, suggested, similarity FROM suggestions'):
+        cached_suggestions.setdefault(work, TopScoreList(SUGGESTED_WORKS_COUNT)).insert(suggested, similarity)
+    debug('cached suggestions loaded')
 
     site_dir = args.destdir / 'site'
 
     collated_work_ids = {p.name for p in (site_dir / 'images').iterdir()}
 
-    works = []
+    works = {}
     debug('checking thumbnail files')
     thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()}
     debug(f'{len(thumbnail_files)} thumbnail files found')
@@ -1256,30 +1266,55 @@ def generate(args):
             'thumbnail_path': thumbnail_path,
             'images': images,
         }
-        works.append(work)
+        works[work_id] = work
 
         print(f'{ANSI_LINECLEAR}{idx+1} database entries read...', end='')
     print()
 
-    for (idx, work) in enumerate(works):
-        def suggestion_priority(other_work):
+    for work in works.values():
+        if work['id'] in cached_suggestions:
+            continue
+        debug(f'Computing suggestions for new work {work["title"]}')
+        cached_suggestions[work['id']] = TopScoreList(SUGGESTED_WORKS_COUNT)
+        for other_work in works.values():
             if other_work is work:
-                return -2
+                continue
             if work['series'] and work['series'] == other_work['series']:
-                return -1
-            return similarity(work['title'], other_work['title'], cache_cur)
-        suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)
+                continue
+            if other_work['id'] not in cached_suggestions:
+                continue # we'll get to it later
 
+            similarity = string_similarity(work['title'], other_work['title'])
+            cached_suggestions[work['id']].insert(other_work['id'], similarity)
+            removed = cached_suggestions[other_work['id']].insert(work['id'], similarity)
+            if removed != [work['id']]:
+                cache_cur.executemany(
+                    'DELETE FROM suggestions WHERE work = :work AND suggested = :suggested',
+                    [{ "work": other_work['id'], "suggested": item } for item in removed],
+                )
+                cache_cur.execute(
+                    'INSERT INTO suggestions(work, suggested, similarity) VALUES(:work, :suggested, :similarity)',
+                    { "work": other_work['id'], "suggested": work['id'], "similarity": similarity },
+                )
+        cache_cur.executemany(
+            'INSERT INTO suggestions(work, suggested, similarity) VALUES(:work, :suggested, :similarity)',
+            [{ "work": work['id'], "suggested": suggested, "similarity": similarity } for (suggested, similarity) in cached_suggestions[work['id']].items_with_scores],
+        )
+        cache_con.commit()
+
+    for (idx, work) in enumerate(works.values()):
         work_dir = site_dir / 'works' / work['id']
         viewer_dir = work_dir / 'view'
         viewer_dir.mkdir(parents=True, exist_ok=True)
         with open(work_dir / 'index.html', 'w') as f:
-            f.write(work_template.render(depth=2, work=work, title=work['title'], suggested=suggested))
+            f.write(work_template.render(
+                depth=2, work=work, title=work['title'],
+                suggested=[works[suggested] for suggested in cached_suggestions[work['id']]],
+            ))
         with open(viewer_dir / 'index.html', 'w') as f:
             f.write(viewer_template.render(depth=3, work=work, title=work['title']))
 
         count_progress(idx, len(works), 'works processed')
-    cache_con.commit()
 
     uca = pyuca.Collator().sort_key
     def make_categorization(categorization, query, work_filter, work_style_cards=False):
@@ -1288,7 +1323,7 @@ def generate(args):
         cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
         cat_samples = {}
         for (idx, cat) in enumerate(cats):
-            cat_works = list(filter(work_filter(cat), works))
+            cat_works = list(filter(work_filter(cat), works.values()))
             cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None
 
             safeish_cat = cat.replace('/', ' ')
@@ -1342,7 +1377,7 @@ def generate(args):
 
     debug('writing index page')
     with open(site_dir / 'index.html', 'w') as f:
-        f.write(index_template.render(depth=0, works=works))
+        f.write(index_template.render(depth=0, works=list(works.values())))
     debug('index page written')
 
     debug('closing cache database')