much more efficient caching for suggestions

This commit is contained in:
xenofem 2024-06-08 07:20:44 -04:00
parent 8e32c7cbca
commit 06d782e77a

View file

@@ -1150,7 +1150,7 @@ def copy_recursive(src, dest):
memoized_similarities = {}
def similarity(a, b, cache_cur=None):
def string_similarity(a, b):
if len(a) < len(b) or (len(a) == len(b) and a < b):
shorter = a
longer = b
@@ -1163,43 +1163,49 @@ def similarity(a, b, cache_cur=None):
if (shorter, longer) in memoized_similarities:
return memoized_similarities[(shorter, longer)]
if cache_cur and (cached := cache_cur.execute("SELECT similarity FROM similarities WHERE shorter = ? AND longer = ?", (shorter, longer)).fetchone()) is not None:
result = cached[0]
options = [similarity(shorter[1:], longer)]
options = [string_similarity(shorter[1:], longer)]
for i in range(1, len(shorter)+1):
match_idx = longer.find(shorter[:i])
if match_idx == -1:
options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
options.append(i*i + string_similarity(shorter[i:], longer[match_idx+i:]))
result = max(options)
if cache_cur:
"INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
(shorter, longer, result),
memoized_similarities[(shorter, longer)] = result
return result
def top(items, n, key, overflow=0):
winners = []
for item in items:
score = key(item)
if len(winners) < n or score >= winners[-1][1]:
for i in range(len(winners) + 1):
if i == len(winners) or score >= winners[i][1]:
winners.insert(i, (item, score))
class TopScoreList:
def __init__(self, limit):
self.limit = limit
self.items_with_scores = []
self.randomized = True
def insert(self, item, score):
if len(self.items_with_scores) >= self.limit and score < self.items_with_scores[-1][1]:
return [item]
self.randomized = False
for i in range(len(self.items_with_scores) + 1):
if i == len(self.items_with_scores) or score >= self.items_with_scores[i][1]:
self.items_with_scores.insert(i, (item, score))
while len(winners) > n and winners[-1][1] < winners[n-1][1]:
removed_items = []
while len(self.items_with_scores) > self.limit and self.items_with_scores[-1][1] < self.items_with_scores[self.limit-1][1]:
return removed_items
def _randomize(self):
if self.randomized:
# shuffle followed by stable sort to randomly shuffle within each score tier
winners.sort(key=lambda w: w[1], reverse=True)
self.items_with_scores.sort(key=lambda i: i[1], reverse=True)
self.randomized = True
return [item for (item, score) in winners[:n+overflow]]
def __iter__(self):
return (item for (item, _) in self.items_with_scores[:self.limit])
def generate(args):
debug('loading templates')
@@ -1219,17 +1225,21 @@ def generate(args):
cur = con.cursor()
debug('main database open')
debug('opening cache database')
debug('opening suggestion cache database')
cache_con = sqlite3.connect(args.destdir / 'cache.db')
cache_cur = cache_con.cursor()
cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")
debug('cache database open')
cache_cur.execute("CREATE TABLE IF NOT EXISTS suggestions(work TEXT, suggested TEXT, similarity INT, PRIMARY KEY(work, suggested))")
debug('suggestion cache database open')
cached_suggestions = {}
for (work, suggested, similarity) in cache_cur.execute('SELECT work, suggested, similarity FROM suggestions'):
cached_suggestions.setdefault(work, TopScoreList(SUGGESTED_WORKS_COUNT)).insert(suggested, similarity)
debug('cached suggestions loaded')
site_dir = args.destdir / 'site'
collated_work_ids = { for p in (site_dir / 'images').iterdir()}
works = []
works = {}
debug('checking thumbnail files')
thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()}
debug(f'{len(thumbnail_files)} thumbnail files found')
@@ -1256,30 +1266,55 @@ def generate(args):
'thumbnail_path': thumbnail_path,
'images': images,
works[work_id] = work
print(f'{ANSI_LINECLEAR}{idx+1} database entries read...', end='')
for (idx, work) in enumerate(works):
def suggestion_priority(other_work):
for work in works.values():
if work['id'] in cached_suggestions:
debug(f'Computing suggestions for new work {work["title"]}')
cached_suggestions[work['id']] = TopScoreList(SUGGESTED_WORKS_COUNT)
for other_work in works.values():
if other_work is work:
return -2
if work['series'] and work['series'] == other_work['series']:
return -1
return similarity(work['title'], other_work['title'], cache_cur)
suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)
if other_work['id'] not in cached_suggestions:
continue # we'll get to it later
similarity = string_similarity(work['title'], other_work['title'])
cached_suggestions[work['id']].insert(other_work['id'], similarity)
removed = cached_suggestions[other_work['id']].insert(work['id'], similarity)
if removed != [work['id']]:
'DELETE FROM suggestions WHERE work = :work AND suggested = :suggested',
[{ "work": other_work['id'], "suggested": item } for item in removed],
'INSERT INTO suggestions(work, suggested, similarity) VALUES(:work, :suggested, :similarity)',
{ "work": other_work['id'], "suggested": work['id'], "similarity": similarity },
'INSERT INTO suggestions(work, suggested, similarity) VALUES(:work, :suggested, :similarity)',
[{ "work": work['id'], "suggested": suggested, "similarity": similarity } for (suggested, similarity) in cached_suggestions[work['id']].items_with_scores],
for (idx, work) in enumerate(works.values()):
work_dir = site_dir / 'works' / work['id']
viewer_dir = work_dir / 'view'
viewer_dir.mkdir(parents=True, exist_ok=True)
with open(work_dir / 'index.html', 'w') as f:
f.write(work_template.render(depth=2, work=work, title=work['title'], suggested=suggested))
depth=2, work=work, title=work['title'],
suggested=[works[suggested] for suggested in cached_suggestions[work['id']]],
with open(viewer_dir / 'index.html', 'w') as f:
f.write(viewer_template.render(depth=3, work=work, title=work['title']))
count_progress(idx, len(works), 'works processed')
uca = pyuca.Collator().sort_key
def make_categorization(categorization, query, work_filter, work_style_cards=False):
@@ -1288,7 +1323,7 @@ def generate(args):
cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
cat_samples = {}
for (idx, cat) in enumerate(cats):
cat_works = list(filter(work_filter(cat), works))
cat_works = list(filter(work_filter(cat), works.values()))
cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None
safeish_cat = cat.replace('/', ' ')
@@ -1342,7 +1377,7 @@ def generate(args):
debug('writing index page')
with open(site_dir / 'index.html', 'w') as f:
f.write(index_template.render(depth=0, works=works))
f.write(index_template.render(depth=0, works=list(works.values())))
debug('index page written')
debug('closing cache database')