Compare commits
No commits in common. "06d782e77aa38b4dc007ea4d21351f582a31fadf" and "c26300d752e4506a43c31ff40167443c168d1829" have entirely different histories.
06d782e77a...c26300d752
@@ -531,7 +531,6 @@ class Collator:
             return split_attempt

         if all(src.is_file() and is_image(src) for src in srcs):
-            debug('Attempting to detect ordering for image files')
             ordering = complete_prefix_number_ordering(srcs)
             if not ordering and self.args.sort:
                 ordering = srcs.copy()
@@ -543,11 +542,6 @@ class Collator:
         else:
             return None

-        debug('Unable to collate available file types:')
-        debug(f'Images: {[src for src in srcs if src.is_file() and is_image(src)]}')
-        debug(f'PDFs: {[src for src in srcs if src.is_file() and is_pdf(src)]}')
-        debug(f'Directories: {[src for src in srcs if src.is_dir()]}')
-        debug(f'Unknown files: {[src for src in srcs if src.is_file() and not is_image(src) and not is_pdf(src)]}')
         return None

     def link_pdf(self, src):
@@ -596,8 +590,6 @@ class Collator:
         if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
             return False

-        debug(f'Splitting sources based on regex: {[early_srcs, middle_srcs, late_srcs]}')
-
         early_page_collation = self.collate_from_paths(early_srcs)
         if early_page_collation is None:
             return None
@@ -679,7 +671,6 @@ class Collator:
         if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs):
             return False

-        debug('Detected multiple language options, selecting preferred language')
         srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.args.locale].search(nname(src))]
         if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0:
             return False
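Note: the four hunks above only drop verbose debug() logging from the Collator collation and language-selection paths; control flow is otherwise unchanged.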
@@ -1150,7 +1141,7 @@ def copy_recursive(src, dest):


 memoized_similarities = {}
-def string_similarity(a, b):
+def similarity(a, b, cache_cur=None):
     if len(a) < len(b) or (len(a) == len(b) and a < b):
         shorter = a
         longer = b
@@ -1163,49 +1154,43 @@ def string_similarity(a, b):
     if (shorter, longer) in memoized_similarities:
         return memoized_similarities[(shorter, longer)]

-    options = [string_similarity(shorter[1:], longer)]
-    for i in range(1, len(shorter)+1):
-        match_idx = longer.find(shorter[:i])
-        if match_idx == -1:
-            break
-        options.append(i*i + string_similarity(shorter[i:], longer[match_idx+i:]))
-    result = max(options)
-
+    if cache_cur and (cached := cache_cur.execute("SELECT similarity FROM similarities WHERE shorter = ? AND longer = ?", (shorter, longer)).fetchone()) is not None:
+        result = cached[0]
+    else:
+        options = [similarity(shorter[1:], longer)]
+        for i in range(1, len(shorter)+1):
+            match_idx = longer.find(shorter[:i])
+            if match_idx == -1:
+                break
+            options.append(i*i + similarity(shorter[i:], longer[match_idx+i:]))
+        result = max(options)
+
+        if cache_cur:
+            cache_cur.execute(
+                "INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)",
+                (shorter, longer, result),
+            )
+
     memoized_similarities[(shorter, longer)] = result
     return result

-class TopScoreList:
-    def __init__(self, limit):
-        self.limit = limit
-        self.items_with_scores = []
-        self.randomized = True
-
-    def insert(self, item, score):
-        if len(self.items_with_scores) >= self.limit and score < self.items_with_scores[-1][1]:
-            return [item]
-
-        self.randomized = False
-        for i in range(len(self.items_with_scores) + 1):
-            if i == len(self.items_with_scores) or score >= self.items_with_scores[i][1]:
-                self.items_with_scores.insert(i, (item, score))
-                break
-        removed_items = []
-        while len(self.items_with_scores) > self.limit and self.items_with_scores[-1][1] < self.items_with_scores[self.limit-1][1]:
-            removed_items.append(self.items_with_scores.pop()[0])
-        return removed_items
-
-    def _randomize(self):
-        if self.randomized:
-            return
-
-        # shuffle followed by stable sort to randomly shuffle within each score tier
-        random.shuffle(self.items_with_scores)
-        self.items_with_scores.sort(key=lambda i: i[1], reverse=True)
-        self.randomized = True
-
-    def __iter__(self):
-        self._randomize()
-        return (item for (item, _) in self.items_with_scores[:self.limit])
+def top(items, n, key, overflow=0):
+    winners = []
+    for item in items:
+        score = key(item)
+        if len(winners) < n or score >= winners[-1][1]:
+            for i in range(len(winners) + 1):
+                if i == len(winners) or score >= winners[i][1]:
+                    winners.insert(i, (item, score))
+                    break
+            while len(winners) > n and winners[-1][1] < winners[n-1][1]:
+                winners.pop()
+
+    # shuffle followed by stable sort to randomly shuffle within each score tier
+    random.shuffle(winners)
+    winners.sort(key=lambda w: w[1], reverse=True)
+
+    return [item for (item, score) in winners[:n+overflow]]

 def generate(args):
     debug('loading templates')
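Note: the rewritten similarity() keeps the in-process memo dict but, when a cache_cur is supplied, consults an on-disk SQLite table for the top-level pair before recomputing and writes fresh results back (inner recursive calls still use the memo dict only). A minimal sketch of exercising it in isolation, assuming the similarity() above is in scope; ':memory:' stands in for args.destdir / 'cache.db', and the table definition matches the CREATE TABLE added later in this diff:

    import sqlite3

    cache_con = sqlite3.connect(':memory:')  # stand-in for the real cache.db
    cache_cur = cache_con.cursor()
    cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities("
                      "shorter TEXT, longer TEXT, similarity INT, "
                      "PRIMARY KEY(shorter, longer))")

    score = similarity('work title vol. 1', 'work title vol. 2', cache_cur)
    cache_con.commit()  # a later run with the same pair reads the stored score instead of recursing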
@@ -1225,21 +1210,17 @@ def generate(args):
     cur = con.cursor()
     debug('main database open')

-    debug('opening suggestion cache database')
+    debug('opening cache database')
     cache_con = sqlite3.connect(args.destdir / 'cache.db')
     cache_cur = cache_con.cursor()
-    cache_cur.execute("CREATE TABLE IF NOT EXISTS suggestions(work TEXT, suggested TEXT, similarity INT, PRIMARY KEY(work, suggested))")
-    debug('suggestion cache database open')
-    cached_suggestions = {}
-    for (work, suggested, similarity) in cache_cur.execute('SELECT work, suggested, similarity FROM suggestions'):
-        cached_suggestions.setdefault(work, TopScoreList(SUGGESTED_WORKS_COUNT)).insert(suggested, similarity)
-    debug('cached suggestions loaded')
+    cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))")
+    debug('cache database open')

     site_dir = args.destdir / 'site'

     collated_work_ids = {p.name for p in (site_dir / 'images').iterdir()}

-    works = {}
+    works = []
     debug('checking thumbnail files')
     thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()}
     debug(f'{len(thumbnail_files)} thumbnail files found')
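Note: the cache schema moves from per-work suggestion lists (suggestions(work, suggested, similarity)) to raw title-pair scores (similarities(shorter, longer, similarity)), which stay valid as works are added or removed. A hypothetical spot check after a run, using the cursor opened above:

    for row in cache_cur.execute(
            'SELECT shorter, longer, similarity FROM similarities '
            'ORDER BY similarity DESC LIMIT 10'):
        print(row)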
@@ -1266,55 +1247,30 @@ def generate(args):
             'thumbnail_path': thumbnail_path,
             'images': images,
         }
-        works[work_id] = work
+        works.append(work)

         print(f'{ANSI_LINECLEAR}{idx+1} database entries read...', end='')
     print()

-    for work in works.values():
-        if work['id'] in cached_suggestions:
-            continue
-        debug(f'Computing suggestions for new work {work["title"]}')
-        cached_suggestions[work['id']] = TopScoreList(SUGGESTED_WORKS_COUNT)
-        for other_work in works.values():
-            if other_work is work:
-                continue
-            if work['series'] and work['series'] == other_work['series']:
-                continue
-            if other_work['id'] not in cached_suggestions:
-                continue # we'll get to it later
+    for (idx, work) in enumerate(works):
+        def suggestion_priority(other_work):
+            if other_work is work:
+                return -2
+            if work['series'] and work['series'] == other_work['series']:
+                return -1
+            return similarity(work['title'], other_work['title'], cache_cur)
+        suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)

-            similarity = string_similarity(work['title'], other_work['title'])
-            cached_suggestions[work['id']].insert(other_work['id'], similarity)
-            removed = cached_suggestions[other_work['id']].insert(work['id'], similarity)
-            if removed != [work['id']]:
-                cache_cur.executemany(
-                    'DELETE FROM suggestions WHERE work = :work AND suggested = :suggested',
-                    [{ "work": other_work['id'], "suggested": item } for item in removed],
-                )
-                cache_cur.execute(
-                    'INSERT INTO suggestions(work, suggested, similarity) VALUES(:work, :suggested, :similarity)',
-                    { "work": other_work['id'], "suggested": work['id'], "similarity": similarity },
-                )
-        cache_cur.executemany(
-            'INSERT INTO suggestions(work, suggested, similarity) VALUES(:work, :suggested, :similarity)',
-            [{ "work": work['id'], "suggested": suggested, "similarity": similarity } for (suggested, similarity) in cached_suggestions[work['id']].items_with_scores],
-        )
-        cache_con.commit()
-
-    for (idx, work) in enumerate(works.values()):
         work_dir = site_dir / 'works' / work['id']
         viewer_dir = work_dir / 'view'
         viewer_dir.mkdir(parents=True, exist_ok=True)
         with open(work_dir / 'index.html', 'w') as f:
-            f.write(work_template.render(
-                depth=2, work=work, title=work['title'],
-                suggested=[works[suggested] for suggested in cached_suggestions[work['id']]],
-            ))
+            f.write(work_template.render(depth=2, work=work, title=work['title'], suggested=suggested))
         with open(viewer_dir / 'index.html', 'w') as f:
             f.write(viewer_template.render(depth=3, work=work, title=work['title']))

         count_progress(idx, len(works), 'works processed')
+    cache_con.commit()

     uca = pyuca.Collator().sort_key
     def make_categorization(categorization, query, work_filter, work_style_cards=False):
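Note: top() replaces the stateful TopScoreList. It scans items once, keeps every candidate tied with the n-th best score, then shuffles within score tiers so ties are broken randomly before trimming to n+overflow results; and since suggestions are now recomputed (and cached) inline, cache_con.commit() runs once after the loop instead of per work. An illustrative call on hypothetical data, with key=len standing in for suggestion_priority:

    words = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
    print(top(words, 2, key=len))              # 'epsilon' plus one of the three 5-letter words
    print(top(words, 2, key=len, overflow=1))  # up to three results, ties shuffled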
@@ -1323,7 +1279,7 @@ def generate(args):
         cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
         cat_samples = {}
         for (idx, cat) in enumerate(cats):
-            cat_works = list(filter(work_filter(cat), works.values()))
+            cat_works = list(filter(work_filter(cat), works))
             cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None

             safeish_cat = cat.replace('/', ' ')
@@ -1377,7 +1333,7 @@ def generate(args):

     debug('writing index page')
     with open(site_dir / 'index.html', 'w') as f:
-        f.write(index_template.render(depth=0, works=list(works.values())))
+        f.write(index_template.render(depth=0, works=works))
     debug('index page written')

     debug('closing cache database')