diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index c963ee1..386230b 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -531,7 +531,6 @@ class Collator:
             return split_attempt
 
         if all(src.is_file() and is_image(src) for src in srcs):
-            debug('Attempting to detect ordering for image files')
             ordering = complete_prefix_number_ordering(srcs)
             if not ordering and self.args.sort:
                 ordering = srcs.copy()
@@ -543,11 +542,6 @@ class Collator:
             else:
                 return None
 
-        debug('Unable to collate available file types:')
-        debug(f'Images: {[src for src in srcs if src.is_file() and is_image(src)]}')
-        debug(f'PDFs: {[src for src in srcs if src.is_file() and is_pdf(src)]}')
-        debug(f'Directories: {[src for src in srcs if src.is_dir()]}')
-        debug(f'Unknown files: {[src for src in srcs if src.is_file() and not is_image(src) and not is_pdf(src)]}')
         return None
 
     def link_pdf(self, src):
@@ -596,8 +590,6 @@ class Collator:
         if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
             return False
 
-        debug(f'Splitting sources based on regex: {[early_srcs, middle_srcs, late_srcs]}')
-
        early_page_collation = self.collate_from_paths(early_srcs)
        if early_page_collation is None:
            return None
@@ -679,7 +671,6 @@ class Collator:
         if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs):
             return False
 
-        debug('Detected multiple language options, selecting preferred language')
         srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.args.locale].search(nname(src))]
         if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0:
             return False
@@ -1150,7 +1141,7 @@ def copy_recursive(src, dest):
 
 memoized_similarities = {}
 
-def string_similarity(a, b):
+def similarity(a, b, cache_cur=None):
     if len(a) < len(b) or (len(a) == len(b) and a < b):
         shorter = a
         longer = b
@@ -1163,49 +1154,45 @@ def string_similarity(a, b):
     if (shorter, longer) in memoized_similarities:
         return memoized_similarities[(shorter, longer)]
 
-    options = [string_similarity(shorter[1:], longer)]
-    for i in range(1, len(shorter)+1):
-        match_idx = longer.find(shorter[:i])
-        if match_idx == -1:
-            break
-        options.append(i*i + string_similarity(shorter[i:], longer[match_idx+i:]))
-    result = max(options)
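+    # consult the on-disk similarity cache, if a cursor was provided;
+    # recursive calls below skip it and rely on the in-memory memo alone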
AND longer = ?", (shorter, longer)).fetchone()) is not None: + result = cached[0] + else: + options = [similarity(shorter[1:], longer)] + for i in range(1, len(shorter)+1): + match_idx = longer.find(shorter[:i]) + if match_idx == -1: + break + options.append(i*i + similarity(shorter[i:], longer[match_idx+i:])) + result = max(options) + + if cache_cur: + cache_cur.execute( + "INSERT INTO similarities(shorter, longer, similarity) VALUES(?, ?, ?)", + (shorter, longer, result), + ) memoized_similarities[(shorter, longer)] = result return result -class TopScoreList: - def __init__(self, limit): - self.limit = limit - self.items_with_scores = [] - self.randomized = True +def top(items, n, key, overflow=0): + winners = [] + for item in items: + score = key(item) + if len(winners) < n or score >= winners[-1][1]: + for i in range(len(winners) + 1): + if i == len(winners) or score >= winners[i][1]: + winners.insert(i, (item, score)) + break + while len(winners) > n and winners[-1][1] < winners[n-1][1]: + winners.pop() - def insert(self, item, score): - if len(self.items_with_scores) >= self.limit and score < self.items_with_scores[-1][1]: - return [item] + # shuffle followed by stable sort to randomly shuffle within each score tier + random.shuffle(winners) + winners.sort(key=lambda w: w[1], reverse=True) - self.randomized = False - for i in range(len(self.items_with_scores) + 1): - if i == len(self.items_with_scores) or score >= self.items_with_scores[i][1]: - self.items_with_scores.insert(i, (item, score)) - break - removed_items = [] - while len(self.items_with_scores) > self.limit and self.items_with_scores[-1][1] < self.items_with_scores[self.limit-1][1]: - removed_items.append(self.items_with_scores.pop()[0]) - return removed_items - - def _randomize(self): - if self.randomized: - return - - # shuffle followed by stable sort to randomly shuffle within each score tier - random.shuffle(self.items_with_scores) - self.items_with_scores.sort(key=lambda i: i[1], reverse=True) - self.randomized = True - - def __iter__(self): - self._randomize() - return (item for (item, _) in self.items_with_scores[:self.limit]) + return [item for (item, score) in winners[:n+overflow]] def generate(args): debug('loading templates') @@ -1225,21 +1210,17 @@ def generate(args): cur = con.cursor() debug('main database open') - debug('opening suggestion cache database') + debug('opening cache database') cache_con = sqlite3.connect(args.destdir / 'cache.db') cache_cur = cache_con.cursor() - cache_cur.execute("CREATE TABLE IF NOT EXISTS suggestions(work TEXT, suggested TEXT, similarity INT, PRIMARY KEY(work, suggested))") - debug('suggestion cache database open') - cached_suggestions = {} - for (work, suggested, similarity) in cache_cur.execute('SELECT work, suggested, similarity FROM suggestions'): - cached_suggestions.setdefault(work, TopScoreList(SUGGESTED_WORKS_COUNT)).insert(suggested, similarity) - debug('cached suggestions loaded') + cache_cur.execute("CREATE TABLE IF NOT EXISTS similarities(shorter TEXT, longer TEXT, similarity INT, PRIMARY KEY(shorter, longer))") + debug('cache database open') site_dir = args.destdir / 'site' collated_work_ids = {p.name for p in (site_dir / 'images').iterdir()} - works = {} + works = [] debug('checking thumbnail files') thumbnail_files = {f.stem: f for f in (site_dir / 'thumbnails').iterdir()} debug(f'{len(thumbnail_files)} thumbnail files found') @@ -1266,55 +1247,30 @@ def generate(args): 'thumbnail_path': thumbnail_path, 'images': images, } - works[work_id] = work + 
             if other_work is work:
-                continue
+                return -2
             if work['series'] and work['series'] == other_work['series']:
-                continue
-            if other_work['id'] not in cached_suggestions:
-                continue # we'll get to it later
+                return -1
+            return similarity(work['title'], other_work['title'], cache_cur)
+        suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)
 
-            similarity = string_similarity(work['title'], other_work['title'])
-            cached_suggestions[work['id']].insert(other_work['id'], similarity)
-            removed = cached_suggestions[other_work['id']].insert(work['id'], similarity)
-            if removed != [work['id']]:
-                cache_cur.executemany(
-                    'DELETE FROM suggestions WHERE work = :work AND suggested = :suggested',
-                    [{ "work": other_work['id'], "suggested": item } for item in removed],
-                )
-                cache_cur.execute(
-                    'INSERT INTO suggestions(work, suggested, similarity) VALUES(:work, :suggested, :similarity)',
-                    { "work": other_work['id'], "suggested": work['id'], "similarity": similarity },
-                )
-        cache_cur.executemany(
-            'INSERT INTO suggestions(work, suggested, similarity) VALUES(:work, :suggested, :similarity)',
-            [{ "work": work['id'], "suggested": suggested, "similarity": similarity } for (suggested, similarity) in cached_suggestions[work['id']].items_with_scores],
-        )
-        cache_con.commit()
-
-    for (idx, work) in enumerate(works.values()):
         work_dir = site_dir / 'works' / work['id']
         viewer_dir = work_dir / 'view'
         viewer_dir.mkdir(parents=True, exist_ok=True)
 
         with open(work_dir / 'index.html', 'w') as f:
-            f.write(work_template.render(
-                depth=2, work=work, title=work['title'],
-                suggested=[works[suggested] for suggested in cached_suggestions[work['id']]],
-            ))
+            f.write(work_template.render(depth=2, work=work, title=work['title'], suggested=suggested))
 
         with open(viewer_dir / 'index.html', 'w') as f:
             f.write(viewer_template.render(depth=3, work=work, title=work['title']))
         count_progress(idx, len(works), 'works processed')
+    cache_con.commit()
 
     uca = pyuca.Collator().sort_key
     def make_categorization(categorization, query, work_filter, work_style_cards=False):
@@ -1323,7 +1285,7 @@ def generate(args):
         cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
         cat_samples = {}
         for (idx, cat) in enumerate(cats):
-            cat_works = list(filter(work_filter(cat), works.values()))
+            cat_works = list(filter(work_filter(cat), works))
             cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None
 
             safeish_cat = cat.replace('/', ' ')
@@ -1377,7 +1339,7 @@ def generate(args):
 
     debug('writing index page')
     with open(site_dir / 'index.html', 'w') as f:
-        f.write(index_template.render(depth=0, works=list(works.values())))
+        f.write(index_template.render(depth=0, works=works))
     debug('index page written')
 
     debug('closing cache database')
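
Usage sketch (illustrative, not part of the patch): the new top() keeps
score ties through the scan and only trims to n after the shuffle, so the
pick among tied candidates at the cutoff is random rather than first-seen:

    scores = {'a': 3, 'b': 2, 'c': 2, 'd': 2}
    top(list(scores), 2, scores.get)
    # ['a', 'b'] or ['a', 'c'] or ['a', 'd'], chosen uniformly at random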