From 25f44473c4db551fbe554ee0694025e06aeb3e6a Mon Sep 17 00:00:00 2001
From: xenofem
Date: Fri, 16 Feb 2024 16:08:56 -0500
Subject: [PATCH] apply unicode normalization while finding prefix orderings,
 because the world is bad

---
 dlibrary/dlibrary.py | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index ff19e73..e8720c8 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -11,6 +11,7 @@ import readline
 import shutil
 import sqlite3
 import textwrap
+import unicodedata
 from urllib.parse import urlparse
 import zipfile
 
@@ -222,6 +223,12 @@ def link_pdf(src, dest, start_index):
 
     return pdf.page_count
 
+def nfc(s):
+    return unicodedata.normalize('NFC', s)
+
+def nname(entry):
+    return nfc(entry.name)
+
 def complete_prefix_number_ordering(entries):
     if len(entries) == 1:
         return entries
@@ -230,7 +237,7 @@ def complete_prefix_number_ordering(entries):
     for entry in entries:
         version_code = 0
         for (i, version) in enumerate(ALT_VERSIONS):
-            if version in entry.name:
+            if version in nname(entry):
                 version_code |= (1 << i)
         entries_by_version.setdefault(version_code, []).append(entry)
 
@@ -279,23 +286,23 @@ def complete_prefix_number_ordering(entries):
     return result
 
 def unique_hierarchical_prefix_numbering(entries, start_point=0):
-    if len(entries) == 1 and not NUMBER_REGEX.search(entries[0].name):
+    if len(entries) == 1 and not NUMBER_REGEX.search(nname(entries[0])):
         return {None: entries}
 
-    longest_entry = max(entries, key=lambda e: len(e.name))
-    matches = reversed(list(NUMBER_REGEX.finditer(longest_entry.name)))
+    longest_entry = max(entries, key=lambda e: len(nname(e)))
+    matches = reversed(list(NUMBER_REGEX.finditer(nname(longest_entry))))
     for m in matches:
         pos = m.start()
         if pos < start_point:
             return None
-        prefix = longest_entry.name[:pos]
-        if all(e.name.startswith(prefix) or prefix.startswith(e.stem) for e in entries):
+        prefix = nname(longest_entry)[:pos]
+        if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries):
             numbering = {}
             for e in entries:
-                if pos >= len(e.stem):
+                if pos >= len(nfc(e.stem)):
                     i = 0
                 else:
-                    n = NUMBER_REGEX.match(e.name[pos:])
+                    n = NUMBER_REGEX.match(nname(e)[pos:])
                     if n is None:
                         return None
                     i = int(n.group())
@@ -305,8 +312,8 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
         for idx in indices:
             if len(numbering[idx]) > 1:
                 ents_idx = numbering.pop(idx)
-                longest = max(ents_idx, key=lambda e: len(e.name))
-                next_layer_start = pos + NUMBER_REGEX.match(longest.name[pos:]).end()
+                longest = max(ents_idx, key=lambda e: len(nname(e)))
+                next_layer_start = pos + NUMBER_REGEX.match(nname(longest)[pos:]).end()
                 sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
                 if not sub_numbering:
                     return None
@@ -421,9 +428,9 @@ def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, late
     middle_srcs = []
     late_srcs = []
     for src in srcs:
-        if earlier and earlier.search(src.name):
+        if earlier and earlier.search(nname(src)):
             early_srcs.append(src)
-        elif later and later.search(src.name):
+        elif later and later.search(nname(src)):
             late_srcs.append(src)
         else:
             middle_srcs.append(src)
@@ -548,7 +555,7 @@ def collate_from_paths(srcs, dest, start_index, exclude):
         return 0
 
     if len(srcs) == 2 and all(src.is_dir() for src in srcs):
-        hi_res_dirs = [src for src in srcs if HI_RES_REGEX.search(src.name)]
+        hi_res_dirs = [src for src in srcs if HI_RES_REGEX.search(nname(src))]
        if len(hi_res_dirs) == 1:
             hi_res_dir = hi_res_dirs[0]
             lo_res_dir = next(src for src in srcs if src != hi_res_dir)