From 02671ca2a972fbe23b9252e0377c7be9c3489ce1 Mon Sep 17 00:00:00 2001 From: xenofem Date: Tue, 23 Apr 2024 22:12:38 -0400 Subject: [PATCH] more aggressive unicode normalization --- dlibrary/dlibrary.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 45cb376..ebeb216 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -32,6 +32,8 @@ import requests NUMBER_REGEX = re.compile('[0-90-9]+') ALPHABETIC_NUMBERING_REGEX = re.compile('^(?P<prefix>[^a-za-z0-90-9]*)((?P<letter>[a-za-z])(?P<suffix>[^a-za-z0-90-9]*))?$', re.I) +EXTRA_NORMALIZATION_TABLE = str.maketrans({'\u301c': '\uff5e'}) + DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$') FANZA_ID_REGEX = re.compile('^d_[0-9]+$') FAKKU_ID_REGEX = re.compile('.*_FAKKU$') @@ -480,6 +482,10 @@ class Collator: if len(srcs) == 0: return True + if all(src.is_dir() and nname(src) == nname(srcs[0]) for src in srcs): + debug(f'Merging unicode-fucked directories for {srcs[0]}') + return self.collate_from_paths([item for src in srcs for item in ls_ignore(src, self.exclude)]) + debug(f'Auto-collating {srcs}') select_language = self.try_collate_select_language(srcs) @@ -796,11 +802,14 @@ def pdf_image_extractors(pdf, strategy): return image_extractors -def nfc(s): - return unicodedata.normalize('NFC', s) +def normalize_string(s): + return unicodedata.normalize('NFKC', s.translate(EXTRA_NORMALIZATION_TABLE)) def nname(entry): - return nfc(entry.name) + return normalize_string(entry.name) + +def nstem(entry): + return normalize_string(entry.stem) def complete_prefix_number_ordering(entries): if len(entries) == 1: @@ -872,10 +881,10 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0): break prefix = nname(longest_entry)[:pos] debug(f'Checking prefix {prefix}') - if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries): + if all(nname(e).startswith(prefix) or prefix.startswith(nstem(e)) for e in entries): numbering = {} 
for e in entries: - if pos >= len(nfc(e.stem)): + if pos >= len(nstem(e)): i = 0 else: n = NUMBER_REGEX.match(nname(e)[pos:]) @@ -909,7 +918,7 @@ def alphabetic_numbering(entries, start_point): alphabetized = {} prefix_suffix = None for entry in entries: - ending = nfc(entry.stem)[start_point:] + ending = nstem(entry)[start_point:] debug(f'{entry} has ending {ending}') ending_match = ALPHABETIC_NUMBERING_REGEX.fullmatch(ending)