From 02671ca2a972fbe23b9252e0377c7be9c3489ce1 Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Tue, 23 Apr 2024 22:12:38 -0400
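Subject: [PATCH] more aggressive unicode normalization

Normalize entry names with NFKC instead of NFC, so that compatibility
variants (e.g. fullwidth letters and digits) compare equal to their
plain forms. Before normalizing, map U+301C WAVE DASH to U+FF5E
FULLWIDTH TILDE, since NFKC alone does not unify those two characters.

Replace nfc() with normalize_string() and add an nstem() helper
alongside nname(). When every source is a directory and all of them
share the same normalized name, collate from their combined contents
instead of treating them as distinct sources.
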
---
 dlibrary/dlibrary.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 45cb376..ebeb216 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -32,6 +32,8 @@ import requests
 NUMBER_REGEX = re.compile('[0-9０-９]+')
 ALPHABETIC_NUMBERING_REGEX = re.compile('^(?P<prefix>[^a-zａ-ｚ0-9０-９]*)((?P<letter>[a-zａ-ｚ])(?P<suffix>[^a-zａ-ｚ0-9０-９]*))?$', re.I)
 
+EXTRA_NORMALIZATION_TABLE = str.maketrans({'\u301c': '\uff5e'})
+
 DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$')
 FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
 FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
@@ -480,6 +482,10 @@ class Collator:
         if len(srcs) == 0:
             return True
 
+        if all(src.is_dir() and nname(src) == nname(srcs[0]) for src in srcs):
+            debug(f'Merging unicode-fucked directories for {srcs[0]}')
+            return self.collate_from_paths([item for src in srcs for item in ls_ignore(src, self.exclude)])
+
         debug(f'Auto-collating {srcs}')
 
         select_language = self.try_collate_select_language(srcs)
@@ -796,11 +802,14 @@ def pdf_image_extractors(pdf, strategy):
 
     return image_extractors
 
-def nfc(s):
-    return unicodedata.normalize('NFC', s)
+def normalize_string(s):
+    return unicodedata.normalize('NFKC', s.translate(EXTRA_NORMALIZATION_TABLE))
 
 def nname(entry):
-    return nfc(entry.name)
+    return normalize_string(entry.name)
+
+def nstem(entry):
+    return normalize_string(entry.stem)
 
 def complete_prefix_number_ordering(entries):
     if len(entries) == 1:
@@ -872,10 +881,10 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
             break
         prefix = nname(longest_entry)[:pos]
         debug(f'Checking prefix {prefix}')
-        if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries):
+        if all(nname(e).startswith(prefix) or prefix.startswith(nstem(e)) for e in entries):
             numbering = {}
             for e in entries:
-                if pos >= len(nfc(e.stem)):
+                if pos >= len(nstem(e)):
                     i = 0
                 else:
                     n = NUMBER_REGEX.match(nname(e)[pos:])
@@ -909,7 +918,7 @@ def alphabetic_numbering(entries, start_point):
     alphabetized = {}
     prefix_suffix = None
     for entry in entries:
-        ending = nfc(entry.stem)[start_point:]
+        ending = nstem(entry)[start_point:]
         debug(f'{entry} has ending {ending}')
 
         ending_match = ALPHABETIC_NUMBERING_REGEX.fullmatch(ending)