apply unicode normalization while finding prefix orderings, because the world is bad

This commit is contained in:
xenofem 2024-02-16 16:08:56 -05:00
parent 34c0435d05
commit 25f44473c4

View file

@ -11,6 +11,7 @@ import readline
import shutil import shutil
import sqlite3 import sqlite3
import textwrap import textwrap
import unicodedata
from urllib.parse import urlparse from urllib.parse import urlparse
import zipfile import zipfile
@ -222,6 +223,12 @@ def link_pdf(src, dest, start_index):
return pdf.page_count return pdf.page_count
def nfc(s):
    """Return the string *s* converted to Unicode Normalization Form C.

    Canonically equivalent spellings of the same text (for example a
    precomposed accented letter versus a base letter followed by a
    combining mark) come out identical, so later substring, prefix and
    regex comparisons on the result are not thrown off by encoding choice.
    """
    composed = unicodedata.normalize('NFC', s)
    return composed
def nname(entry):
    """Return ``entry.name`` passed through :func:`nfc`.

    *entry* may be any object exposing a ``name`` attribute.
    """
    raw_name = entry.name
    return nfc(raw_name)
def complete_prefix_number_ordering(entries): def complete_prefix_number_ordering(entries):
if len(entries) == 1: if len(entries) == 1:
return entries return entries
@ -230,7 +237,7 @@ def complete_prefix_number_ordering(entries):
for entry in entries: for entry in entries:
version_code = 0 version_code = 0
for (i, version) in enumerate(ALT_VERSIONS): for (i, version) in enumerate(ALT_VERSIONS):
if version in entry.name: if version in nname(entry):
version_code |= (1 << i) version_code |= (1 << i)
entries_by_version.setdefault(version_code, []).append(entry) entries_by_version.setdefault(version_code, []).append(entry)
@ -279,23 +286,23 @@ def complete_prefix_number_ordering(entries):
return result return result
def unique_hierarchical_prefix_numbering(entries, start_point=0): def unique_hierarchical_prefix_numbering(entries, start_point=0):
if len(entries) == 1 and not NUMBER_REGEX.search(entries[0].name): if len(entries) == 1 and not NUMBER_REGEX.search(nname(entries[0])):
return {None: entries} return {None: entries}
longest_entry = max(entries, key=lambda e: len(e.name)) longest_entry = max(entries, key=lambda e: len(nname(e)))
matches = reversed(list(NUMBER_REGEX.finditer(longest_entry.name))) matches = reversed(list(NUMBER_REGEX.finditer(nname(longest_entry))))
for m in matches: for m in matches:
pos = m.start() pos = m.start()
if pos < start_point: if pos < start_point:
return None return None
prefix = longest_entry.name[:pos] prefix = nname(longest_entry)[:pos]
if all(e.name.startswith(prefix) or prefix.startswith(e.stem) for e in entries): if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries):
numbering = {} numbering = {}
for e in entries: for e in entries:
if pos >= len(e.stem): if pos >= len(nfc(e.stem)):
i = 0 i = 0
else: else:
n = NUMBER_REGEX.match(e.name[pos:]) n = NUMBER_REGEX.match(nname(e)[pos:])
if n is None: if n is None:
return None return None
i = int(n.group()) i = int(n.group())
@ -305,8 +312,8 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
for idx in indices: for idx in indices:
if len(numbering[idx]) > 1: if len(numbering[idx]) > 1:
ents_idx = numbering.pop(idx) ents_idx = numbering.pop(idx)
longest = max(ents_idx, key=lambda e: len(e.name)) longest = max(ents_idx, key=lambda e: len(nname(e)))
next_layer_start = pos + NUMBER_REGEX.match(longest.name[pos:]).end() next_layer_start = pos + NUMBER_REGEX.match(nname(longest)[pos:]).end()
sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start) sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
if not sub_numbering: if not sub_numbering:
return None return None
@ -421,9 +428,9 @@ def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, late
middle_srcs = [] middle_srcs = []
late_srcs = [] late_srcs = []
for src in srcs: for src in srcs:
if earlier and earlier.search(src.name): if earlier and earlier.search(nname(src)):
early_srcs.append(src) early_srcs.append(src)
elif later and later.search(src.name): elif later and later.search(nname(src)):
late_srcs.append(src) late_srcs.append(src)
else: else:
middle_srcs.append(src) middle_srcs.append(src)
@ -548,7 +555,7 @@ def collate_from_paths(srcs, dest, start_index, exclude):
return 0 return 0
if len(srcs) == 2 and all(src.is_dir() for src in srcs): if len(srcs) == 2 and all(src.is_dir() for src in srcs):
hi_res_dirs = [src for src in srcs if HI_RES_REGEX.search(src.name)] hi_res_dirs = [src for src in srcs if HI_RES_REGEX.search(nname(src))]
if len(hi_res_dirs) == 1: if len(hi_res_dirs) == 1:
hi_res_dir = hi_res_dirs[0] hi_res_dir = hi_res_dirs[0]
lo_res_dir = next(src for src in srcs if src != hi_res_dir) lo_res_dir = next(src for src in srcs if src != hi_res_dir)