apply unicode normalization while finding prefix orderings, because the world is bad

parent 34c0435d05
commit 25f44473c4
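Why: the same rendered filename can reach us as different code-point sequences. macOS filesystems, for instance, hand back decomposed (NFD) names, while zip entries and user input are usually composed (NFC), so raw prefix and substring checks fail on names that look identical. A minimal stdlib sketch of the mismatch this commit guards against (the filename is made up):

    import unicodedata

    nfc_name = "ポスター.png"                          # composed (NFC)
    nfd_name = unicodedata.normalize("NFD", nfc_name)  # decomposed, as e.g. macOS stores it

    print(nfc_name == nfd_name)        # False: different code points
    print(nfd_name.startswith("ポ"))   # False: the prefix check breaks too
    print(unicodedata.normalize("NFC", nfd_name) == nfc_name)  # True once normalized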
@@ -11,6 +11,7 @@ import readline
 import shutil
 import sqlite3
 import textwrap
+import unicodedata
 from urllib.parse import urlparse
 import zipfile
 
@@ -222,6 +223,12 @@ def link_pdf(src, dest, start_index):
 
     return pdf.page_count
 
+def nfc(s):
+    return unicodedata.normalize('NFC', s)
+
+def nname(entry):
+    return nfc(entry.name)
+
 def complete_prefix_number_ordering(entries):
     if len(entries) == 1:
         return entries
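The two helpers above are deliberately tiny: nfc() normalizes any string, nname() normalizes an entry's filename, and every name comparison in the hunks below is routed through them. A quick sketch of the helpers in use (the file is hypothetical):

    import unicodedata
    from pathlib import Path

    def nfc(s):
        return unicodedata.normalize('NFC', s)

    def nname(entry):
        return nfc(entry.name)

    # hypothetical file whose name came off disk in decomposed (NFD) form
    entry = Path(unicodedata.normalize("NFD", "ポスター.png"))

    print(entry.name.startswith("ポ"))    # False: raw NFD name
    print(nname(entry).startswith("ポ"))  # True: normalized via the helper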
@@ -230,7 +237,7 @@ def complete_prefix_number_ordering(entries):
     for entry in entries:
         version_code = 0
         for (i, version) in enumerate(ALT_VERSIONS):
-            if version in entry.name:
+            if version in nname(entry):
                 version_code |= (1 << i)
         entries_by_version.setdefault(version_code, []).append(entry)
 
@@ -279,23 +286,23 @@ def complete_prefix_number_ordering(entries):
     return result
 
 def unique_hierarchical_prefix_numbering(entries, start_point=0):
-    if len(entries) == 1 and not NUMBER_REGEX.search(entries[0].name):
+    if len(entries) == 1 and not NUMBER_REGEX.search(nname(entries[0])):
         return {None: entries}
 
-    longest_entry = max(entries, key=lambda e: len(e.name))
-    matches = reversed(list(NUMBER_REGEX.finditer(longest_entry.name)))
+    longest_entry = max(entries, key=lambda e: len(nname(e)))
+    matches = reversed(list(NUMBER_REGEX.finditer(nname(longest_entry))))
     for m in matches:
         pos = m.start()
         if pos < start_point:
             return None
-        prefix = longest_entry.name[:pos]
-        if all(e.name.startswith(prefix) or prefix.startswith(e.stem) for e in entries):
+        prefix = nname(longest_entry)[:pos]
+        if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries):
             numbering = {}
             for e in entries:
-                if pos >= len(e.stem):
+                if pos >= len(nfc(e.stem)):
                     i = 0
                 else:
-                    n = NUMBER_REGEX.match(e.name[pos:])
+                    n = NUMBER_REGEX.match(nname(e)[pos:])
                     if n is None:
                         return None
                     i = int(n.group())
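One subtlety in the hunk above: pos is an index into the normalized longest name, so lengths and slices must also be taken on normalized strings, which is why len(e.stem) becomes len(nfc(e.stem)). NFC and NFD strings that render identically can differ in length, e.g.:

    import unicodedata

    s = "ポスター"
    print(len(s))                                # 4 (composed)
    print(len(unicodedata.normalize("NFD", s)))  # 5: 'ポ' splits into 'ホ' + U+309A

Mixing raw lengths with normalized indices here would silently misparse the number suffix.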
@@ -305,8 +312,8 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
         for idx in indices:
             if len(numbering[idx]) > 1:
                 ents_idx = numbering.pop(idx)
-                longest = max(ents_idx, key=lambda e: len(e.name))
-                next_layer_start = pos + NUMBER_REGEX.match(longest.name[pos:]).end()
+                longest = max(ents_idx, key=lambda e: len(nname(e)))
+                next_layer_start = pos + NUMBER_REGEX.match(nname(longest)[pos:]).end()
                 sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
                 if not sub_numbering:
                     return None
@@ -421,9 +428,9 @@ def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, late
     middle_srcs = []
     late_srcs = []
     for src in srcs:
-        if earlier and earlier.search(src.name):
+        if earlier and earlier.search(nname(src)):
             early_srcs.append(src)
-        elif later and later.search(src.name):
+        elif later and later.search(nname(src)):
             late_srcs.append(src)
         else:
             middle_srcs.append(src)
@@ -548,7 +555,7 @@ def collate_from_paths(srcs, dest, start_index, exclude):
         return 0
 
     if len(srcs) == 2 and all(src.is_dir() for src in srcs):
-        hi_res_dirs = [src for src in srcs if HI_RES_REGEX.search(src.name)]
+        hi_res_dirs = [src for src in srcs if HI_RES_REGEX.search(nname(src))]
         if len(hi_res_dirs) == 1:
             hi_res_dir = hi_res_dirs[0]
             lo_res_dir = next(src for src in srcs if src != hi_res_dir)
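The same reasoning covers the regex searches in the last two hunks: a pattern containing composed characters can never match a decomposed name. A sketch, with a made-up pattern standing in for HI_RES_REGEX:

    import re
    import unicodedata

    PATTERN = re.compile("デジタル")  # hypothetical pattern; composed characters
    raw = unicodedata.normalize("NFD", "デジタル版")

    print(PATTERN.search(raw))                                # None: 'デ' is 'テ' + U+3099
    print(PATTERN.search(unicodedata.normalize("NFC", raw)))  # <re.Match ...>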