more aggressive unicode normalization
This commit is contained in:
parent
d3dd29cf81
commit
02671ca2a9
|
@ -32,6 +32,8 @@ import requests
|
||||||
# Module-level constants, compiled once at import time.

# Runs of decimal digits used to extract numbering from file names.
# NOTE(review): the doubled "0-9" range looks like fullwidth digits (０-９)
# flattened to ASCII by text extraction — confirm against the repository;
# as written the second range is redundant but harmless.
NUMBER_REGEX = re.compile('[0-90-9]+')

# An "alphabetic numbering" ending: optional non-alphanumeric prefix, then
# optionally a single letter followed by a non-alphanumeric suffix.
# Case-insensitive.  NOTE(review): same suspected fullwidth-range mangling
# as NUMBER_REGEX — confirm.
ALPHABETIC_NUMBERING_REGEX = re.compile('^(?P<prefix>[^a-za-z0-90-9]*)((?P<letter>[a-za-z])(?P<suffix>[^a-za-z0-90-9]*))?$', re.I)

# Extra pre-normalization applied before NFKC: map WAVE DASH (U+301C) to
# FULLWIDTH TILDE (U+FF5E), which NFKC then folds to ASCII '~' — so both
# tilde-like characters normalize identically.
EXTRA_NORMALIZATION_TABLE = str.maketrans({'\u301c': '\uff5e'})

# Store-specific work-identifier formats.
DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$')
FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
|
||||||
|
@ -480,6 +482,10 @@ class Collator:
|
||||||
if len(srcs) == 0:
|
if len(srcs) == 0:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
if all(src.is_dir() and nname(src) == nname(srcs[0]) for src in srcs):
|
||||||
|
debug(f'Merging unicode-fucked directories for {srcs[0]}')
|
||||||
|
return self.collate_from_paths([item for src in srcs for item in ls_ignore(src, self.exclude)])
|
||||||
|
|
||||||
debug(f'Auto-collating {srcs}')
|
debug(f'Auto-collating {srcs}')
|
||||||
|
|
||||||
select_language = self.try_collate_select_language(srcs)
|
select_language = self.try_collate_select_language(srcs)
|
||||||
|
@ -796,11 +802,14 @@ def pdf_image_extractors(pdf, strategy):
|
||||||
|
|
||||||
return image_extractors
|
return image_extractors
|
||||||
|
|
||||||
def normalize_string(s):
    """Aggressively normalize *s* for comparison purposes.

    First applies the single-character replacements in
    EXTRA_NORMALIZATION_TABLE (wave dash -> fullwidth tilde), then Unicode
    NFKC normalization, which folds compatibility forms (fullwidth digits
    and letters, the fullwidth tilde, etc.) to their canonical equivalents.

    The translate step runs *before* NFKC on purpose: U+301C is untouched
    by NFKC, so it must be mapped to U+FF5E first for NFKC to fold it.
    """
    return unicodedata.normalize('NFKC', s.translate(EXTRA_NORMALIZATION_TABLE))
|
||||||
|
|
||||||
def nname(entry):
    """Return the normalized file name of a directory entry.

    *entry* is any object with a ``name`` attribute (e.g. pathlib.Path or
    os.DirEntry); the name is passed through normalize_string so that
    Unicode-variant spellings of the same name compare equal.
    """
    return normalize_string(entry.name)
|
||||||
|
|
||||||
|
def nstem(entry):
    """Return the normalized stem (file name without suffix) of an entry.

    Companion to nname(); *entry* must expose a ``stem`` attribute
    (e.g. pathlib.Path).  Centralizes the normalization so callers never
    mix normalized and un-normalized stems.
    """
    return normalize_string(entry.stem)
|
||||||
|
|
||||||
def complete_prefix_number_ordering(entries):
|
def complete_prefix_number_ordering(entries):
|
||||||
if len(entries) == 1:
|
if len(entries) == 1:
|
||||||
|
@ -872,10 +881,10 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
|
||||||
break
|
break
|
||||||
prefix = nname(longest_entry)[:pos]
|
prefix = nname(longest_entry)[:pos]
|
||||||
debug(f'Checking prefix {prefix}')
|
debug(f'Checking prefix {prefix}')
|
||||||
if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries):
|
if all(nname(e).startswith(prefix) or prefix.startswith(nstem(e)) for e in entries):
|
||||||
numbering = {}
|
numbering = {}
|
||||||
for e in entries:
|
for e in entries:
|
||||||
if pos >= len(nfc(e.stem)):
|
if pos >= len(nstem(e)):
|
||||||
i = 0
|
i = 0
|
||||||
else:
|
else:
|
||||||
n = NUMBER_REGEX.match(nname(e)[pos:])
|
n = NUMBER_REGEX.match(nname(e)[pos:])
|
||||||
|
@ -909,7 +918,7 @@ def alphabetic_numbering(entries, start_point):
|
||||||
alphabetized = {}
|
alphabetized = {}
|
||||||
prefix_suffix = None
|
prefix_suffix = None
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
ending = nfc(entry.stem)[start_point:]
|
ending = nstem(entry)[start_point:]
|
||||||
debug(f'{entry} has ending {ending}')
|
debug(f'{entry} has ending {ending}')
|
||||||
|
|
||||||
ending_match = ALPHABETIC_NUMBERING_REGEX.fullmatch(ending)
|
ending_match = ALPHABETIC_NUMBERING_REGEX.fullmatch(ending)
|
||||||
|
|
Loading…
Reference in a new issue