Compare commits

02671ca2a972fbe23b9252e0377c7be9c3489ce1..850b9db6d6363add03beab91c48d9e027a7a3652

No commits in common. "02671ca2a972fbe23b9252e0377c7be9c3489ce1" and "850b9db6d6363add03beab91c48d9e027a7a3652" have entirely different histories.

View file

@@ -32,8 +32,6 @@ import requests
NUMBER_REGEX = re.compile('[0-9-]+') NUMBER_REGEX = re.compile('[0-9-]+')
ALPHABETIC_NUMBERING_REGEX = re.compile('^(?P<prefix>[^a-z-0-9-]*)((?P<letter>[a-z-])(?P<suffix>[^a-z-0-9-]*))?$', re.I) ALPHABETIC_NUMBERING_REGEX = re.compile('^(?P<prefix>[^a-z-0-9-]*)((?P<letter>[a-z-])(?P<suffix>[^a-z-0-9-]*))?$', re.I)
EXTRA_NORMALIZATION_TABLE = str.maketrans({'\u301c': '\uff5e'})
DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$') DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$')
FANZA_ID_REGEX = re.compile('^d_[0-9]+$') FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
FAKKU_ID_REGEX = re.compile('.*_FAKKU$') FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
@@ -482,10 +480,6 @@ class Collator:
if len(srcs) == 0: if len(srcs) == 0:
return True return True
if all(src.is_dir() and nname(src) == nname(srcs[0]) for src in srcs):
debug(f'Merging unicode-fucked directories for {srcs[0]}')
return self.collate_from_paths([item for src in srcs for item in ls_ignore(src, self.exclude)])
debug(f'Auto-collating {srcs}') debug(f'Auto-collating {srcs}')
select_language = self.try_collate_select_language(srcs) select_language = self.try_collate_select_language(srcs)
@@ -802,14 +796,11 @@ def pdf_image_extractors(pdf, strategy):
return image_extractors return image_extractors
def normalize_string(s): def nfc(s):
return unicodedata.normalize('NFKC', s.translate(EXTRA_NORMALIZATION_TABLE)) return unicodedata.normalize('NFC', s)
def nname(entry): def nname(entry):
return normalize_string(entry.name) return nfc(entry.name)
def nstem(entry):
return normalize_string(entry.stem)
def complete_prefix_number_ordering(entries): def complete_prefix_number_ordering(entries):
if len(entries) == 1: if len(entries) == 1:
@@ -878,13 +869,13 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0):
for m in matches: for m in matches:
pos = m.start() pos = m.start()
if pos < start_point: if pos < start_point:
break return None
prefix = nname(longest_entry)[:pos] prefix = nname(longest_entry)[:pos]
debug(f'Checking prefix {prefix}') debug(f'Checking prefix {prefix}')
if all(nname(e).startswith(prefix) or prefix.startswith(nstem(e)) for e in entries): if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries):
numbering = {} numbering = {}
for e in entries: for e in entries:
if pos >= len(nstem(e)): if pos >= len(nfc(e.stem)):
i = 0 i = 0
else: else:
n = NUMBER_REGEX.match(nname(e)[pos:]) n = NUMBER_REGEX.match(nname(e)[pos:])
@@ -918,7 +909,7 @@ def alphabetic_numbering(entries, start_point):
alphabetized = {} alphabetized = {}
prefix_suffix = None prefix_suffix = None
for entry in entries: for entry in entries:
ending = nstem(entry)[start_point:] ending = nfc(entry.stem)[start_point:]
debug(f'{entry} has ending {ending}') debug(f'{entry} has ending {ending}')
ending_match = ALPHABETIC_NUMBERING_REGEX.fullmatch(ending) ending_match = ALPHABETIC_NUMBERING_REGEX.fullmatch(ending)