more aggressive unicode normalization
This commit is contained in:
		
							parent
							
								
									d3dd29cf81
								
							
						
					
					
						commit
						02671ca2a9
					
				
					 1 changed files with 15 additions and 6 deletions
				
			
		|  | @ -32,6 +32,8 @@ import requests | ||||||
| NUMBER_REGEX = re.compile('[0-90-9]+') | NUMBER_REGEX = re.compile('[0-90-9]+') | ||||||
| ALPHABETIC_NUMBERING_REGEX = re.compile('^(?P<prefix>[^a-za-z0-90-9]*)((?P<letter>[a-za-z])(?P<suffix>[^a-za-z0-90-9]*))?$', re.I) | ALPHABETIC_NUMBERING_REGEX = re.compile('^(?P<prefix>[^a-za-z0-90-9]*)((?P<letter>[a-za-z])(?P<suffix>[^a-za-z0-90-9]*))?$', re.I) | ||||||
| 
 | 
 | ||||||
|  | EXTRA_NORMALIZATION_TABLE = str.maketrans({'\u301c': '\uff5e'}) | ||||||
|  | 
 | ||||||
| DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$') | DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$') | ||||||
| FANZA_ID_REGEX = re.compile('^d_[0-9]+$') | FANZA_ID_REGEX = re.compile('^d_[0-9]+$') | ||||||
| FAKKU_ID_REGEX = re.compile('.*_FAKKU$') | FAKKU_ID_REGEX = re.compile('.*_FAKKU$') | ||||||
|  | @ -480,6 +482,10 @@ class Collator: | ||||||
|         if len(srcs) == 0: |         if len(srcs) == 0: | ||||||
|             return True |             return True | ||||||
| 
 | 
 | ||||||
|  |         if all(src.is_dir() and nname(src) == nname(srcs[0]) for src in srcs): | ||||||
|  |             debug(f'Merging unicode-fucked directories for {srcs[0]}') | ||||||
|  |             return self.collate_from_paths([item for src in srcs for item in ls_ignore(src, self.exclude)]) | ||||||
|  | 
 | ||||||
|         debug(f'Auto-collating {srcs}') |         debug(f'Auto-collating {srcs}') | ||||||
| 
 | 
 | ||||||
|         select_language = self.try_collate_select_language(srcs) |         select_language = self.try_collate_select_language(srcs) | ||||||
|  | @ -796,11 +802,14 @@ def pdf_image_extractors(pdf, strategy): | ||||||
| 
 | 
 | ||||||
|     return image_extractors |     return image_extractors | ||||||
| 
 | 
 | ||||||
def normalize_string(s):
    """Return *s* after project-specific character substitution and NFKC normalization.

    First maps characters via EXTRA_NORMALIZATION_TABLE (e.g. WAVE DASH U+301C
    -> FULLWIDTH TILDE U+FF5E), then applies Unicode NFKC so compatibility
    variants compare equal.
    """
    substituted = s.translate(EXTRA_NORMALIZATION_TABLE)
    return unicodedata.normalize('NFKC', substituted)
| 
 | 
 | ||||||
def nname(entry):
    """Return the Unicode-normalized file name of *entry* (a path-like object)."""
    raw_name = entry.name
    return normalize_string(raw_name)
|  | 
 | ||||||
def nstem(entry):
    """Return the Unicode-normalized stem (name without suffix) of *entry*."""
    raw_stem = entry.stem
    return normalize_string(raw_stem)
| 
 | 
 | ||||||
| def complete_prefix_number_ordering(entries): | def complete_prefix_number_ordering(entries): | ||||||
|     if len(entries) == 1: |     if len(entries) == 1: | ||||||
|  | @ -872,10 +881,10 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0): | ||||||
|             break |             break | ||||||
|         prefix = nname(longest_entry)[:pos] |         prefix = nname(longest_entry)[:pos] | ||||||
|         debug(f'Checking prefix {prefix}') |         debug(f'Checking prefix {prefix}') | ||||||
|         if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries): |         if all(nname(e).startswith(prefix) or prefix.startswith(nstem(e)) for e in entries): | ||||||
|             numbering = {} |             numbering = {} | ||||||
|             for e in entries: |             for e in entries: | ||||||
|                 if pos >= len(nfc(e.stem)): |                 if pos >= len(nstem(e)): | ||||||
|                     i = 0 |                     i = 0 | ||||||
|                 else: |                 else: | ||||||
|                     n = NUMBER_REGEX.match(nname(e)[pos:]) |                     n = NUMBER_REGEX.match(nname(e)[pos:]) | ||||||
|  | @ -909,7 +918,7 @@ def alphabetic_numbering(entries, start_point): | ||||||
|     alphabetized = {} |     alphabetized = {} | ||||||
|     prefix_suffix = None |     prefix_suffix = None | ||||||
|     for entry in entries: |     for entry in entries: | ||||||
|         ending = nfc(entry.stem)[start_point:] |         ending = nstem(entry)[start_point:] | ||||||
|         debug(f'{entry} has ending {ending}') |         debug(f'{entry} has ending {ending}') | ||||||
| 
 | 
 | ||||||
|         ending_match = ALPHABETIC_NUMBERING_REGEX.fullmatch(ending) |         ending_match = ALPHABETIC_NUMBERING_REGEX.fullmatch(ending) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue