more flexible splitting out of textless pages
This commit is contained in:
		
							parent
							
								
									aefaf824a8
								
							
						
					
					
						commit
						330b10c85b
					
				
					 1 changed file with 68 additions and 49 deletions
				
			
		|  | @ -25,7 +25,7 @@ FANZA_ID_REGEX = re.compile('^d_[0-9]+$') | ||||||
| FAKKU_ID_REGEX = re.compile('.*_FAKKU$') | FAKKU_ID_REGEX = re.compile('.*_FAKKU$') | ||||||
| 
 | 
 | ||||||
| TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless') | TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless') | ||||||
| ALT_VERSIONS = ['褐色', '日焼け'] | ALT_VERSIONS = ['褐色', '日焼け', 'pink'] | ||||||
| 
 | 
 | ||||||
| IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff'] | IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff'] | ||||||
| 
 | 
 | ||||||
|  | @ -179,12 +179,12 @@ def image_xrefs(pdf): | ||||||
|     print('\nSuccess') |     print('\nSuccess') | ||||||
|     return xrefs |     return xrefs | ||||||
| 
 | 
 | ||||||
| def link_pdf(src, dest, start_index=0): | def link_pdf(src, dest, start_index): | ||||||
|     with fitz.open(src) as pdf: |     with fitz.open(src) as pdf: | ||||||
|         xrefs = image_xrefs(pdf) |         xrefs = image_xrefs(pdf) | ||||||
|         if xrefs is None: |         if xrefs is None: | ||||||
|             print(f'Support for weirder PDFs not yet implemented, skipping {src}') |             print(f'Support for weirder PDFs not yet implemented, skipping {src}') | ||||||
|             return |             return None | ||||||
| 
 | 
 | ||||||
|         dest.mkdir(parents=True, exist_ok=True) |         dest.mkdir(parents=True, exist_ok=True) | ||||||
|         for (idx, xref) in enumerate(xrefs, start=start_index): |         for (idx, xref) in enumerate(xrefs, start=start_index): | ||||||
|  | @ -193,6 +193,8 @@ def link_pdf(src, dest, start_index=0): | ||||||
|             with open(file_path, 'wb') as f: |             with open(file_path, 'wb') as f: | ||||||
|                 f.write(image["image"]) |                 f.write(image["image"]) | ||||||
| 
 | 
 | ||||||
|  |         return pdf.page_count | ||||||
|  | 
 | ||||||
| def complete_prefix_number_ordering(entries): | def complete_prefix_number_ordering(entries): | ||||||
|     if len(entries) == 1: |     if len(entries) == 1: | ||||||
|         return entries |         return entries | ||||||
|  | @ -272,7 +274,7 @@ def unique_hierarchical_prefix_numbering(entries, start_point=0): | ||||||
| 
 | 
 | ||||||
|     return None |     return None | ||||||
| 
 | 
 | ||||||
| def link_ordered_files(ordering, dest, start_index=0): | def link_ordered_files(ordering, dest, start_index): | ||||||
|     dest.mkdir(parents=True, exist_ok=True) |     dest.mkdir(parents=True, exist_ok=True) | ||||||
| 
 | 
 | ||||||
|     for (idx, src_path) in enumerate(ordering, start=start_index): |     for (idx, src_path) in enumerate(ordering, start=start_index): | ||||||
|  | @ -293,6 +295,9 @@ def collate(args): | ||||||
|     extraction_dir = args.destdir / 'extract' |     extraction_dir = args.destdir / 'extract' | ||||||
|     hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints} |     hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints} | ||||||
| 
 | 
 | ||||||
|  |     collation_staging_area = args.destdir / 'site' / 'images-staging' | ||||||
|  |     collation_staging_area.mkdir(parents=True) | ||||||
|  | 
 | ||||||
|     for work_path in extraction_dir.iterdir(): |     for work_path in extraction_dir.iterdir(): | ||||||
|         work_id = work_path.name |         work_id = work_path.name | ||||||
| 
 | 
 | ||||||
|  | @ -304,51 +309,64 @@ def collate(args): | ||||||
|         if virtual == (1,): |         if virtual == (1,): | ||||||
|             continue |             continue | ||||||
| 
 | 
 | ||||||
|         if work_id in hint_map: |         work_staging_dir = collation_staging_area / work_id | ||||||
|             hint = hint_map[work_id] |  | ||||||
|             entries = [hint] if hint.is_file() else ls_ignore(hint) |  | ||||||
|         else: |  | ||||||
|             search_dir = work_path |  | ||||||
|             while True: |  | ||||||
|                 entries = ls_ignore(search_dir) |  | ||||||
|                 if len(entries) == 1 and entries[0].is_dir(): |  | ||||||
|                     search_dir = entries[0] |  | ||||||
|                 else: |  | ||||||
|                     break |  | ||||||
| 
 | 
 | ||||||
|         if len(entries) == 1 and entries[0].suffix.lower() == '.pdf': |         pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0) | ||||||
|             print(f'Extracting images from {entries[0]} for {work_id}') |         if pages_collated: | ||||||
|             link_pdf(entries[0], collation_dir) |             print(f'Collated {pages_collated} pages for {work_id}') | ||||||
|             continue |             work_staging_dir.rename(collation_dir) | ||||||
|  |         else: | ||||||
|  |             if work_staging_dir.is_dir(): | ||||||
|  |                 for f in work_staging_dir.iterdir(): | ||||||
|  |                     f.unlink() | ||||||
|  |                 work_staging_dir.rmdir() | ||||||
| 
 | 
 | ||||||
|         if len(entries) == 0: |             if pages_collated == 0: | ||||||
|                 print(f'{work_id} contains no files? skipping') |                 print(f'{work_id} contains no files? skipping') | ||||||
|             continue |             elif pages_collated is None: | ||||||
| 
 |  | ||||||
|         if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries): |  | ||||||
|             ordering = complete_prefix_number_ordering(entries) |  | ||||||
|             if not ordering: |  | ||||||
|                 with_text = [] |  | ||||||
|                 textless = [] |  | ||||||
|                 for entry in entries: |  | ||||||
|                     if TEXTLESS_REGEX.search(entry.name): |  | ||||||
|                         textless.append(entry) |  | ||||||
|                     else: |  | ||||||
|                         with_text.append(entry) |  | ||||||
|                 if with_text and textless: |  | ||||||
|                     with_text_ordering = complete_prefix_number_ordering(with_text) |  | ||||||
|                     textless_ordering = complete_prefix_number_ordering(textless) |  | ||||||
|                     if with_text_ordering and textless_ordering: |  | ||||||
|                         ordering = with_text_ordering + textless_ordering |  | ||||||
|             if ordering: |  | ||||||
|                 print(f'Symlinking image files for {work_id}') |  | ||||||
|                 link_ordered_files(ordering, collation_dir) |  | ||||||
|                 continue |  | ||||||
| 
 |  | ||||||
|                 print(f'Unable to deduce file structure for {work_id}, skipping') |                 print(f'Unable to deduce file structure for {work_id}, skipping') | ||||||
| 
 | 
 | ||||||
|  |     collation_staging_area.rmdir() | ||||||
|     con.close() |     con.close() | ||||||
| 
 | 
 | ||||||
|  | def collate_from_paths(srcs, dest, start_index): | ||||||
|  |     if len(srcs) == 1 and srcs[0].is_dir(): | ||||||
|  |         return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) | ||||||
|  | 
 | ||||||
|  |     if len(srcs) == 1 and srcs[0].suffix.lower() == '.pdf': | ||||||
|  |         print(f'Extracting images from {srcs[0]}') | ||||||
|  |         return link_pdf(srcs[0], dest, start_index) | ||||||
|  | 
 | ||||||
|  |     if len(srcs) == 0: | ||||||
|  |         return 0 | ||||||
|  | 
 | ||||||
|  |     with_text = [] | ||||||
|  |     textless = [] | ||||||
|  |     for src in srcs: | ||||||
|  |         if TEXTLESS_REGEX.search(src.name): | ||||||
|  |             textless.append(src) | ||||||
|  |         else: | ||||||
|  |             with_text.append(src) | ||||||
|  |     if with_text and textless: | ||||||
|  |         text_pages = collate_from_paths(with_text, dest, start_index) | ||||||
|  |         if text_pages is None: | ||||||
|  |             return None | ||||||
|  |         textless_pages = collate_from_paths(textless, dest, start_index+text_pages) | ||||||
|  |         if textless_pages is None: | ||||||
|  |             return None | ||||||
|  |         return text_pages + textless_pages | ||||||
|  | 
 | ||||||
|  |     if all(src.is_file() and src.suffix.lower() in IMAGE_FILE_EXTENSIONS for src in srcs): | ||||||
|  |         ordering = complete_prefix_number_ordering(srcs) | ||||||
|  |         if ordering: | ||||||
|  |             print(f'Symlinking image files: {ordering[0]}...') | ||||||
|  |             link_ordered_files(ordering, dest, start_index) | ||||||
|  |             return len(ordering) | ||||||
|  |         else: | ||||||
|  |             return None | ||||||
|  | 
 | ||||||
|  |     return None | ||||||
|  | 
 | ||||||
| def self_and_parents(path): | def self_and_parents(path): | ||||||
|     return [path] + list(path.parents) |     return [path] + list(path.parents) | ||||||
| 
 | 
 | ||||||
|  | @ -375,15 +393,16 @@ def manual_collate(args): | ||||||
|             if ordering is None: |             if ordering is None: | ||||||
|                 ordering = entries |                 ordering = entries | ||||||
|                 ordering.sort() |                 ordering.sort() | ||||||
|             link_ordered_files(ordering, collation_dir, start_index=index) |             link_ordered_files(ordering, collation_dir, index) | ||||||
|             index += len(ordering) |             index += len(ordering) | ||||||
|         elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS: |         elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS: | ||||||
|             link_ordered_files([path], collation_dir, start_index=index) |             link_ordered_files([path], collation_dir, index) | ||||||
|             index += 1 |             index += 1 | ||||||
|         elif path.suffix.lower() == ".pdf": |         elif path.suffix.lower() == ".pdf": | ||||||
|             link_pdf(path, collation_dir, start_index=index) |             pdf_page_count = link_pdf(path, collation_dir, index) | ||||||
|             with fitz.open(path) as pdf: |             if pdf_page_count is None: | ||||||
|                 index += pdf.page_count |                 return | ||||||
|  |             index += pdf_page_count | ||||||
|         else: |         else: | ||||||
|             print(f'Unknown file type {path}, stopping') |             print(f'Unknown file type {path}, stopping') | ||||||
|             return |             return | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue