Fancier options for PDF strategy: replace --force-convert-pdf with -p/--pdf-strategy (ask, show-ask, convert, extract, drop, nope)
This commit is contained in:
		
							parent
							
								
									35f0c1fffe
								
							
						
					
					
						commit
						18fbc7f8dc
					
				
					 1 changed file with 36 additions and 23 deletions
				
			
		|  | @ -382,7 +382,7 @@ class Collator: | |||
| 
 | ||||
|     def link_pdf(self, src): | ||||
|         with fitz.open(src) as pdf: | ||||
|             images = pdf_images(pdf, self.args.force_convert_pdf) | ||||
|             images = pdf_images(pdf, self.args.pdf_strategy) | ||||
|             if images is None: | ||||
|                 print(f'Failed to enumerate page images in PDF, skipping {src}') | ||||
|                 return None | ||||
|  | @ -569,7 +569,7 @@ def display_sixel_page(page): | |||
|     finally: | ||||
|         sixel_output_unref(output) | ||||
| 
 | ||||
| def pdf_images(pdf, force=False): | ||||
| def pdf_images(pdf, strategy): | ||||
|     images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] | ||||
|     if all(len(images) == 1 and single for (images, single) in images_by_page): | ||||
|         return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page) | ||||
|  | @ -587,24 +587,29 @@ def pdf_images(pdf, force=False): | |||
|             image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) | ||||
|         else: | ||||
|             print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects') | ||||
|             if force: | ||||
|                 print(f'Converting page {idx+1}') | ||||
|                 choice = 'c' | ||||
|             else: | ||||
|                 shown = False | ||||
|                 while True: | ||||
|                     choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ') | ||||
|                     if not shown and choice != '' and choice[0].lower() == 's': | ||||
|                         display_sixel_page(page) | ||||
|                         shown = True | ||||
|                     else: | ||||
|                         break | ||||
|             if xref is not None and choice != '' and choice[0].lower() == 'x': | ||||
|                 image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) | ||||
|             elif choice != '' and choice[0].lower() == 'c': | ||||
|                 image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') }) | ||||
|             else: | ||||
|                 return None | ||||
|             choice = strategy | ||||
|             while True: | ||||
|                 if choice.lower().startswith('n'): | ||||
|                     return None | ||||
|                 if choice.lower().startswith('c'): | ||||
|                     if choice == strategy: | ||||
|                         print(f'Converting page {idx+1}') | ||||
|                     image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') }) | ||||
|                     break | ||||
|                 if xref is not None and (choice.lower().startswith('x') or choice.lower() == 'extract'): | ||||
|                     if choice == strategy: | ||||
|                         print(f'Extracting image from page {idx+1} without text') | ||||
|                     image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) | ||||
|                     break | ||||
|                 if choice.lower().startswith('d'): | ||||
|                     if choice == strategy: | ||||
|                         print(f'Dropping page {idx+1}') | ||||
|                     break | ||||
| 
 | ||||
|                 if choice.lower().startswith('s'): | ||||
|                     display_sixel_page(page) | ||||
| 
 | ||||
|                 choice = input(f'[N]ope out / [c]onvert page{"" if xref is None else " / e[x]tract image"} / [d]rop page / [s]how page? [n/c{"" if xref is None else "/x"}/d/s] ') | ||||
|         print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n')) | ||||
| 
 | ||||
|     return (extractor() for extractor in image_extractors) | ||||
|  | @ -1182,9 +1187,17 @@ parser_collate.add_argument( | |||
|     help="only collate works that are explicitly specified", | ||||
| ) | ||||
| parser_collate.add_argument( | ||||
|     '--force-convert-pdf', | ||||
|     action='store_true', | ||||
|     help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly", | ||||
|     '-p', '--pdf-strategy', | ||||
|     choices=[ | ||||
|         'ask', '?', | ||||
|         'show-ask', 's', | ||||
|         'convert', 'c', | ||||
|         'extract', 'x', | ||||
|         'drop', 'd', | ||||
|         'nope', 'n' | ||||
|     ], | ||||
|     default='show-ask', | ||||
|     help="how to handle PDF pages that aren't a single image with no text", | ||||
| ) | ||||
| parser_collate.add_argument( | ||||
|     'expression', | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue