Also check whether PDF pages have text alongside images
This commit is contained in:
		
							parent
							
								
									c042163e85
								
							
						
					
					
						commit
						7535cb6162
					
				
					 1 changed files with 8 additions and 4 deletions
				
			
		|  | @ -437,17 +437,21 @@ class Collator: | ||||||
| 
 | 
 | ||||||
|         return self.collate_from_paths(srcs_matching_language) |         return self.collate_from_paths(srcs_matching_language) | ||||||
| 
 | 
 | ||||||
|  | def is_single_image(page): | ||||||
|  |     blocks = page.get_text('blocks') | ||||||
|  |     return len(blocks) == 1 and blocks[0][6] == 1 | ||||||
|  | 
 | ||||||
| def pdf_images(pdf, force=False): | def pdf_images(pdf, force=False): | ||||||
|     images_by_page = [page.get_images() for page in pdf] |     images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] | ||||||
|     if all(len(images) == 1 for images in images_by_page): |     if all(len(images) == 1 and single for (images, single) in images_by_page): | ||||||
|         return (pdf.extract_image(images[0][0]) for images in images_by_page) |         return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page) | ||||||
| 
 | 
 | ||||||
|     print("Checking PDF images the quick way failed, trying the slow way") |     print("Checking PDF images the quick way failed, trying the slow way") | ||||||
|     def xref_or_image_generator(): |     def xref_or_image_generator(): | ||||||
|         xref_mode = not force |         xref_mode = not force | ||||||
|         for (idx, page) in enumerate(pdf): |         for (idx, page) in enumerate(pdf): | ||||||
|             page_images = page.get_image_info(xrefs=True) |             page_images = page.get_image_info(xrefs=True) | ||||||
|             if len(page_images) == 1 and page_images[0]['xref'] != 0: |             if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page): | ||||||
|                 xref = page_images[0]['xref'] |                 xref = page_images[0]['xref'] | ||||||
|                 if xref_mode: |                 if xref_mode: | ||||||
|                     yield xref |                     yield xref | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue