add a second fallback method for getting PDF image xrefs per page
This commit is contained in:
		
							parent
							
								
									578a46c217
								
							
						
					
					
						commit
						716a42858c
					
				
					 1 changed file with 28 additions and 10 deletions
				
			
		
							
								
								
									
										38
									
								
								dlibrary.py
									
										
									
									
									
								
							
							
						
						
									
										38
									
								
								dlibrary.py
									
										
									
									
									
								
							|  | @ -111,19 +111,37 @@ def fetch(args): | |||
|     asyncio.run(fetch_async(args)) | ||||
| 
 | ||||
| 
 | ||||
def image_xrefs(pdf):
    """Return one image xref per page of *pdf*, or None if that is impossible.

    Two strategies are tried in order:

    1. Quick: ``page.get_images()`` for every page. Accepted only when every
       page lists exactly one image.
    2. Slow: ``page.get_image_info(xrefs=True)`` page by page, with a progress
       line on stdout. Gives up as soon as a page does not report exactly one
       image, or reports an xref of 0.

    Args:
        pdf: an open document; it must be iterable over pages (more than
            once) and expose ``page_count``.

    Returns:
        A list of xrefs in page order, or ``None`` when neither strategy
        finds exactly one usable image on every page.
    """
    images_by_page = [page.get_images() for page in pdf]
    if all(len(images) == 1 for images in images_by_page):
        return [images[0][0] for images in images_by_page]

    print("Checking PDF images the quick way failed, trying the slow way")
    xrefs = []
    for (idx, page) in enumerate(pdf):
        # The line has no trailing newline, so flush explicitly; otherwise a
        # line-buffered stdout holds the progress back until the final
        # '\nFailed' / '\nSuccess' and the user sees nothing while waiting.
        print(
            f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...',
            end='',
            flush=True,
        )
        images = page.get_image_info(xrefs=True)
        # NOTE(review): xref 0 presumably marks an image without its own
        # extractable object (e.g. inline) -- confirm against PyMuPDF docs.
        if len(images) != 1 or images[0]['xref'] == 0:
            print('\nFailed')
            return None
        xrefs.append(images[0]['xref'])

    print('\nSuccess')
    return xrefs
| 
 | ||||
def link_pdf(src, dest, start_index=0):
    """Extract the single image of each page of a PDF into numbered files.

    Opens *src* with ``fitz``, asks ``image_xrefs`` for one image xref per
    page, and writes each extracted image to ``dest`` as
    ``{index:04d}.{ext}``, where ``ext`` comes from the extracted image.

    Args:
        src: the PDF to open (passed straight to ``fitz.open``).
        dest: output directory; must support ``mkdir`` and the ``/``
            operator (presumably a ``pathlib.Path`` -- verify against caller).
        start_index: number given to the first page's file; later pages
            count up from there.

    Returns:
        None. If ``image_xrefs`` cannot resolve the pages, a message is
        printed and nothing is written (``dest`` is not created).
    """
    with fitz.open(src) as pdf:
        xrefs = image_xrefs(pdf)
        if xrefs is None:
            print(f'Support for weirder PDFs not yet implemented, skipping {src}')
            return

        # Create the directory only once we know there is something to write.
        dest.mkdir(parents=True, exist_ok=True)
        for (idx, xref) in enumerate(xrefs, start=start_index):
            image = pdf.extract_image(xref)
            file_path = dest / f'{idx:04d}.{image["ext"]}'
            with open(file_path, 'wb') as f:
                f.write(image["image"])
| 
 | ||||
| def complete_prefix_number_ordering(entries): | ||||
|     matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name))) | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue