add a second fallback method for getting PDF image xrefs per page
This commit is contained in:
		
							parent
							
								
									578a46c217
								
							
						
					
					
						commit
						716a42858c
					
				
					 1 changed files with 28 additions and 10 deletions
				
			
		
							
								
								
									
										30
									
								
								dlibrary.py
									
										
									
									
									
								
							
							
						
						
									
										30
									
								
								dlibrary.py
									
										
									
									
									
								
							|  | @ -111,19 +111,37 @@ def fetch(args): | ||||||
|     asyncio.run(fetch_async(args)) |     asyncio.run(fetch_async(args)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def link_pdf(src, dest, start_index=0): | def image_xrefs(pdf): | ||||||
|     with fitz.open(src) as pdf: |  | ||||||
|     images_by_page = [page.get_images() for page in pdf] |     images_by_page = [page.get_images() for page in pdf] | ||||||
|     if all(len(images) == 1 for images in images_by_page): |     if all(len(images) == 1 for images in images_by_page): | ||||||
|  |         return [images[0][0] for images in images_by_page] | ||||||
|  | 
 | ||||||
|  |     print("Checking PDF images the quick way failed, trying the slow way") | ||||||
|  |     xrefs = [] | ||||||
|  |     for (idx, page) in enumerate(pdf): | ||||||
|  |         print(f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...', end='') | ||||||
|  |         images = page.get_image_info(xrefs=True) | ||||||
|  |         if len(images) != 1 or images[0]['xref'] == 0: | ||||||
|  |             print('\nFailed') | ||||||
|  |             return None | ||||||
|  |         xrefs.append(images[0]['xref']) | ||||||
|  | 
 | ||||||
|  |     print('\nSuccess') | ||||||
|  |     return xrefs | ||||||
|  | 
 | ||||||
|  | def link_pdf(src, dest, start_index=0): | ||||||
|  |     with fitz.open(src) as pdf: | ||||||
|  |         xrefs = image_xrefs(pdf) | ||||||
|  |         if xrefs is None: | ||||||
|  |             print(f'Support for weirder PDFs not yet implemented, skipping {src}') | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|         dest.mkdir(parents=True, exist_ok=True) |         dest.mkdir(parents=True, exist_ok=True) | ||||||
|             for (idx, images) in enumerate(images_by_page, start=start_index): |         for (idx, xref) in enumerate(xrefs, start=start_index): | ||||||
|                 xref = images[0][0] |  | ||||||
|             image = pdf.extract_image(xref) |             image = pdf.extract_image(xref) | ||||||
|             file_path = dest / f'{idx:04d}.{image["ext"]}' |             file_path = dest / f'{idx:04d}.{image["ext"]}' | ||||||
|             with open(file_path, 'wb') as f: |             with open(file_path, 'wb') as f: | ||||||
|                 f.write(image["image"]) |                 f.write(image["image"]) | ||||||
|         else: |  | ||||||
|             print(f'Support for weirder PDFs not yet implemented, skipping {src}') |  | ||||||
| 
 | 
 | ||||||
| def complete_prefix_number_ordering(entries): | def complete_prefix_number_ordering(entries): | ||||||
|     matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name))) |     matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name))) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue