Compare commits
2 commits
6f4444f67e
...
6b3982ecf0
Author | SHA1 | Date | |
---|---|---|---|
xenofem | 6b3982ecf0 | ||
xenofem | 11ea5a0d58 |
|
@ -79,6 +79,8 @@ IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
|
||||||
|
|
||||||
PDF_FALLBACK_DPI = 300
|
PDF_FALLBACK_DPI = 300
|
||||||
|
|
||||||
|
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
|
||||||
|
|
||||||
def open_zipfile_with_encoding(path):
|
def open_zipfile_with_encoding(path):
|
||||||
try:
|
try:
|
||||||
return zipfile.ZipFile(path, metadata_encoding="utf-8")
|
return zipfile.ZipFile(path, metadata_encoding="utf-8")
|
||||||
|
@ -316,7 +318,7 @@ class Collator:
|
||||||
with fitz.open(src) as pdf:
|
with fitz.open(src) as pdf:
|
||||||
images = pdf_images(pdf, self.args.force_convert_pdf)
|
images = pdf_images(pdf, self.args.force_convert_pdf)
|
||||||
if images is None:
|
if images is None:
|
||||||
print(f'Support for weirder PDFs not yet implemented, skipping {src}')
|
print(f'Failed to enumerate page images in PDF, skipping {src}')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self.dest.mkdir(parents=True, exist_ok=True)
|
self.dest.mkdir(parents=True, exist_ok=True)
|
||||||
|
@ -437,9 +439,22 @@ class Collator:
|
||||||
|
|
||||||
return self.collate_from_paths(srcs_matching_language)
|
return self.collate_from_paths(srcs_matching_language)
|
||||||
|
|
||||||
def is_single_image(page):
|
def block_is_image(block):
|
||||||
|
return block[6] == 1
|
||||||
|
|
||||||
|
def block_text(block):
|
||||||
|
return block[4]
|
||||||
|
|
||||||
|
def block_relevant(block):
|
||||||
|
return block_is_image(block) or not IRRELEVANT_PDF_BLOCK_REGEX.search(block_text(block))
|
||||||
|
|
||||||
|
def relevant_blocks(page):
|
||||||
blocks = page.get_text('blocks')
|
blocks = page.get_text('blocks')
|
||||||
return len(blocks) == 1 and blocks[0][6] == 1
|
return [block for block in blocks if block_relevant(block)]
|
||||||
|
|
||||||
|
def is_single_image(page):
|
||||||
|
blocks = relevant_blocks(page)
|
||||||
|
return len(blocks) == 1 and block_is_image(blocks[0])
|
||||||
|
|
||||||
def extract_image(pdf, xref):
|
def extract_image(pdf, xref):
|
||||||
image = pdf.extract_image(xref)
|
image = pdf.extract_image(xref)
|
||||||
|
@ -466,10 +481,11 @@ def pdf_images(pdf, force=False):
|
||||||
else:
|
else:
|
||||||
yield extract_image(pdf, xref)
|
yield extract_image(pdf, xref)
|
||||||
else:
|
else:
|
||||||
|
print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
|
||||||
if xref_mode:
|
if xref_mode:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
else:
|
else:
|
||||||
print(f'\nGenerating pixmap for page {idx+1}')
|
print(f'Generating pixmap for page {idx+1}')
|
||||||
pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
|
pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
|
||||||
yield { 'ext': 'png', 'image': pix.tobytes('png') }
|
yield { 'ext': 'png', 'image': pix.tobytes('png') }
|
||||||
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')
|
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')
|
||||||
|
|
Loading…
Reference in a new issue