add option to convert PDF pages to pixmaps as needed
This commit is contained in:
parent
2db4553570
commit
9fea03c270
|
@ -77,6 +77,8 @@ IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']
|
||||||
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
|
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
|
||||||
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
|
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
|
||||||
|
|
||||||
|
PDF_FALLBACK_DPI = 300
|
||||||
|
|
||||||
def open_zipfile_with_encoding(path):
|
def open_zipfile_with_encoding(path):
|
||||||
try:
|
try:
|
||||||
return zipfile.ZipFile(path, metadata_encoding="utf-8")
|
return zipfile.ZipFile(path, metadata_encoding="utf-8")
|
||||||
|
@ -232,7 +234,7 @@ def collate(args):
|
||||||
|
|
||||||
work_staging_dir = collation_staging_area / work_id
|
work_staging_dir = collation_staging_area / work_id
|
||||||
|
|
||||||
collator = Collator(work_staging_dir, [], args.locale)
|
collator = Collator(work_staging_dir, [], args)
|
||||||
collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)])
|
collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)])
|
||||||
if collation_result and collator.index > 0:
|
if collation_result and collator.index > 0:
|
||||||
print(f'Collated {collator.index} pages for {work_id}')
|
print(f'Collated {collator.index} pages for {work_id}')
|
||||||
|
@ -252,10 +254,10 @@ def collate(args):
|
||||||
con.close()
|
con.close()
|
||||||
|
|
||||||
class Collator:
|
class Collator:
|
||||||
def __init__(self, dest, exclude, locale):
|
def __init__(self, dest, exclude, args):
|
||||||
self.dest = dest
|
self.dest = dest
|
||||||
self.exclude = exclude
|
self.exclude = exclude
|
||||||
self.locale = locale
|
self.args = args
|
||||||
self.index = 0
|
self.index = 0
|
||||||
|
|
||||||
def collate_from_paths(self, srcs):
|
def collate_from_paths(self, srcs):
|
||||||
|
@ -309,14 +311,13 @@ class Collator:
|
||||||
|
|
||||||
def link_pdf(self, src):
|
def link_pdf(self, src):
|
||||||
with fitz.open(src) as pdf:
|
with fitz.open(src) as pdf:
|
||||||
xrefs = image_xrefs(pdf)
|
images = pdf_images(pdf, self.args.force_convert_pdf)
|
||||||
if xrefs is None:
|
if images is None:
|
||||||
print(f'Support for weirder PDFs not yet implemented, skipping {src}')
|
print(f'Support for weirder PDFs not yet implemented, skipping {src}')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self.dest.mkdir(parents=True, exist_ok=True)
|
self.dest.mkdir(parents=True, exist_ok=True)
|
||||||
for (idx, xref) in enumerate(xrefs, start=self.index):
|
for (idx, image) in enumerate(images, start=self.index):
|
||||||
image = pdf.extract_image(xref)
|
|
||||||
file_path = self.dest / f'{idx:04d}.{image["ext"]}'
|
file_path = self.dest / f'{idx:04d}.{image["ext"]}'
|
||||||
with open(file_path, 'wb') as f:
|
with open(file_path, 'wb') as f:
|
||||||
f.write(image["image"])
|
f.write(image["image"])
|
||||||
|
@ -422,34 +423,53 @@ class Collator:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def try_collate_select_language(self, srcs):
|
def try_collate_select_language(self, srcs):
|
||||||
if self.locale not in LANGUAGE_REGEXES:
|
if self.args.locale not in LANGUAGE_REGEXES:
|
||||||
return False
|
return False
|
||||||
if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs):
|
if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.locale].search(nname(src))]
|
srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.args.locale].search(nname(src))]
|
||||||
if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0:
|
if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return self.collate_from_paths(srcs_matching_language)
|
return self.collate_from_paths(srcs_matching_language)
|
||||||
|
|
||||||
def image_xrefs(pdf):
|
def pdf_images(pdf, force=False):
|
||||||
images_by_page = [page.get_images() for page in pdf]
|
images_by_page = [page.get_images() for page in pdf]
|
||||||
if all(len(images) == 1 for images in images_by_page):
|
if all(len(images) == 1 for images in images_by_page):
|
||||||
return [images[0][0] for images in images_by_page]
|
return (pdf.extract_image(images[0][0]) for images in images_by_page)
|
||||||
|
|
||||||
print("Checking PDF images the quick way failed, trying the slow way")
|
print("Checking PDF images the quick way failed, trying the slow way")
|
||||||
xrefs = []
|
def xref_or_image_generator():
|
||||||
|
xref_mode = not force
|
||||||
for (idx, page) in enumerate(pdf):
|
for (idx, page) in enumerate(pdf):
|
||||||
print(f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...', end='')
|
page_images = page.get_image_info(xrefs=True)
|
||||||
images = page.get_image_info(xrefs=True)
|
if len(page_images) == 1 and page_images[0]['xref'] != 0:
|
||||||
if len(images) != 1 or images[0]['xref'] == 0:
|
xref = page_images[0]['xref']
|
||||||
|
if xref_mode:
|
||||||
|
yield xref
|
||||||
|
else:
|
||||||
|
yield pdf.extract_image(xref)
|
||||||
|
else:
|
||||||
|
if xref_mode:
|
||||||
|
raise ValueError
|
||||||
|
else:
|
||||||
|
print(f'\nGenerating pixmap for page {idx+1}')
|
||||||
|
pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
|
||||||
|
yield { 'ext': 'png', 'image': pix.tobytes('png') }
|
||||||
|
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')
|
||||||
|
print('')
|
||||||
|
|
||||||
|
if force:
|
||||||
|
return xref_or_image_generator()
|
||||||
|
|
||||||
|
try:
|
||||||
|
xrefs = list(xref_or_image_generator())
|
||||||
|
except ValueError:
|
||||||
print('\nFailed')
|
print('\nFailed')
|
||||||
return None
|
return None
|
||||||
xrefs.append(images[0]['xref'])
|
print('Success')
|
||||||
|
return (pdf.extract_image(xref) for xref in xrefs)
|
||||||
print('\nSuccess')
|
|
||||||
return xrefs
|
|
||||||
|
|
||||||
def nfc(s):
|
def nfc(s):
|
||||||
return unicodedata.normalize('NFC', s)
|
return unicodedata.normalize('NFC', s)
|
||||||
|
@ -706,7 +726,7 @@ def manual_collate(args):
|
||||||
work_staging_dir = collation_staging_area / work_id
|
work_staging_dir = collation_staging_area / work_id
|
||||||
work_staging_dir.mkdir(parents=True)
|
work_staging_dir.mkdir(parents=True)
|
||||||
|
|
||||||
collator = Collator(work_staging_dir, exclusions, args.locale)
|
collator = Collator(work_staging_dir, exclusions, args)
|
||||||
for group in groups:
|
for group in groups:
|
||||||
collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
|
collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
|
||||||
if collation_result is None:
|
if collation_result is None:
|
||||||
|
@ -1002,7 +1022,7 @@ parser_collate.add_argument(
|
||||||
nargs='*',
|
nargs='*',
|
||||||
help='paths within extraction folders as collation hints'
|
help='paths within extraction folders as collation hints'
|
||||||
)
|
)
|
||||||
parser_collate.set_defaults(func=collate)
|
parser_collate.set_defaults(func=collate, force_convert_pdf=False)
|
||||||
|
|
||||||
parser_manual_collate = subparsers.add_parser(
|
parser_manual_collate = subparsers.add_parser(
|
||||||
'manual-collate',
|
'manual-collate',
|
||||||
|
@ -1042,6 +1062,11 @@ parser_manual_collate = subparsers.add_parser(
|
||||||
only handle one work at a time.
|
only handle one work at a time.
|
||||||
"""),
|
"""),
|
||||||
)
|
)
|
||||||
|
parser_manual_collate.add_argument(
|
||||||
|
'--force-convert-pdf',
|
||||||
|
action='store_true',
|
||||||
|
help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
|
||||||
|
)
|
||||||
parser_manual_collate.add_argument(
|
parser_manual_collate.add_argument(
|
||||||
'expression',
|
'expression',
|
||||||
nargs='+',
|
nargs='+',
|
||||||
|
|
Loading…
Reference in a new issue