Compare commits

...

3 commits

View file

@@ -77,6 +77,8 @@ IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
PDF_FALLBACK_DPI = 300
def open_zipfile_with_encoding(path):
try:
return zipfile.ZipFile(path, metadata_encoding="utf-8")
@@ -206,12 +208,15 @@ def fetch(args):
asyncio.run(fetch_async(args))
def self_and_parents(path):
    """Return *path* followed by every ancestor, from innermost to outermost.

    Unlike ``path.parents`` alone, the result includes the path itself at
    index 0, so index -2 is the top-most named component.
    """
    return [path, *path.parents]
def collate(args):
con = sqlite3.connect(args.destdir / 'meta.db')
cur = con.cursor()
extraction_dir = args.destdir / 'extract'
hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}
hint_map = {self_and_parents(Path(relpath(hint, extraction_dir)))[-2].name: hint for hint in args.hints}
collation_staging_area = args.destdir / 'site' / 'images-staging'
collation_staging_area.mkdir(parents=True)
@@ -232,7 +237,7 @@ def collate(args):
work_staging_dir = collation_staging_area / work_id
collator = Collator(work_staging_dir, [], args.locale)
collator = Collator(work_staging_dir, [], args)
collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)])
if collation_result and collator.index > 0:
print(f'Collated {collator.index} pages for {work_id}')
@@ -252,10 +257,10 @@ def collate(args):
con.close()
class Collator:
def __init__(self, dest, exclude, locale):
def __init__(self, dest, exclude, args):
self.dest = dest
self.exclude = exclude
self.locale = locale
self.args = args
self.index = 0
def collate_from_paths(self, srcs):
@@ -309,14 +314,13 @@ class Collator:
def link_pdf(self, src):
with fitz.open(src) as pdf:
xrefs = image_xrefs(pdf)
if xrefs is None:
images = pdf_images(pdf, self.args.force_convert_pdf)
if images is None:
print(f'Support for weirder PDFs not yet implemented, skipping {src}')
return None
self.dest.mkdir(parents=True, exist_ok=True)
for (idx, xref) in enumerate(xrefs, start=self.index):
image = pdf.extract_image(xref)
for (idx, image) in enumerate(images, start=self.index):
file_path = self.dest / f'{idx:04d}.{image["ext"]}'
with open(file_path, 'wb') as f:
f.write(image["image"])
@@ -422,34 +426,57 @@ class Collator:
return False
def try_collate_select_language(self, srcs):
if self.locale not in LANGUAGE_REGEXES:
if self.args.locale not in LANGUAGE_REGEXES:
return False
if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs):
return False
srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.locale].search(nname(src))]
srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.args.locale].search(nname(src))]
if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0:
return False
return self.collate_from_paths(srcs_matching_language)
def image_xrefs(pdf):
images_by_page = [page.get_images() for page in pdf]
if all(len(images) == 1 for images in images_by_page):
return [images[0][0] for images in images_by_page]
def is_single_image(page):
    """Return True when the page's text layout is exactly one image block.

    ``page.get_text('blocks')`` yields tuples whose 7th field (index 6) is
    the block type; a value of 1 marks an image block.
    """
    blocks = page.get_text('blocks')
    if len(blocks) != 1:
        return False
    return blocks[0][6] == 1
def pdf_images(pdf, force=False):
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
if all(len(images) == 1 and single for (images, single) in images_by_page):
return (pdf.extract_image(images[0][0]) for (images, _) in images_by_page)
print("Checking PDF images the quick way failed, trying the slow way")
xrefs = []
def xref_or_image_generator():
xref_mode = not force
for (idx, page) in enumerate(pdf):
print(f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...', end='')
images = page.get_image_info(xrefs=True)
if len(images) != 1 or images[0]['xref'] == 0:
page_images = page.get_image_info(xrefs=True)
if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page):
xref = page_images[0]['xref']
if xref_mode:
yield xref
else:
yield pdf.extract_image(xref)
else:
if xref_mode:
raise ValueError
else:
print(f'\nGenerating pixmap for page {idx+1}')
pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
yield { 'ext': 'png', 'image': pix.tobytes('png') }
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')
print('')
if force:
return xref_or_image_generator()
try:
xrefs = list(xref_or_image_generator())
except ValueError:
print('\nFailed')
return None
xrefs.append(images[0]['xref'])
print('\nSuccess')
return xrefs
print('Success')
return (pdf.extract_image(xref) for xref in xrefs)
def nfc(s):
    """Normalize the string *s* to Unicode canonical composed (NFC) form."""
    return unicodedata.normalize('NFC', s)
@@ -677,7 +704,7 @@ def manual_collate(args):
extraction_dir = args.destdir / 'extract'
sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group)
work_id = Path(relpath(sample_path, extraction_dir)).parents[-2].name
work_id = self_and_parents(Path(relpath(sample_path, extraction_dir)))[-2].name
exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
@@ -706,7 +733,7 @@ def manual_collate(args):
work_staging_dir = collation_staging_area / work_id
work_staging_dir.mkdir(parents=True)
collator = Collator(work_staging_dir, exclusions, args.locale)
collator = Collator(work_staging_dir, exclusions, args)
for group in groups:
collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
if collation_result is None:
@@ -1002,7 +1029,7 @@ parser_collate.add_argument(
nargs='*',
help='paths within extraction folders as collation hints'
)
parser_collate.set_defaults(func=collate)
parser_collate.set_defaults(func=collate, force_convert_pdf=False)
parser_manual_collate = subparsers.add_parser(
'manual-collate',
@@ -1042,6 +1069,11 @@ parser_manual_collate = subparsers.add_parser(
only handle one work at a time.
"""),
)
parser_manual_collate.add_argument(
'--force-convert-pdf',
action='store_true',
help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
)
parser_manual_collate.add_argument(
'expression',
nargs='+',