From ee9eba32872f003ae23c22c11a519693e24817cd Mon Sep 17 00:00:00 2001 From: xenofem Date: Tue, 12 Mar 2024 04:35:09 -0400 Subject: [PATCH] add option to display weird PDF pages as sixel graphics before deciding how to handle them --- dlibrary/dlibrary.py | 58 +++++++++++++++++++++++++++++++++++++++----- flake.nix | 1 + pyproject.toml | 1 + 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index bd3fa8f..ab14717 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -3,6 +3,7 @@ import argparse import asyncio import importlib_resources as resources +from io import BytesIO from pathlib import Path import os from os.path import relpath, splitext @@ -17,6 +18,7 @@ import zipfile from dlsite_async import DlsiteAPI import fitz +from libsixel import * from PIL import Image from jinja2 import Environment, PackageLoader, select_autoescape import requests @@ -77,7 +79,8 @@ IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp'] IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store'] IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4'] -PDF_FALLBACK_DPI = 300 +PDF_CONVERSION_DPI = 300 +PDF_PREVIEW_DPI = 72 IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I) @@ -527,6 +530,45 @@ def extract_image(pdf, xref): pix = fitz.Pixmap(pdf, xref) return { 'ext': 'png', 'image': pix.tobytes('png') } +def display_sixel_page(page): + s = BytesIO() + image = Image.open(BytesIO(page.get_pixmap(dpi=PDF_PREVIEW_DPI).tobytes('png'))) + width, height = image.size + + try: + data = image.tobytes() + except NotImplementedError: + data = image.tostring() + output = sixel_output_new(lambda data, s: s.write(data), s) + + try: + if image.mode == 'RGBA': + dither = sixel_dither_new(256) + sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGBA8888) + elif image.mode == 'RGB': + dither = sixel_dither_new(256) + sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGB888) + elif image.mode == 'P': + palette = image.getpalette() + dither = sixel_dither_new(256) + sixel_dither_set_palette(dither, palette) + sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_PAL8) + elif image.mode == 'L': + dither = sixel_dither_get(SIXEL_BUILTIN_G8) + sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G8) + elif image.mode == '1': + dither = sixel_dither_get(SIXEL_BUILTIN_G1) + sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G1) + else: + raise RuntimeError('unexpected image mode') + try: + sixel_encode(data, width, height, 1, dither, output) + print(s.getvalue().decode('ascii')) + finally: + sixel_dither_unref(dither) + finally: + sixel_output_unref(output) + def pdf_images(pdf, force=False): images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf] if all(len(images) == 1 and single for (images, single) in images_by_page): @@ -549,14 +591,18 @@ def pdf_images(pdf, force=False): print(f'Converting page {idx+1}') choice = 'c' else: - if xref is None: - choice = input('[N]ope out of this PDF or [c]onvert the page lossily to an image? [N/c] ') - else: - choice = input('[N]ope out of this PDF, e[x]tract the image without the text, or [c]onvert the entire page lossily to an image? [N/x/c] ') + shown = False + while True: + choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ') + if not shown and choice != '' and choice[0].lower() == 's': + display_sixel_page(page) + shown = True + else: + break if xref is not None and choice != '' and choice[0].lower() == 'x': image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x)) elif choice != '' and choice[0].lower() == 'c': - image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_FALLBACK_DPI).tobytes('png') }) + image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') }) else: return None print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n')) diff --git a/flake.nix b/flake.nix index 3c8adff..8ef597c 100644 --- a/flake.nix +++ b/flake.nix @@ -57,6 +57,7 @@ jinja2 importlib-resources setuptools + libsixel ]; src = ./.; }; diff --git a/pyproject.toml b/pyproject.toml index acc2a38..f888f0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "dlsite-async", "jinja2", "importlib_resources", + "libsixel", ] [project.scripts]