add option to display weird PDF pages as sixel graphics before deciding how to handle them
This commit is contained in:
parent
3a9199b847
commit
ee9eba3287
|
@ -3,6 +3,7 @@
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
import importlib_resources as resources
|
import importlib_resources as resources
|
||||||
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os
|
import os
|
||||||
from os.path import relpath, splitext
|
from os.path import relpath, splitext
|
||||||
|
@ -17,6 +18,7 @@ import zipfile
|
||||||
|
|
||||||
from dlsite_async import DlsiteAPI
|
from dlsite_async import DlsiteAPI
|
||||||
import fitz
|
import fitz
|
||||||
|
from libsixel import *
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from jinja2 import Environment, PackageLoader, select_autoescape
|
from jinja2 import Environment, PackageLoader, select_autoescape
|
||||||
import requests
|
import requests
|
||||||
|
@ -77,7 +79,8 @@ IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']
|
||||||
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
|
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
|
||||||
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
|
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
|
||||||
|
|
||||||
PDF_FALLBACK_DPI = 300
|
PDF_CONVERSION_DPI = 300
|
||||||
|
PDF_PREVIEW_DPI = 72
|
||||||
|
|
||||||
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
|
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
|
||||||
|
|
||||||
|
@ -527,6 +530,45 @@ def extract_image(pdf, xref):
|
||||||
pix = fitz.Pixmap(pdf, xref)
|
pix = fitz.Pixmap(pdf, xref)
|
||||||
return { 'ext': 'png', 'image': pix.tobytes('png') }
|
return { 'ext': 'png', 'image': pix.tobytes('png') }
|
||||||
|
|
||||||
|
def display_sixel_page(page):
    """Print a low-resolution preview of a PDF page to stdout as sixel graphics.

    The page is rasterized at PDF_PREVIEW_DPI, round-tripped through PNG into
    a PIL image, encoded with libsixel into an in-memory buffer, and the
    resulting escape sequence is printed in one go.

    Args:
        page: a PDF page object providing ``get_pixmap(dpi=...)`` whose result
            supports ``tobytes('png')``.

    Raises:
        RuntimeError: if the decoded image is in a mode other than
            'RGBA', 'RGB', 'P', 'L' or '1'.
    """
    image = Image.open(BytesIO(page.get_pixmap(dpi=PDF_PREVIEW_DPI).tobytes('png')))
    width, height = image.size
    mode = image.mode
    data = image.tobytes()

    # Reject unsupported modes before any libsixel object is allocated, so
    # there is nothing to release on this error path.
    if mode not in ('RGBA', 'RGB', 'P', 'L', '1'):
        raise RuntimeError('unexpected image mode')

    s = BytesIO()
    # libsixel hands each encoded chunk to the callback together with the
    # private object passed here (the buffer).
    output = sixel_output_new(lambda chunk, buf: buf.write(chunk), s)
    try:
        # Grayscale and bilevel images use libsixel's built-in dithers;
        # everything else gets a fresh 256-colour dither.
        if mode == 'L':
            dither = sixel_dither_get(SIXEL_BUILTIN_G8)
        elif mode == '1':
            dither = sixel_dither_get(SIXEL_BUILTIN_G1)
        else:
            dither = sixel_dither_new(256)

        # Configuration happens inside the try so that the dither is released
        # even when initialization or palette setup fails.
        try:
            if mode == 'RGBA':
                sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGBA8888)
            elif mode == 'RGB':
                sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGB888)
            elif mode == 'P':
                sixel_dither_set_palette(dither, image.getpalette())
                sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_PAL8)
            elif mode == 'L':
                sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G8)
            else:
                sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G1)

            sixel_encode(data, width, height, 1, dither, output)
            print(s.getvalue().decode('ascii'))
        finally:
            sixel_dither_unref(dither)
    finally:
        sixel_output_unref(output)
|
||||||
|
|
||||||
def pdf_images(pdf, force=False):
|
def pdf_images(pdf, force=False):
|
||||||
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
|
images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
|
||||||
if all(len(images) == 1 and single for (images, single) in images_by_page):
|
if all(len(images) == 1 and single for (images, single) in images_by_page):
|
||||||
|
@ -549,14 +591,18 @@ def pdf_images(pdf, force=False):
|
||||||
print(f'Converting page {idx+1}')
|
print(f'Converting page {idx+1}')
|
||||||
choice = 'c'
|
choice = 'c'
|
||||||
else:
|
else:
|
||||||
if xref is None:
|
shown = False
|
||||||
choice = input('[N]ope out of this PDF or [c]onvert the page lossily to an image? [N/c] ')
|
while True:
|
||||||
else:
|
choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ')
|
||||||
choice = input('[N]ope out of this PDF, e[x]tract the image without the text, or [c]onvert the entire page lossily to an image? [N/x/c] ')
|
if not shown and choice != '' and choice[0].lower() == 's':
|
||||||
|
display_sixel_page(page)
|
||||||
|
shown = True
|
||||||
|
else:
|
||||||
|
break
|
||||||
if xref is not None and choice != '' and choice[0].lower() == 'x':
|
if xref is not None and choice != '' and choice[0].lower() == 'x':
|
||||||
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
|
||||||
elif choice != '' and choice[0].lower() == 'c':
|
elif choice != '' and choice[0].lower() == 'c':
|
||||||
image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_FALLBACK_DPI).tobytes('png') })
|
image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') })
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
|
print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
|
||||||
|
|
|
@ -57,6 +57,7 @@
|
||||||
jinja2
|
jinja2
|
||||||
importlib-resources
|
importlib-resources
|
||||||
setuptools
|
setuptools
|
||||||
|
libsixel
|
||||||
];
|
];
|
||||||
src = ./.;
|
src = ./.;
|
||||||
};
|
};
|
||||||
|
|
|
@ -11,6 +11,7 @@ dependencies = [
|
||||||
"dlsite-async",
|
"dlsite-async",
|
||||||
"jinja2",
|
"jinja2",
|
||||||
"importlib_resources",
|
"importlib_resources",
|
||||||
|
"libsixel",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|
Loading…
Reference in a new issue