From ee9eba32872f003ae23c22c11a519693e24817cd Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Tue, 12 Mar 2024 04:35:09 -0400
Subject: [PATCH] add option to display weird PDF pages as sixel graphics
 before deciding how to handle them

---
 dlibrary/dlibrary.py | 58 +++++++++++++++++++++++++++++++++++++++-----
 flake.nix            |  1 +
 pyproject.toml       |  1 +
 3 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index bd3fa8f..ab14717 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import importlib_resources as resources
+from io import BytesIO
 from pathlib import Path
 import os
 from os.path import relpath, splitext
@@ -17,6 +18,7 @@ import zipfile
 
 from dlsite_async import DlsiteAPI
 import fitz
+from libsixel import *
 from PIL import Image
 from jinja2 import Environment, PackageLoader, select_autoescape
 import requests
@@ -77,7 +79,8 @@ IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']
 IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
 IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
 
-PDF_FALLBACK_DPI = 300
+PDF_CONVERSION_DPI = 300
+PDF_PREVIEW_DPI = 72
 
 IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
 
@@ -527,6 +530,45 @@ def extract_image(pdf, xref):
     pix = fitz.Pixmap(pdf, xref)
     return { 'ext': 'png', 'image': pix.tobytes('png') }
 
+def display_sixel_page(page):
+    """Print a PDF page to stdout as sixel graphics, rendered at PDF_PREVIEW_DPI.
+
+    Raises RuntimeError if the rendered image mode is not RGBA, RGB, P, L or 1."""
+    s = BytesIO()
+    image = Image.open(BytesIO(page.get_pixmap(dpi=PDF_PREVIEW_DPI).tobytes('png')))
+    width, height = image.size
+    data = image.tobytes()
+    output = sixel_output_new(lambda chunk, buf: buf.write(chunk), s)
+    dither = None  # stays None until a dither exists, so cleanup knows whether to release one
+    try:
+        if image.mode == 'RGBA':
+            dither = sixel_dither_new(256)
+            sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGBA8888)
+        elif image.mode == 'RGB':
+            dither = sixel_dither_new(256)
+            sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGB888)
+        elif image.mode == 'P':
+            palette = image.getpalette()
+            dither = sixel_dither_new(256)
+            sixel_dither_set_palette(dither, palette)
+            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_PAL8)
+        elif image.mode == 'L':
+            dither = sixel_dither_get(SIXEL_BUILTIN_G8)
+            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G8)
+        elif image.mode == '1':
+            dither = sixel_dither_get(SIXEL_BUILTIN_G1)
+            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G1)
+        else:
+            raise RuntimeError('unexpected image mode')
+        sixel_encode(data, width, height, 1, dither, output)
+        print(s.getvalue().decode('ascii'))
+    finally:
+        try:
+            if dither is not None:
+                sixel_dither_unref(dither)
+        finally:
+            sixel_output_unref(output)
+
 def pdf_images(pdf, force=False):
     images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
     if all(len(images) == 1 and single for (images, single) in images_by_page):
@@ -549,14 +591,18 @@ def pdf_images(pdf, force=False):
                 print(f'Converting page {idx+1}')
                 choice = 'c'
             else:
-                if xref is None:
-                    choice = input('[N]ope out of this PDF or [c]onvert the page lossily to an image? [N/c] ')
-                else:
-                    choice = input('[N]ope out of this PDF, e[x]tract the image without the text, or [c]onvert the entire page lossily to an image? [N/x/c] ')
+                shown = False
+                while True:
+                    choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ')
+                    if not shown and choice != '' and choice[0].lower() == 's':
+                        display_sixel_page(page)
+                        shown = True
+                    else:
+                        break
             if xref is not None and choice != '' and choice[0].lower() == 'x':
                 image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
             elif choice != '' and choice[0].lower() == 'c':
-                image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_FALLBACK_DPI).tobytes('png') })
+                image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') })
             else:
                 return None
         print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
diff --git a/flake.nix b/flake.nix
index 3c8adff..8ef597c 100644
--- a/flake.nix
+++ b/flake.nix
@@ -57,6 +57,7 @@
           jinja2
           importlib-resources
           setuptools
+          libsixel
         ];
         src = ./.;
       };
diff --git a/pyproject.toml b/pyproject.toml
index acc2a38..f888f0d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "dlsite-async",
     "jinja2",
     "importlib_resources",
+    "libsixel",
 ]
 
 [project.scripts]