From 7e429f3160e1b092907c98027279b6c9cd3087bc Mon Sep 17 00:00:00 2001
From: xenofem <xenofem@xeno.science>
Date: Tue, 6 Feb 2024 23:01:59 -0500
Subject: [PATCH] detect if there are equivalent PDF and image-file versions,
 and choose whichever is higher-resolution

---
 dlibrary/dlibrary.py | 73 +++++++++++++++++++++++++++++++++++++++++---
 flake.nix            |  1 +
 pyproject.toml       |  1 +
 3 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py
index 5e2ddb8..2ba3b42 100755
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -4,7 +4,7 @@ import argparse
 import asyncio
 import importlib_resources as resources
 from pathlib import Path
-from os import getenv
+import os
 from os.path import relpath, splitext
 import re
 import shutil
@@ -15,6 +15,7 @@ import zipfile
 
 from dlsite_async import DlsiteAPI
 import fitz
+from PIL import Image
 from jinja2 import Environment, PackageLoader, select_autoescape
 import requests
 
@@ -325,12 +326,25 @@ def link_ordered_files(ordering, dest, start_index):
         link_path = dest / f'{idx:04d}{ext}'
         link_path.symlink_to(relpath(src_path, dest))
 
+def ignoreable(path):  # True when the exact file name or the lowercased suffix is on an ignore list
+    return path.name in IGNOREABLE_FILES or path.suffix.lower() in IGNOREABLE_EXTENSIONS
+
 def ls_ignore(directory):
     return [
         path for path in directory.iterdir()
-        if path.name not in IGNOREABLE_FILES and path.suffix.lower() not in IGNOREABLE_EXTENSIONS
+        if not ignoreable(path)
     ]
 
+def descendant_files_ignore(directory):
+    """Recursively list the non-ignored, non-directory entries found under directory."""
+    found = []
+    for entry in ls_ignore(directory):
+        if not entry.is_dir():
+            found.append(entry)
+        else:  # expand the subdirectory in place, keeping listing order
+            found.extend(descendant_files_ignore(entry))
+    return found
+
 def collate(args):
     con = sqlite3.connect(args.destdir / 'meta.db')
     cur = con.cursor()
@@ -392,6 +406,29 @@ def collate_regex_later(srcs, dest, regex, start_index):
         return None
     return nonmatching_pages + matching_pages
 
+def standalone_image_size(filepath):  # dimensions of one image file, read through PIL
+    with Image.open(filepath) as im:  # `with` releases the opened image when done
+        return im.size  # (width, height); indexed as [0]/[1] by the caller
+
+def pdf_image_sizes(filepath):
+    """Return a list of (width, height) tuples, one per distinct image xref in the PDF."""
+    first_size_for = {}
+    with fitz.open(filepath) as document:
+        for page in document:
+            # Each get_images() row is unpacked as (xref, <unused>, width, height, ...).
+            for (xref, _, width, height, *_) in page.get_images():
+                # Keep the first size seen per xref so a repeated xref counts once.
+                first_size_for.setdefault(xref, (width, height))
+
+    return list(first_size_for.values())
+
+def median(items):
+    """Return the sorted middle item (upper-middle if even), or None if empty; items is not modified."""
+    if len(items) == 0:
+        return None
+    ordered = sorted(items)  # sort a copy: list.sort() would reorder the caller's list in place
+    return ordered[len(ordered) // 2]
+
 def collate_from_paths(srcs, dest, start_index):
     if len(srcs) == 1 and srcs[0].is_dir():
         return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
@@ -420,6 +457,34 @@ def collate_from_paths(srcs, dest, start_index):
         else:
             return None
 
+    pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf']
+    if len(pdfs) == 1:
+        pdf = pdfs[0]
+        images = []
+        non_images = []
+        descendant_files = [
+            src for src in srcs if src != pdf and src.is_file()
+        ] + [
+            f for src in srcs if src.is_dir() for f in descendant_files_ignore(src)
+        ]
+        for f in descendant_files:
+            if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
+                images.append(f)
+            else:
+                non_images.append(f)
+                break
+        if len(non_images) == 0 and len(images) > 0:
+            pdf_sizes = pdf_image_sizes(pdf)
+            standalone_sizes = [standalone_image_size(f) for f in images]
+            if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2:
+                median_pdf_size = median(pdf_sizes)
+                median_standalone_size = median(standalone_sizes)
+                if median_pdf_size and median_standalone_size:
+                    if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]:
+                        return collate_from_paths([src for src in srcs if src != pdf], dest, start_index)
+                    if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]:
+                        return collate_from_paths([pdf], dest, start_index)
+
     return None
 
 def self_and_parents(path):
@@ -641,7 +706,7 @@ argparser = argparse.ArgumentParser(
 argparser.add_argument(
     '-d', '--destdir',
     type=Path,
-    default=Path(getenv('DLIBRARY_DIR', './dlibrary')),
+    default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')),
     help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)',
 )
 subparsers = argparser.add_subparsers(title="subcommands", required=True)
@@ -665,7 +730,7 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail
 parser_fetch.add_argument(
     '-l', '--locale',
     type=str,
-    default=getenv('DLIBRARY_LOCALE', 'en_US'),
+    default=os.getenv('DLIBRARY_LOCALE', 'en_US'),
     help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). '
           'May still fall back to Japanese if metadata in other languages is unavailable. '
           '(default: $DLIBRARY_LOCALE or en_US)'),
diff --git a/flake.nix b/flake.nix
index 63d5f48..4ae830b 100644
--- a/flake.nix
+++ b/flake.nix
@@ -51,6 +51,7 @@
         pyproject = true;
         propagatedBuildInputs = [
           pymupdf
+          pillow
           requests
           dlsite-async
           jinja2
diff --git a/pyproject.toml b/pyproject.toml
index 58df9bd..7ecfcfd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ authors = [{name = "xenofem"}]
 dependencies = [
     "requests",
     "PyMuPDF",
+    "pillow",
     "dlsite-async",
     "jinja2",
     "importlib_resources",