add subcommand to show page counts and image sizes of an extracted work's files

ignore the mp4 files that fanza throws in sometimes
detect if there are equivalent PDF and image-file versions, and choose whichever is higher-resolution
2024-02-06 23:52:59 -05:00 · 2024-02-06 23:02:59 -05:00 · 2024-02-06 23:01:59 -05:00
3 changed files with 98 additions and 5 deletions
--- a/dlibrary/dlibrary.py
+++ b/dlibrary/dlibrary.py
@@ -4,7 +4,7 @@ import argparse
 import asyncio
 import importlib_resources as resources
 from pathlib import Path
-from os import getenv
+import os
 from os.path import relpath, splitext
 import re
 import shutil
@@ -15,6 +15,7 @@ import zipfile

 from dlsite_async import DlsiteAPI
 import fitz
+from PIL import Image
 from jinja2 import Environment, PackageLoader, select_autoescape
 import requests

@@ -43,7 +44,7 @@ ALT_VERSIONS = [
 IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff']

 IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
-IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd']
+IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']

 def open_zipfile_with_encoding(path):
    try:
@@ -325,12 +326,25 @@ def link_ordered_files(ordering, dest, start_index):
        link_path = dest / f'{idx:04d}{ext}'
        link_path.symlink_to(relpath(src_path, dest))

+def ignoreable(path):
+    return path.name in IGNOREABLE_FILES or path.suffix.lower() in IGNOREABLE_EXTENSIONS
+
 def ls_ignore(directory):
    return [
        path for path in directory.iterdir()
-        if path.name not in IGNOREABLE_FILES and path.suffix.lower() not in IGNOREABLE_EXTENSIONS
+        if not ignoreable(path)
    ]

+def descendant_files_ignore(directory):
+    result = []
+    for item in ls_ignore(directory):
+        if item.is_dir():
+            result.extend(descendant_files_ignore(item))
+        else:
+            result.append(item)
+
+    return result
+
 def collate(args):
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()
@@ -392,6 +406,29 @@ def collate_regex_later(srcs, dest, regex, start_index):
        return None
    return nonmatching_pages + matching_pages

+def standalone_image_size(filepath):
+    with Image.open(filepath) as im:
+        return im.size
+
+def pdf_image_sizes(filepath):
+    sizes_by_xref = {}
+
+    with fitz.open(filepath) as pdf:
+        for page in pdf:
+            for (xref, _, width, height, *_) in page.get_images():
+                if xref in sizes_by_xref:
+                    continue
+                sizes_by_xref[xref] = (width, height)
+
+    return list(sizes_by_xref.values())
+
+def median(items):
+    if len(items) == 0:
+        return None
+
+    items.sort()
+    return items[len(items) // 2]
+
 def collate_from_paths(srcs, dest, start_index):
    if len(srcs) == 1 and srcs[0].is_dir():
        return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
@@ -420,6 +457,34 @@ def collate_from_paths(srcs, dest, start_index):
        else:
            return None

+    pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf']
+    if len(pdfs) == 1:
+        pdf = pdfs[0]
+        images = []
+        non_images = []
+        descendant_files = [
+            src for src in srcs if src != pdf and src.is_file()
+        ] + [
+            f for src in srcs if src.is_dir() for f in descendant_files_ignore(src)
+        ]
+        for f in descendant_files:
+            if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
+                images.append(f)
+            else:
+                non_images.append(f)
+                break
+        if len(non_images) == 0 and len(images) > 0:
+            pdf_sizes = pdf_image_sizes(pdf)
+            standalone_sizes = [standalone_image_size(f) for f in images]
+            if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2:
+                median_pdf_size = median(pdf_sizes)
+                median_standalone_size = median(standalone_sizes)
+                if median_pdf_size and median_standalone_size:
+                    if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]:
+                        return collate_from_paths([src for src in srcs if src != pdf], dest, start_index)
+                    if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]:
+                        return collate_from_paths([pdf], dest, start_index)
+
    return None

 def self_and_parents(path):
@@ -462,6 +527,28 @@ def manual_collate(args):
            print(f'Unknown file type {path}, stopping')
            return

+def fmt_size(s):
+    return f'{s[0]}x{s[1]}px'
+
+def analyze(args):
+    extract_dir = args.destdir / 'extract'
+    files = descendant_files_ignore(extract_dir / args.work_id)
+    files.sort()
+
+    for f in files:
+        print(f'{relpath(f, extract_dir)}', end='')
+        if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
+            size = standalone_image_size(f)
+            print(f'\t{fmt_size(size)}')
+        elif f.suffix.lower() == '.pdf':
+            sizes = pdf_image_sizes(f)
+            if len(sizes) == 0:
+                print(f'\tContains no images')
+            else:
+                print(f'\t{len(sizes)} images, median {fmt_size(median(sizes))}, min {fmt_size(min(sizes))}, max {fmt_size(max(sizes))}')
+        else:
+            print()
+
 def metadata(args):
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()
@@ -641,7 +728,7 @@ argparser = argparse.ArgumentParser(
 argparser.add_argument(
    '-d', '--destdir',
    type=Path,
-    default=Path(getenv('DLIBRARY_DIR', './dlibrary')),
+    default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')),
    help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)',
 )
 subparsers = argparser.add_subparsers(title="subcommands", required=True)
@@ -665,7 +752,7 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail
 parser_fetch.add_argument(
    '-l', '--locale',
    type=str,
-    default=getenv('DLIBRARY_LOCALE', 'en_US'),
+    default=os.getenv('DLIBRARY_LOCALE', 'en_US'),
    help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). '
          'May still fall back to Japanese if metadata in other languages is unavailable. '
          '(default: $DLIBRARY_LOCALE or en_US)'),
@@ -748,6 +835,10 @@ parser_manual_collate.add_argument(
 )
 parser_manual_collate.set_defaults(func=manual_collate)

+parser_analyze = subparsers.add_parser('analyze', help='analyze an extracted folder to assist in collation')
+parser_analyze.add_argument('work_id')
+parser_analyze.set_defaults(func=analyze)
+
 parser_metadata = subparsers.add_parser('metadata', help='view or modify metadata for a work')
 parser_metadata.add_argument('work_id')
 parser_metadata.add_argument(
--- a/flake.nix
+++ b/flake.nix
@@ -51,6 +51,7 @@
        pyproject = true;
        propagatedBuildInputs = [
          pymupdf
+          pillow
          requests
          dlsite-async
          jinja2
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ authors = [{name = "xenofem"}]
 dependencies = [
    "requests",
    "PyMuPDF",
+    "pillow",
    "dlsite-async",
    "jinja2",
    "importlib_resources",
Author	SHA1	Message	Date
xenofem	c2f516a281	add subcommand to show page counts and image sizes of an extracted work's files	2024-02-06 23:52:59 -05:00
xenofem	cdf06d9aa0	ignore the mp4 files that fanza throws in sometimes	2024-02-06 23:02:59 -05:00
xenofem	7e429f3160	detect if there are equivalent PDF and image-file versions, and choose whichever is higher-resolution	2024-02-06 23:01:59 -05:00