From 7e429f3160e1b092907c98027279b6c9cd3087bc Mon Sep 17 00:00:00 2001 From: xenofem Date: Tue, 6 Feb 2024 23:01:59 -0500 Subject: [PATCH] detect if there are equivalent PDF and image-file versions, and choose whichever is higher-resolution --- dlibrary/dlibrary.py | 73 +++++++++++++++++++++++++++++++++++++++++--- flake.nix | 1 + pyproject.toml | 1 + 3 files changed, 71 insertions(+), 4 deletions(-) diff --git a/dlibrary/dlibrary.py b/dlibrary/dlibrary.py index 5e2ddb8..2ba3b42 100755 --- a/dlibrary/dlibrary.py +++ b/dlibrary/dlibrary.py @@ -4,7 +4,7 @@ import argparse import asyncio import importlib_resources as resources from pathlib import Path -from os import getenv +import os from os.path import relpath, splitext import re import shutil @@ -15,6 +15,7 @@ import zipfile from dlsite_async import DlsiteAPI import fitz +from PIL import Image from jinja2 import Environment, PackageLoader, select_autoescape import requests @@ -325,12 +326,25 @@ def link_ordered_files(ordering, dest, start_index): link_path = dest / f'{idx:04d}{ext}' link_path.symlink_to(relpath(src_path, dest)) +def ignoreable(path): + return path.name in IGNOREABLE_FILES or path.suffix.lower() in IGNOREABLE_EXTENSIONS + def ls_ignore(directory): return [ path for path in directory.iterdir() - if path.name not in IGNOREABLE_FILES and path.suffix.lower() not in IGNOREABLE_EXTENSIONS + if not ignoreable(path) ] +def descendant_files_ignore(directory): + result = [] + for item in ls_ignore(directory): + if item.is_dir(): + result.extend(descendant_files_ignore(item)) + else: + result.append(item) + + return result + def collate(args): con = sqlite3.connect(args.destdir / 'meta.db') cur = con.cursor() @@ -392,6 +406,29 @@ def collate_regex_later(srcs, dest, regex, start_index): return None return nonmatching_pages + matching_pages +def standalone_image_size(filepath): + with Image.open(filepath) as im: + return im.size + +def pdf_image_sizes(filepath): + sizes_by_xref = {} + + with fitz.open(filepath) as pdf: + for page in pdf: + for (xref, _, width, height, *_) in page.get_images(): + if xref in sizes_by_xref: + continue + sizes_by_xref[xref] = (width, height) + + return list(sizes_by_xref.values()) + +def median(items): + if len(items) == 0: + return None + + items.sort() + return items[len(items) // 2] + def collate_from_paths(srcs, dest, start_index): if len(srcs) == 1 and srcs[0].is_dir(): return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) @@ -420,6 +457,34 @@ def collate_from_paths(srcs, dest, start_index): else: return None + pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf'] + if len(pdfs) == 1: + pdf = pdfs[0] + images = [] + non_images = [] + descendant_files = [ + src for src in srcs if src != pdf and src.is_file() + ] + [ + f for src in srcs if src.is_dir() for f in descendant_files_ignore(src) + ] + for f in descendant_files: + if f.suffix.lower() in IMAGE_FILE_EXTENSIONS: + images.append(f) + else: + non_images.append(f) + break + if len(non_images) == 0 and len(images) > 0: + pdf_sizes = pdf_image_sizes(pdf) + standalone_sizes = [standalone_image_size(f) for f in images] + if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2: + median_pdf_size = median(pdf_sizes) + median_standalone_size = median(standalone_sizes) + if median_pdf_size and median_standalone_size: + if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]: + return collate_from_paths([src for src in srcs if src != pdf], dest, start_index) + if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]: + return collate_from_paths([pdf], dest, start_index) + return None def self_and_parents(path): @@ -641,7 +706,7 @@ argparser = argparse.ArgumentParser( argparser.add_argument( '-d', '--destdir', type=Path, - default=Path(getenv('DLIBRARY_DIR', './dlibrary')), + default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')), help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)', ) subparsers = argparser.add_subparsers(title="subcommands", required=True) @@ -665,7 +730,7 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail parser_fetch.add_argument( '-l', '--locale', type=str, - default=getenv('DLIBRARY_LOCALE', 'en_US'), + default=os.getenv('DLIBRARY_LOCALE', 'en_US'), help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). ' 'May still fall back to Japanese if metadata in other languages is unavailable. ' '(default: $DLIBRARY_LOCALE or en_US)'), diff --git a/flake.nix b/flake.nix index 63d5f48..4ae830b 100644 --- a/flake.nix +++ b/flake.nix @@ -51,6 +51,7 @@ pyproject = true; propagatedBuildInputs = [ pymupdf + pillow requests dlsite-async jinja2 diff --git a/pyproject.toml b/pyproject.toml index 58df9bd..7ecfcfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ authors = [{name = "xenofem"}] dependencies = [ "requests", "PyMuPDF", + "pillow", "dlsite-async", "jinja2", "importlib_resources",