detect if there are equivalent PDF and image-file versions, and choose whichever is higher-resolution

This commit is contained in:
xenofem 2024-02-06 23:01:59 -05:00
parent 9cc51ace4a
commit 7e429f3160
3 changed files with 71 additions and 4 deletions

View file

@ -4,7 +4,7 @@ import argparse
import asyncio import asyncio
import importlib_resources as resources import importlib_resources as resources
from pathlib import Path from pathlib import Path
from os import getenv import os
from os.path import relpath, splitext from os.path import relpath, splitext
import re import re
import shutil import shutil
@ -15,6 +15,7 @@ import zipfile
from dlsite_async import DlsiteAPI from dlsite_async import DlsiteAPI
import fitz import fitz
from PIL import Image
from jinja2 import Environment, PackageLoader, select_autoescape from jinja2 import Environment, PackageLoader, select_autoescape
import requests import requests
@ -325,12 +326,25 @@ def link_ordered_files(ordering, dest, start_index):
link_path = dest / f'{idx:04d}{ext}' link_path = dest / f'{idx:04d}{ext}'
link_path.symlink_to(relpath(src_path, dest)) link_path.symlink_to(relpath(src_path, dest))
def ignoreable(path):
    """Return True when `path` should be skipped.

    A path is skipped if its file name is listed in IGNOREABLE_FILES or its
    lowercased suffix is listed in IGNOREABLE_EXTENSIONS.
    """
    if path.name in IGNOREABLE_FILES:
        return True
    return path.suffix.lower() in IGNOREABLE_EXTENSIONS
def ls_ignore(directory):
    """List the immediate children of `directory`, omitting ignoreable ones.

    Entries are returned in the order `directory.iterdir()` yields them.
    """
    kept = []
    for entry in directory.iterdir():
        if ignoreable(entry):
            continue
        kept.append(entry)
    return kept
def descendant_files_ignore(directory):
    """Collect every non-directory entry beneath `directory`, recursively.

    Each level is listed with `ls_ignore`, so ignoreable entries (and
    anything inside an ignoreable directory) are left out. A subdirectory's
    contents appear at the position the subdirectory itself was listed.
    """
    collected = []
    for entry in ls_ignore(directory):
        if not entry.is_dir():
            collected.append(entry)
            continue
        # Splice the subdirectory's files in place of the subdirectory.
        collected += descendant_files_ignore(entry)
    return collected
def collate(args): def collate(args):
con = sqlite3.connect(args.destdir / 'meta.db') con = sqlite3.connect(args.destdir / 'meta.db')
cur = con.cursor() cur = con.cursor()
@ -392,6 +406,29 @@ def collate_regex_later(srcs, dest, regex, start_index):
return None return None
return nonmatching_pages + matching_pages return nonmatching_pages + matching_pages
def standalone_image_size(filepath):
    """Return the `size` attribute of the image file at `filepath`.

    The file is opened with `Image.open` and closed again before returning.
    """
    with Image.open(filepath) as image:
        dimensions = image.size
    return dimensions
def pdf_image_sizes(filepath):
    """Return a list of (width, height) pairs for the images in a PDF.

    Every page's `get_images()` entries are visited; an image is identified
    by its xref, and only the first occurrence of each xref is recorded, so
    an image listed on several pages contributes a single size.
    """
    first_seen = {}
    with fitz.open(filepath) as document:
        for page in document:
            for image_info in page.get_images():
                # Entry layout used here: (xref, <unused>, width, height, ...)
                xref, _unused, width, height, *_rest = image_info
                # setdefault keeps the size from the first time an xref is seen.
                first_seen.setdefault(xref, (width, height))
    return list(first_seen.values())
def median(items):
    """Return the middle element of `items` in sorted order.

    For an even number of elements the upper of the two middle elements is
    returned (index `len // 2`); no averaging is done, so the result is
    always one of the inputs. Elements only need to be mutually orderable,
    e.g. (width, height) tuples, which compare lexicographically.

    Returns None when `items` is empty.

    The argument is never modified: a sorted copy is used, so callers can
    keep relying on the original order of the list they passed in. Any
    iterable is accepted, not just lists.
    """
    ordered = sorted(items)
    if not ordered:
        return None
    return ordered[len(ordered) // 2]
def collate_from_paths(srcs, dest, start_index): def collate_from_paths(srcs, dest, start_index):
if len(srcs) == 1 and srcs[0].is_dir(): if len(srcs) == 1 and srcs[0].is_dir():
return collate_from_paths(ls_ignore(srcs[0]), dest, start_index) return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
@ -420,6 +457,34 @@ def collate_from_paths(srcs, dest, start_index):
else: else:
return None return None
pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf']
if len(pdfs) == 1:
pdf = pdfs[0]
images = []
non_images = []
descendant_files = [
src for src in srcs if src != pdf and src.is_file()
] + [
f for src in srcs if src.is_dir() for f in descendant_files_ignore(src)
]
for f in descendant_files:
if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
images.append(f)
else:
non_images.append(f)
break
if len(non_images) == 0 and len(images) > 0:
pdf_sizes = pdf_image_sizes(pdf)
standalone_sizes = [standalone_image_size(f) for f in images]
if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2:
median_pdf_size = median(pdf_sizes)
median_standalone_size = median(standalone_sizes)
if median_pdf_size and median_standalone_size:
if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]:
return collate_from_paths([src for src in srcs if src != pdf], dest, start_index)
if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]:
return collate_from_paths([pdf], dest, start_index)
return None return None
def self_and_parents(path): def self_and_parents(path):
@ -641,7 +706,7 @@ argparser = argparse.ArgumentParser(
argparser.add_argument( argparser.add_argument(
'-d', '--destdir', '-d', '--destdir',
type=Path, type=Path,
default=Path(getenv('DLIBRARY_DIR', './dlibrary')), default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')),
help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)', help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)',
) )
subparsers = argparser.add_subparsers(title="subcommands", required=True) subparsers = argparser.add_subparsers(title="subcommands", required=True)
@ -665,7 +730,7 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail
parser_fetch.add_argument( parser_fetch.add_argument(
'-l', '--locale', '-l', '--locale',
type=str, type=str,
default=getenv('DLIBRARY_LOCALE', 'en_US'), default=os.getenv('DLIBRARY_LOCALE', 'en_US'),
help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). ' help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). '
'May still fall back to Japanese if metadata in other languages is unavailable. ' 'May still fall back to Japanese if metadata in other languages is unavailable. '
'(default: $DLIBRARY_LOCALE or en_US)'), '(default: $DLIBRARY_LOCALE or en_US)'),

View file

@ -51,6 +51,7 @@
pyproject = true; pyproject = true;
propagatedBuildInputs = [ propagatedBuildInputs = [
pymupdf pymupdf
pillow
requests requests
dlsite-async dlsite-async
jinja2 jinja2

View file

@ -7,6 +7,7 @@ authors = [{name = "xenofem"}]
dependencies = [ dependencies = [
"requests", "requests",
"PyMuPDF", "PyMuPDF",
"pillow",
"dlsite-async", "dlsite-async",
"jinja2", "jinja2",
"importlib_resources", "importlib_resources",