detect if there are equivalent PDF and image-file versions, and choose whichever is higher-resolution
This commit is contained in:
parent
9cc51ace4a
commit
7e429f3160
|
@ -4,7 +4,7 @@ import argparse
|
|||
import asyncio
|
||||
import importlib_resources as resources
|
||||
from pathlib import Path
|
||||
from os import getenv
|
||||
import os
|
||||
from os.path import relpath, splitext
|
||||
import re
|
||||
import shutil
|
||||
|
@ -15,6 +15,7 @@ import zipfile
|
|||
|
||||
from dlsite_async import DlsiteAPI
|
||||
import fitz
|
||||
from PIL import Image
|
||||
from jinja2 import Environment, PackageLoader, select_autoescape
|
||||
import requests
|
||||
|
||||
|
@ -325,12 +326,25 @@ def link_ordered_files(ordering, dest, start_index):
|
|||
link_path = dest / f'{idx:04d}{ext}'
|
||||
link_path.symlink_to(relpath(src_path, dest))
|
||||
|
||||
def ignoreable(path):
    """Return True if `path` should be skipped when listing files.

    A path is ignoreable when its file name appears in IGNOREABLE_FILES or
    its lowercased suffix appears in IGNOREABLE_EXTENSIONS.
    """
    if path.name in IGNOREABLE_FILES:
        return True
    # Suffix comparison is case-insensitive: the suffix is lowercased first.
    return path.suffix.lower() in IGNOREABLE_EXTENSIONS
|
||||
|
||||
def ls_ignore(directory):
    """Return the entries directly inside `directory` that are not ignoreable.

    The result is a list of the paths yielded by `directory.iterdir()`, in
    iteration order, minus every entry for which `ignoreable()` is true.
    """
    # The ignore rule lives in ignoreable() only; the filter is applied once
    # rather than restating the name/suffix test inline as well.
    return [path for path in directory.iterdir() if not ignoreable(path)]
|
||||
|
||||
def descendant_files_ignore(directory):
    """Return every non-directory entry found beneath `directory`.

    Each level is listed with `ls_ignore()`, so entries it filters out are
    neither returned nor (for directories) descended into. Results are in
    depth-first order, following the order `ls_ignore()` returns entries.
    """
    collected = []
    for entry in ls_ignore(directory):
        if not entry.is_dir():
            collected.append(entry)
            continue
        # Subdirectory: splice in its files at this position.
        collected += descendant_files_ignore(entry)
    return collected
|
||||
|
||||
def collate(args):
|
||||
con = sqlite3.connect(args.destdir / 'meta.db')
|
||||
cur = con.cursor()
|
||||
|
@ -392,6 +406,29 @@ def collate_regex_later(srcs, dest, regex, start_index):
|
|||
return None
|
||||
return nonmatching_pages + matching_pages
|
||||
|
||||
def standalone_image_size(filepath):
    """Open the image file at `filepath` and return its `size` attribute.

    The file is opened with `Image.open()` and closed again before the
    function returns.
    """
    with Image.open(filepath) as image:
        dimensions = image.size
    return dimensions
|
||||
|
||||
def pdf_image_sizes(filepath):
    """Return a list of (width, height) pairs for the images in a PDF.

    Every page's `get_images()` entries are scanned; an image is identified
    by its xref, so an image that shows up on several pages is counted once,
    using the dimensions from its first occurrence.
    """
    dimensions_for = {}
    with fitz.open(filepath) as document:
        for page in document:
            for image_entry in page.get_images():
                xref, _second, width, height, *_remaining = image_entry
                # setdefault leaves an already-recorded xref untouched.
                dimensions_for.setdefault(xref, (width, height))
    return [*dimensions_for.values()]
|
||||
|
||||
def median(items):
    """Return the median element of `items`, or None if `items` is empty.

    Elements are ordered by their natural comparison (so tuples compare
    lexicographically). For an even number of elements the upper of the two
    middle elements is returned; nothing is averaged, so the result is always
    one of the input elements.

    The input is left unmodified, and any iterable of comparable elements is
    accepted, not only lists.
    """
    # sorted() builds a new list, so the caller's sequence keeps its order.
    ordered = sorted(items)
    if not ordered:
        return None
    return ordered[len(ordered) // 2]
|
||||
|
||||
def collate_from_paths(srcs, dest, start_index):
|
||||
if len(srcs) == 1 and srcs[0].is_dir():
|
||||
return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
|
||||
|
@ -420,6 +457,34 @@ def collate_from_paths(srcs, dest, start_index):
|
|||
else:
|
||||
return None
|
||||
|
||||
pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf']
|
||||
if len(pdfs) == 1:
|
||||
pdf = pdfs[0]
|
||||
images = []
|
||||
non_images = []
|
||||
descendant_files = [
|
||||
src for src in srcs if src != pdf and src.is_file()
|
||||
] + [
|
||||
f for src in srcs if src.is_dir() for f in descendant_files_ignore(src)
|
||||
]
|
||||
for f in descendant_files:
|
||||
if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
|
||||
images.append(f)
|
||||
else:
|
||||
non_images.append(f)
|
||||
break
|
||||
if len(non_images) == 0 and len(images) > 0:
|
||||
pdf_sizes = pdf_image_sizes(pdf)
|
||||
standalone_sizes = [standalone_image_size(f) for f in images]
|
||||
if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2:
|
||||
median_pdf_size = median(pdf_sizes)
|
||||
median_standalone_size = median(standalone_sizes)
|
||||
if median_pdf_size and median_standalone_size:
|
||||
if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]:
|
||||
return collate_from_paths([src for src in srcs if src != pdf], dest, start_index)
|
||||
if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]:
|
||||
return collate_from_paths([pdf], dest, start_index)
|
||||
|
||||
return None
|
||||
|
||||
def self_and_parents(path):
|
||||
|
@ -641,7 +706,7 @@ argparser = argparse.ArgumentParser(
|
|||
# Global option: where the library content and metadata live. The default is
# taken from $DLIBRARY_DIR when set, otherwise ./dlibrary.
argparser.add_argument(
    '-d', '--destdir',
    type=Path,
    # `default` is passed exactly once; repeating a keyword argument in a
    # call is a SyntaxError.
    default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')),
    help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)',
)
# A subcommand is mandatory (required=True).
subparsers = argparser.add_subparsers(title="subcommands", required=True)
|
||||
|
@ -665,7 +730,7 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail
|
|||
parser_fetch.add_argument(
|
||||
'-l', '--locale',
|
||||
type=str,
|
||||
default=getenv('DLIBRARY_LOCALE', 'en_US'),
|
||||
default=os.getenv('DLIBRARY_LOCALE', 'en_US'),
|
||||
help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). '
|
||||
'May still fall back to Japanese if metadata in other languages is unavailable. '
|
||||
'(default: $DLIBRARY_LOCALE or en_US)'),
|
||||
|
|
|
@ -51,6 +51,7 @@
|
|||
pyproject = true;
|
||||
propagatedBuildInputs = [
|
||||
pymupdf
|
||||
pillow
|
||||
requests
|
||||
dlsite-async
|
||||
jinja2
|
||||
|
|
|
@ -7,6 +7,7 @@ authors = [{name = "xenofem"}]
|
|||
dependencies = [
|
||||
"requests",
|
||||
"PyMuPDF",
|
||||
"pillow",
|
||||
"dlsite-async",
|
||||
"jinja2",
|
||||
"importlib_resources",
|
||||
|
|
Loading…
Reference in a new issue