Compare commits

..

3 commits

3 changed files with 98 additions and 5 deletions

View file

@ -4,7 +4,7 @@ import argparse
import asyncio
import importlib_resources as resources
from pathlib import Path
from os import getenv
import os
from os.path import relpath, splitext
import re
import shutil
@ -15,6 +15,7 @@ import zipfile
from dlsite_async import DlsiteAPI
import fitz
from PIL import Image
from jinja2 import Environment, PackageLoader, select_autoescape
import requests
@ -43,7 +44,7 @@ ALT_VERSIONS = [
# Lowercase suffixes (with leading dot) recognized as standalone image files.
IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff']
# Entry names (compared against path.name) that are skipped when listing.
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
# Lowercase suffixes (with leading dot) that are skipped when listing.
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
def open_zipfile_with_encoding(path):
try:
@ -325,12 +326,25 @@ def link_ordered_files(ordering, dest, start_index):
link_path = dest / f'{idx:04d}{ext}'
link_path.symlink_to(relpath(src_path, dest))
def ignoreable(path):
    """Return True if *path* should be skipped when listing a directory.

    A path is skipped when its name is listed in IGNOREABLE_FILES or when
    its lowercased suffix is listed in IGNOREABLE_EXTENSIONS.
    """
    if path.name in IGNOREABLE_FILES:
        return True
    return path.suffix.lower() in IGNOREABLE_EXTENSIONS
def ls_ignore(directory):
    """Return the direct children of *directory* that are not ignoreable.

    Filtering is delegated entirely to ignoreable(), so the name/suffix
    rules live in one place instead of being repeated inline here.

    Entries are returned in whatever order directory.iterdir() yields them.
    """
    return [path for path in directory.iterdir() if not ignoreable(path)]
def descendant_files_ignore(directory):
    """Return every non-directory entry found beneath *directory*.

    Each level is listed with ls_ignore(), so ignoreable entries (and
    everything inside ignoreable directories) are left out. Subdirectories
    are expanded recursively, in place, so results keep listing order.
    """
    collected = []
    for entry in ls_ignore(directory):
        if not entry.is_dir():
            collected.append(entry)
            continue
        # Directory: splice in its descendants where the directory appeared.
        collected.extend(descendant_files_ignore(entry))
    return collected
def collate(args):
con = sqlite3.connect(args.destdir / 'meta.db')
cur = con.cursor()
@ -392,6 +406,29 @@ def collate_regex_later(srcs, dest, regex, start_index):
return None
return nonmatching_pages + matching_pages
def standalone_image_size(filepath):
    """Open the image at *filepath* and return its `size` attribute.

    The image is opened in a context manager so it is closed before
    returning.
    """
    with Image.open(filepath) as image:
        dimensions = image.size
    return dimensions
def pdf_image_sizes(filepath):
    """Return a list of (width, height) pairs for the images in a PDF.

    Every page's get_images() listing is scanned. Each image is counted
    once per xref: the first (width, height) seen for an xref is kept and
    later entries with the same xref are ignored.
    """
    first_seen = {}
    with fitz.open(filepath) as document:
        for page in document:
            for image_info in page.get_images():
                # Only the 1st, 3rd and 4th fields are used; the rest are skipped.
                xref, _, width, height, *_ = image_info
                first_seen.setdefault(xref, (width, height))
    return list(first_seen.values())
def median(items):
    """Return the middle element of *items* in sorted order.

    Returns None when *items* is empty. For an even number of items the
    upper of the two middle elements is returned (no averaging), so the
    result is always one of the input elements.

    The input is sorted into a new list, so the caller's sequence is left
    untouched and any sized sequence (list, tuple, ...) is accepted.
    """
    if len(items) == 0:
        return None
    ordered = sorted(items)
    return ordered[len(ordered) // 2]
def collate_from_paths(srcs, dest, start_index):
if len(srcs) == 1 and srcs[0].is_dir():
return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
@ -420,6 +457,34 @@ def collate_from_paths(srcs, dest, start_index):
else:
return None
pdfs = [src for src in srcs if src.is_file() and src.suffix.lower() == '.pdf']
if len(pdfs) == 1:
pdf = pdfs[0]
images = []
non_images = []
descendant_files = [
src for src in srcs if src != pdf and src.is_file()
] + [
f for src in srcs if src.is_dir() for f in descendant_files_ignore(src)
]
for f in descendant_files:
if f.suffix.lower() in IMAGE_FILE_EXTENSIONS:
images.append(f)
else:
non_images.append(f)
break
if len(non_images) == 0 and len(images) > 0:
pdf_sizes = pdf_image_sizes(pdf)
standalone_sizes = [standalone_image_size(f) for f in images]
if abs(len(pdf_sizes) - len(standalone_sizes)) <= 2:
median_pdf_size = median(pdf_sizes)
median_standalone_size = median(standalone_sizes)
if median_pdf_size and median_standalone_size:
if median_standalone_size[0] >= median_pdf_size[0] and median_standalone_size[1] >= median_pdf_size[1]:
return collate_from_paths([src for src in srcs if src != pdf], dest, start_index)
if median_pdf_size[0] >= median_standalone_size[0] and median_pdf_size[1] >= median_standalone_size[1]:
return collate_from_paths([pdf], dest, start_index)
return None
def self_and_parents(path):
@ -462,6 +527,28 @@ def manual_collate(args):
print(f'Unknown file type {path}, stopping')
return
def fmt_size(s):
    """Format a size as '<first>x<second>px' from the first two items of *s*."""
    width = s[0]
    height = s[1]
    return '{}x{}px'.format(width, height)
def analyze(args):
    """Print one line per file found under the extract folder of a work.

    Files under `args.destdir / 'extract' / args.work_id` are gathered with
    descendant_files_ignore(), sorted, and printed as paths relative to the
    extract directory. Depending on the file's lowercased suffix the line
    is followed by:

    * image files: a tab and the image size;
    * '.pdf' files: a tab and either 'Contains no images' or the image
      count with median/min/max sizes;
    * anything else: nothing further.
    """
    extract_dir = args.destdir / 'extract'
    work_files = sorted(descendant_files_ignore(extract_dir / args.work_id))
    for path in work_files:
        # The path is printed first without a newline; each branch below
        # finishes the line.
        print(f'{relpath(path, extract_dir)}', end='')
        extension = path.suffix.lower()

        if extension in IMAGE_FILE_EXTENSIONS:
            print(f'\t{fmt_size(standalone_image_size(path))}')
            continue

        if extension != '.pdf':
            print()
            continue

        sizes = pdf_image_sizes(path)
        if len(sizes) == 0:
            print('\tContains no images')
            continue
        print(f'\t{len(sizes)} images, median {fmt_size(median(sizes))}, min {fmt_size(min(sizes))}, max {fmt_size(max(sizes))}')
def metadata(args):
con = sqlite3.connect(args.destdir / 'meta.db')
cur = con.cursor()
@ -641,7 +728,7 @@ argparser = argparse.ArgumentParser(
argparser.add_argument(
'-d', '--destdir',
type=Path,
default=Path(getenv('DLIBRARY_DIR', './dlibrary')),
default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')),
help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)',
)
subparsers = argparser.add_subparsers(title="subcommands", required=True)
@ -665,7 +752,7 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail
parser_fetch.add_argument(
'-l', '--locale',
type=str,
default=getenv('DLIBRARY_LOCALE', 'en_US'),
default=os.getenv('DLIBRARY_LOCALE', 'en_US'),
help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). '
'May still fall back to Japanese if metadata in other languages is unavailable. '
'(default: $DLIBRARY_LOCALE or en_US)'),
@ -748,6 +835,10 @@ parser_manual_collate.add_argument(
)
parser_manual_collate.set_defaults(func=manual_collate)
# 'analyze' subcommand: takes one positional argument, work_id, and
# registers analyze() as the parser's `func` default.
parser_analyze = subparsers.add_parser('analyze', help='analyze an extracted folder to assist in collation')
parser_analyze.add_argument('work_id')
parser_analyze.set_defaults(func=analyze)
parser_metadata = subparsers.add_parser('metadata', help='view or modify metadata for a work')
parser_metadata.add_argument('work_id')
parser_metadata.add_argument(

View file

@ -51,6 +51,7 @@
pyproject = true;
propagatedBuildInputs = [
pymupdf
pillow
requests
dlsite-async
jinja2

View file

@ -7,6 +7,7 @@ authors = [{name = "xenofem"}]
dependencies = [
"requests",
"PyMuPDF",
"pillow",
"dlsite-async",
"jinja2",
"importlib_resources",