2024-01-21 05:13:09 -05:00
#!/usr/bin/env python3
2024-01-22 02:16:06 -05:00
import argparse
2024-01-21 05:13:09 -05:00
import asyncio
2024-01-22 22:06:04 -05:00
import importlib_resources as resources
2024-01-22 02:16:06 -05:00
from pathlib import Path
2024-02-06 23:01:59 -05:00
import os
2024-01-22 02:16:06 -05:00
from os . path import relpath , splitext
2024-01-21 17:45:56 -05:00
import re
2024-02-07 00:24:30 -05:00
import readline
2024-01-22 07:01:41 -05:00
import shutil
2024-01-21 05:13:09 -05:00
import sqlite3
2024-01-23 15:54:17 -05:00
import textwrap
2024-01-21 17:45:56 -05:00
from urllib . parse import urlparse
2024-01-21 05:13:09 -05:00
import zipfile
from dlsite_async import DlsiteAPI
2024-01-22 02:16:06 -05:00
import fitz
2024-02-06 23:01:59 -05:00
from PIL import Image
2024-01-22 22:06:04 -05:00
from jinja2 import Environment , PackageLoader , select_autoescape
2024-01-21 17:45:56 -05:00
import requests
2024-01-21 05:13:09 -05:00
2024-02-08 04:54:36 -05:00
# Matches runs of ASCII or fullwidth digits; used to find numeric page
# prefixes when deducing page order.
NUMBER_REGEX = re.compile('[0-9０-９]+')

# Work-ID formats for the storefronts we recognize.
DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$')
FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
FAKKU_ID_REGEX = re.compile('.*_FAKKU$')

# Filename keywords used to recognize and order alternate page sets
# during collation (textless pages, epilogues, hi-res dirs, covers).
TEXTLESS_REGEX = re.compile('(台詞|セリフ|せりふ|テキスト|文字)((な|無)し|抜き)|notext|textless', re.IGNORECASE)
EPILOGUE_REGEX = re.compile('after|後日談', re.IGNORECASE)
HI_RES_REGEX = re.compile('高解像度', re.IGNORECASE)
FRONT_COVER_REGEX = re.compile('(^|[^裏])表紙|cover|hyoushi', re.IGNORECASE)
BACK_COVER_REGEX = re.compile('裏表紙', re.IGNORECASE)

# Keywords marking "alternate version" page sets (skin tone, hair color,
# etc.); each entry maps to one bit of a version code so variants can be
# grouped and interleaved consistently.
ALT_VERSIONS = [
    '褐色',
    '日焼け',
    'pink',
    '金髪',
    '白肌',
    'うつろ目',
    'dark skin',
    'ラバー',
    'ゾンビ肌',
    'マスク',
    'アヘ顔',
]

# File extensions treated as page images.
IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']

# Files and extensions skipped entirely when walking extracted works.
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
def open_zipfile_with_encoding ( path ) :
try :
return zipfile . ZipFile ( path , metadata_encoding = " utf-8 " )
except UnicodeDecodeError :
pass
try :
return zipfile . ZipFile ( path , metadata_encoding = " shift-jis " )
except UnicodeDecodeError :
pass
return zipfile . ZipFile ( path , metadata_encoding = " shift-jisx0213 " )
2024-01-22 02:16:06 -05:00
def extract(args):
    """Unpack each zip in args.zipfiles into <destdir>/extract/<zip stem>/.

    Deletes the source archive afterwards when args.remove is set.
    """
    for zip_path in args.zipfiles:
        work_id = zip_path.stem
        target = args.destdir / 'extract' / work_id
        # Deliberately not exist_ok: refuses to re-extract over a work.
        target.mkdir(parents=True)

        print(f'Extracting {zip_path} to {target}')

        with open_zipfile_with_encoding(zip_path) as archive:
            archive.extractall(path=target)

        if args.remove:
            zip_path.unlink()
def manual_input_metadata(work_id):
    """Prompt interactively for a work's metadata.

    Returns a dict shaped like the DB row, plus 'authors' and 'tags' lists
    (which fetch_async pops off before inserting).
    """
    print(f"Don't know how to fetch metadata for {work_id}, input manually:")

    def _csv_list(prompt):
        # Split a comma-separated answer into stripped, non-empty items.
        return [part.strip() for part in input(prompt).split(',') if part.strip()]

    return {
        "id": work_id,
        "title": input('Title: '),
        "circle": input('Circle [None]: ') or None,
        "authors": _csv_list('Authors (comma-separated): '),
        "tags": _csv_list('Tags (comma-separated): '),
        "date": input('Pub date (yyyy-mm-dd): '),
        "description": input('Description: '),
        "series": input('Series [None]: ') or None,
    }
async def fetch_async(args):
    """Fetch metadata for every extracted work not yet present in meta.db.

    Creates the schema on first run. DLSite ids (RJ/BJ...) are fetched
    automatically; all other ids fall back to manual entry, with per-store
    thumbnail handling. Thumbnails are downloaded into site/thumbnails.
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    cur.execute("CREATE TABLE IF NOT EXISTS works(id TEXT PRIMARY KEY, title TEXT, circle TEXT, date TEXT, description TEXT, series TEXT, virtual INT)")
    cur.execute("CREATE TABLE IF NOT EXISTS authors(author TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id), PRIMARY KEY(author, work))")
    cur.execute("CREATE TABLE IF NOT EXISTS tags(tag TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id), PRIMARY KEY(tag, work))")

    thumbnails_dir = args.destdir / 'site' / 'thumbnails'
    thumbnails_dir.mkdir(parents=True, exist_ok=True)

    async with DlsiteAPI(locale=args.locale) as api:
        for work_path in (args.destdir / 'extract').iterdir():
            work_id = work_path.name

            # Skip works whose metadata was already stored by a previous run.
            res = cur.execute("SELECT id FROM works WHERE id = ?", (work_id,))
            if res.fetchone() is not None:
                continue

            if DLSITE_ID_REGEX.fullmatch(work_id):
                print(f'Fetching DLSite metadata for {work_id}')
                dlsite_metadata = await api.get_work(work_id)
                db_row = {
                    "id": work_id,
                    "title": dlsite_metadata.work_name,
                    "circle": dlsite_metadata.circle,
                    "date": dlsite_metadata.regist_date.date().isoformat(),
                    "description": dlsite_metadata.description,
                    "series": dlsite_metadata.series,
                }
                authors = dlsite_metadata.author or []
                tags = dlsite_metadata.genre or []
                thumbnail_url = dlsite_metadata.work_image
                if thumbnail_url.startswith('//'):
                    # DLSite returns protocol-relative image URLs.
                    thumbnail_url = 'https:' + thumbnail_url
            else:
                db_row = manual_input_metadata(work_id)
                # The works table doesn't hold these; they go in their own tables.
                authors = db_row.pop('authors')
                tags = db_row.pop('tags')
                if FANZA_ID_REGEX.fullmatch(work_id):
                    # FANZA cover images live at a predictable URL.
                    thumbnail_url = f'https://doujin-assets.dmm.co.jp/digital/comic/{work_id}/{work_id}pl.jpg'
                elif FAKKU_ID_REGEX.fullmatch(work_id):
                    # No predictable FAKKU cover URL; fall back to first page later.
                    thumbnail_url = None
                else:
                    thumbnail_url = input('Thumbnail image URL [default: first page]: ')

            cur.execute(
                "INSERT INTO works(id, title, circle, date, description, series) VALUES(:id, :title, :circle, :date, :description, :series)",
                db_row,
            )
            cur.executemany(
                "INSERT INTO authors VALUES(:author, :work)",
                [{"author": author, "work": work_id} for author in authors],
            )
            cur.executemany(
                "INSERT INTO tags VALUES(:tag, :work)",
                [{"tag": tag, "work": work_id} for tag in tags],
            )

            if thumbnail_url:
                ext = url_file_ext(thumbnail_url)
                dest_file = thumbnails_dir / (work_id + ext)
                print(f'Downloading thumbnail for {work_id} from {thumbnail_url}')
                with open(dest_file, 'wb') as fd:
                    with requests.get(thumbnail_url, stream=True) as r:
                        for chunk in r.iter_content(chunk_size=16384):
                            fd.write(chunk)

            # NOTE(review): committing per work so progress survives an
            # interrupted run — confirm commit placement against history.
            con.commit()

    con.close()
def url_file_ext(url):
    """Return the file extension (dot included) of a URL's path component."""
    path_part = urlparse(url).path
    return splitext(path_part)[1]
def fetch(args):
    """Synchronous CLI entry point wrapping fetch_async."""
    coroutine = fetch_async(args)
    asyncio.run(coroutine)
def image_xrefs(pdf):
    """Return one image xref per page of *pdf*, or None if any page differs.

    Fast path: page.get_images() (from the page's resource dicts). If any
    page doesn't list exactly one image, fall back to the much slower
    per-page rendering scan, which also rules out inline (xref 0) images.
    """
    images_by_page = [page.get_images() for page in pdf]
    if all(len(images) == 1 for images in images_by_page):
        # get_images() tuples start with the xref.
        return [images[0][0] for images in images_by_page]

    print("Checking PDF images the quick way failed, trying the slow way")
    xrefs = []
    for (idx, page) in enumerate(pdf):
        # \x1b[2K\r: clear the line and rewrite the progress counter in place.
        print(f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...', end='')
        images = page.get_image_info(xrefs=True)
        if len(images) != 1 or images[0]['xref'] == 0:
            print('\nFailed')
            return None
        xrefs.append(images[0]['xref'])
    print('\nSuccess')
    return xrefs
def link_pdf(src, dest, start_index):
    """Extract one image per page of PDF *src* into *dest* as numbered files.

    Files are named NNNN.<ext> starting at *start_index*. Returns the page
    count on success, or None when the PDF isn't a simple one-image-per-page
    scan (in which case nothing is written).
    """
    with fitz.open(src) as pdf:
        xrefs = image_xrefs(pdf)
        if xrefs is None:
            print(f'Support for weirder PDFs not yet implemented, skipping {src}')
            return None

        dest.mkdir(parents=True, exist_ok=True)
        for (idx, xref) in enumerate(xrefs, start=start_index):
            image = pdf.extract_image(xref)
            file_path = dest / f'{idx:04d}.{image["ext"]}'
            with open(file_path, 'wb') as f:
                f.write(image["image"])

        return pdf.page_count
def complete_prefix_number_ordering(entries):
    """Deduce a full page ordering for *entries* (paths), or None on failure.

    Entries are first partitioned by "alternate version" (a bitmask over
    ALT_VERSIONS keywords found in the filename), each partition is numbered
    hierarchically by its numeric prefixes, and the per-version numberings
    are interleaved so matching page indices stay adjacent.
    """
    if len(entries) == 1:
        return entries

    # Partition into alternate-version groups keyed by keyword bitmask.
    entries_by_version = {}
    for entry in entries:
        version_code = 0
        for (i, version) in enumerate(ALT_VERSIONS):
            if version in entry.name:
                version_code |= (1 << i)
        entries_by_version.setdefault(version_code, []).append(entry)

    numberings_by_version = {ver: unique_hierarchical_prefix_numbering(entries_by_version[ver]) for ver in entries_by_version}

    # Merge all index tuples; any version that couldn't be numbered sinks
    # the whole ordering.
    unified_indices = set()
    for numbering in numberings_by_version.values():
        if numbering is None:
            return None
        unified_indices |= set(numbering.keys())
    unified_indices.discard(None)
    unified_indices = list(unified_indices)
    unified_indices.sort()

    # Reject orderings with big jumps at any level — likely mis-detected
    # numbering rather than real page numbers.
    min_delta_by_level = {}
    if len(unified_indices) > 1:
        for i in range(1, len(unified_indices)):
            cur = unified_indices[i]
            prev = unified_indices[i - 1]
            for level in range(min(len(cur), len(prev))):
                if cur[level] != prev[level]:
                    delta = cur[level] - prev[level]
                    min_delta_by_level[level] = min(min_delta_by_level.get(level, delta), delta)
    if any(delta > 2 for delta in min_delta_by_level.values()):
        return None

    # None holds un-numbered singletons; they sort to the end of each pass.
    unified_indices.append(None)

    # Small versions ("inner") are interleaved page-by-page with the primary
    # version; near-complete versions ("outer") are appended as whole runs.
    versions = list(numberings_by_version.keys())
    versions.sort()
    version_lengths = {ver: len(numberings_by_version[ver]) for ver in numberings_by_version}
    inner_versions = []
    outer_versions = [versions[0]]
    for ver in versions[1:]:
        if version_lengths[ver] >= version_lengths[versions[0]] - 2:
            outer_versions.append(ver)
        else:
            inner_versions.append(ver)

    result = []
    for out_ver in outer_versions:
        for i in unified_indices:
            for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])):
                result += numberings_by_version[ver].get(i, [])
    return result
def unique_hierarchical_prefix_numbering(entries, start_point=0):
    """Map index tuples to entries based on numeric prefixes in filenames.

    Finds a number position (at or after *start_point*) shared by all
    entries and keys each entry by the number found there. Collisions
    recurse into deeper number positions (or an alphabetic suffix),
    producing multi-level tuples. Returns {index_tuple: [entry, ...]} or
    None when no consistent numbering exists. A single un-numbered entry
    maps to {None: entries}.
    """
    if len(entries) == 1 and not NUMBER_REGEX.search(entries[0].name):
        return {None: entries}

    # Try candidate number positions right-to-left in the longest name.
    longest_entry = max(entries, key=lambda e: len(e.name))
    matches = reversed(list(NUMBER_REGEX.finditer(longest_entry.name)))
    for m in matches:
        pos = m.start()
        if pos < start_point:
            # All remaining candidates would be before start_point too.
            return None
        prefix = longest_entry.name[:pos]
        # Every entry must share the prefix (or be a pure-prefix entry,
        # e.g. "page.jpg" alongside "page2.jpg" — it gets index 0).
        if all(e.name.startswith(prefix) or prefix.startswith(e.stem) for e in entries):
            numbering = {}
            for e in entries:
                if pos >= len(e.stem):
                    i = 0  # prefix-only entry: treat as page 0
                else:
                    n = NUMBER_REGEX.match(e.name[pos:])
                    if n is None:
                        return None
                    # int() accepts fullwidth digits too.
                    i = int(n.group())
                numbering.setdefault((i,), []).append(e)

            # Resolve collisions by recursing past this number.
            indices = list(numbering.keys())
            for idx in indices:
                if len(numbering[idx]) > 1:
                    ents_idx = numbering.pop(idx)
                    longest = max(ents_idx, key=lambda e: len(e.name))
                    next_layer_start = pos + NUMBER_REGEX.match(longest.name[pos:]).end()
                    sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
                    if not sub_numbering:
                        return None
                    for sub_idx in sub_numbering:
                        numbering[(*idx, *sub_idx)] = sub_numbering[sub_idx]
            return numbering

    return None
def alphabetic_numbering(entries, start_point):
    """Number entries whose stems end in '', 'a', 'b', ... after start_point.

    Returns {(0,): [entry], (1,): [entry], ...} keyed so '' maps to 0 and
    'a' to 1, or None when the suffixes are longer than one character,
    collide, or don't form an unbroken run starting at the empty suffix.
    """
    by_index = {}
    for item in entries:
        suffix = item.stem[start_point:]
        if len(suffix) > 1:
            # More than one trailing character — not a simple alphabetic tag.
            return None
        key = (0,) if suffix == '' else (ord(suffix.lower()) - ord('a') + 1,)
        if key in by_index:
            return None  # duplicate suffix
        by_index[key] = [item]

    expected = [(i,) for i in range(len(by_index))]
    if sorted(by_index.keys()) != expected:
        return None
    return by_index
def link_ordered_files(ordering, dest, start_index):
    """Create numbered symlinks in *dest* for the files in *ordering*.

    Links are named NNNN.<ext> starting at *start_index* and point at the
    sources via paths relative to *dest*.
    """
    dest.mkdir(parents=True, exist_ok=True)

    for (page_number, source) in enumerate(ordering, start=start_index):
        suffix = source.suffix.lower()
        link_path = dest / f'{page_number:04d}{suffix}'
        link_path.symlink_to(relpath(source, dest))
def check_extension(path, exts):
    """True when *path*'s extension (lowercased) is one of *exts*."""
    suffix = path.suffix.lower()
    return suffix in exts
def is_pdf(path):
    """True when *path* has a .pdf extension (case-insensitive)."""
    return path.suffix.lower() == '.pdf'
def is_image(path):
    """True when *path* has one of the recognized page-image extensions."""
    return check_extension(path, IMAGE_FILE_EXTENSIONS)
def ignoreable(path):
    """True for OS/metadata junk and non-page file types that we skip."""
    if path.name in IGNOREABLE_FILES:
        return True
    return check_extension(path, IGNOREABLE_EXTENSIONS)
def ls_ignore(directory, exclude):
    """List *directory*, dropping ignoreable entries and excluded paths."""
    listing = []
    for entry in directory.iterdir():
        if ignoreable(entry) or entry in exclude:
            continue
        listing.append(entry)
    return listing
def descendant_files_ignore(path, exclude):
    """Recursively collect files under *path*, honoring ignore/exclude rules.

    A file argument is returned as a one-element list.
    """
    if path.is_file():
        return [path]

    files = []
    for child in ls_ignore(path, exclude):
        if child.is_dir():
            files += descendant_files_ignore(child, exclude)
        else:
            files.append(child)
    return files
def collate(args):
    """Collate every extracted, un-collated, non-virtual work into site/images.

    Each work is assembled in a staging directory first and only renamed
    into place on success, so a failed attempt leaves no partial output.
    args.hints maps specific works to an alternate source root inside the
    extraction tree.
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    extraction_dir = args.destdir / 'extract'
    # parents[-2] of the path relative to extract/ is the top-level work dir.
    hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}

    collation_staging_area = args.destdir / 'site' / 'images-staging'
    collation_staging_area.mkdir(parents=True)

    for work_path in extraction_dir.iterdir():
        work_id = work_path.name

        collation_dir = args.destdir / 'site' / 'images' / work_id
        if collation_dir.exists():
            continue  # already collated

        # Virtual works (e.g. series containers) have no pages of their own.
        virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
        if virtual == (1,):
            continue

        work_staging_dir = collation_staging_area / work_id

        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, [])
        if pages_collated:
            print(f'Collated {pages_collated} pages for {work_id}')
            work_staging_dir.rename(collation_dir)
        else:
            # Failure or empty: clean up any partially-staged links.
            if work_staging_dir.is_dir():
                for f in work_staging_dir.iterdir():
                    f.unlink()
                work_staging_dir.rmdir()

            if pages_collated == 0:
                print(f'{work_id} contains no files? skipping')
            elif pages_collated is None:
                print(f'Unable to deduce file structure for {work_id}, skipping')

    collation_staging_area.rmdir()
    con.close()
def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None):
    """Try splitting *srcs* into early/middle/late groups by filename regex.

    Entries matching *earlier* are collated first, entries matching *later*
    last, everything else in between. Returns False when the regexes don't
    actually split anything (at most one non-empty group), None when any
    sub-collation fails, otherwise the total page count collated.
    """
    early_group = []
    middle_group = []
    late_group = []
    for src in srcs:
        if earlier and earlier.search(src.name):
            early_group.append(src)
        elif later and later.search(src.name):
            late_group.append(src)
        else:
            middle_group.append(src)

    populated = sum(1 for group in (early_group, middle_group, late_group) if group)
    if populated <= 1:
        return False

    # Collate each group in order, threading the page index through.
    total_pages = 0
    for group in (early_group, middle_group, late_group):
        pages = collate_from_paths(group, dest, start_index + total_pages, exclude)
        if pages is None:
            return None
        total_pages += pages

    return total_pages
def standalone_image_size(filepath):
    """Return (width, height) of an image file, via PIL."""
    with Image.open(filepath) as im:
        return im.size
def pdf_image_sizes(filepath):
    """Return [(width, height), ...] for each distinct image xref in a PDF."""
    sizes_by_xref = {}

    with fitz.open(filepath) as pdf:
        for page in pdf:
            for (xref, _, width, height, *_) in page.get_images():
                if xref in sizes_by_xref:
                    continue  # the same image may be referenced by many pages
                sizes_by_xref[xref] = (width, height)

    return list(sizes_by_xref.values())
def median(items):
    """Return the middle element of *items* (upper middle for even lengths).

    Returns None for an empty list. Works on any mutually-comparable items
    (here: (width, height) tuples). Fixed to use sorted() instead of
    list.sort() so the caller's list is no longer mutated as a side effect.
    """
    if not items:
        return None
    ordered = sorted(items)
    return ordered[len(ordered) // 2]
def superior_or_equal(a, b):
    """True when *a* is at least as long as *b* and >= componentwise on b's length."""
    if len(a) < len(b):
        return False
    return all(x >= y for (x, y) in zip(a, b))
def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
    """Pick between a PDF source and loose images covering the same pages.

    Applies when exactly one src looks PDF-related (containing exactly one
    PDF) and every other descendant file is an image. Whichever side has
    the larger median image size wins and is collated. Returns False when
    the situation doesn't apply, otherwise the collation result.
    """
    pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
    if len(pdfs) != 1:
        return False
    outer_pdf = pdfs[0]

    # outer_pdf may be a directory wrapping the actual PDF file.
    inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)]
    if len(inner_pdfs) != 1:
        return False
    inner_pdf = inner_pdfs[0]

    non_pdf_srcs = [src for src in srcs if src != outer_pdf]
    images = []
    non_images = []
    descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)]
    for f in descendant_files:
        if is_image(f):
            images.append(f)
        else:
            non_images.append(f)
            break  # one non-image is enough to disqualify

    if len(non_images) != 0 or len(images) == 0:
        return False

    pdf_sizes = pdf_image_sizes(inner_pdf)
    standalone_sizes = [standalone_image_size(f) for f in images]

    # The two sides should cover roughly the same number of pages.
    if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
        return False

    median_pdf_size = median(pdf_sizes)
    median_standalone_size = median(standalone_sizes)
    if not (median_pdf_size and median_standalone_size):
        return False

    # Collate whichever source has the (weakly) larger typical image.
    if superior_or_equal(median_standalone_size, median_pdf_size):
        return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
    elif superior_or_equal(median_pdf_size, median_standalone_size):
        return collate_from_paths([outer_pdf], dest, start_index, exclude)
    else:
        return False
def collate_from_paths(srcs, dest, start_index, exclude):
    """Recursively collate page images from *srcs* into *dest*.

    Returns the number of pages collated, 0 for an empty source list, or
    None when no heuristic can figure out the layout. Strategies tried in
    order: descend into a lone directory, extract a lone PDF, prefer the
    hi-res of two equivalent dirs, split by textless/cover/epilogue
    keywords, order loose images by numeric prefix, and arbitrate between
    a PDF and loose images.
    """
    if len(srcs) == 1 and srcs[0].is_dir():
        # A single directory: recurse into its (non-ignored) contents.
        return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude)

    if len(srcs) == 1 and is_pdf(srcs[0]):
        print(f'Extracting images from {srcs[0]}')
        return link_pdf(srcs[0], dest, start_index)

    if len(srcs) == 0:
        return 0

    # Two dirs where one is marked hi-res and both have the same file count:
    # assume they're the same pages and keep only the hi-res set.
    if len(srcs) == 2 and all(src.is_dir() for src in srcs):
        hi_res_dirs = [src for src in srcs if HI_RES_REGEX.search(src.name)]
        if len(hi_res_dirs) == 1:
            hi_res_dir = hi_res_dirs[0]
            lo_res_dir = next(src for src in srcs if src != hi_res_dir)
            if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)):
                return collate_from_paths([hi_res_dir], dest, start_index, exclude)

    # Keyword-based splits; each returns False when it doesn't apply.
    textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX)
    if textless_split != False:
        return textless_split

    cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=FRONT_COVER_REGEX, later=BACK_COVER_REGEX)
    if cover_split != False:
        return cover_split

    epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX)
    if epilogue_split != False:
        return epilogue_split

    # All loose images: order them by their numeric/alphabetic prefixes.
    if all(src.is_file() and is_image(src) for src in srcs):
        ordering = complete_prefix_number_ordering(srcs)
        if ordering:
            print(f'Symlinking image files: {ordering[0]}...')
            link_ordered_files(ordering, dest, start_index)
            return len(ordering)
        else:
            return None

    images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude)
    if images_vs_pdf != False:
        return images_vs_pdf

    return None
def self_and_parents(path):
    """Return [path, parent, grandparent, ..., outermost ancestor]."""
    return [path, *path.parents]
def parse_expressions(tokens):
    """Parse a flat token list into (groups, exclusions).

    '(' ... ')' forms a group, '!' marks the next token/group as excluded,
    and any other token becomes a singleton group. Consumes *tokens*.
    """
    groups = []
    exclusions = []
    while tokens:
        head = tokens.pop(0)
        if head == '!':
            exclusions += parse_exclusion(tokens)
            continue
        groups.append(parse_group(tokens) if head == '(' else [head])
    return (groups, exclusions)
def parse_exclusion(tokens):
    """Parse the operand of '!': a parenthesized group or a single token."""
    head = tokens.pop(0)
    return parse_group(tokens) if head == '(' else [head]
def parse_group(tokens):
    """Consume tokens up to (and including) the matching ')'; return the contents."""
    items = []
    while (head := tokens.pop(0)) != ')':
        items.append(head)
    return items
def normalize_to(path, ref):
    """Re-anchor *path* at *ref* so equivalent spellings compare equal."""
    relative = relpath(path, ref)
    return ref / Path(relative)
def manual_collate(args):
    """Collate one work from a user-supplied grouping expression.

    args.expression is a token list parsed into ordered groups and
    exclusions; each group is collated in sequence into the work's staging
    directory, which is renamed into site/images/<work_id> on success. The
    work id is deduced from the first path mentioned in the expression.
    """
    (raw_groups, raw_exclusions) = parse_expressions(args.expression)

    extraction_dir = args.destdir / 'extract'
    # Any path from the expression identifies the work: its top-level
    # directory under extract/ is the work id.
    sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group)
    work_id = Path(relpath(sample_path, extraction_dir)).parents[-2].name

    # Normalize all paths so membership tests against exclusions work.
    exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
    if raw_groups:
        groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
    else:
        # Exclusions only: collate the whole work minus the excluded paths.
        groups = [[extraction_dir / work_id]]

    collation_dir = args.destdir / 'site' / 'images' / work_id
    if collation_dir.exists():
        if len(list(collation_dir.iterdir())) > 0:
            print(f'Collation directory already exists!')
            return
        else:
            collation_dir.rmdir()  # empty leftover; safe to redo

    nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
    if len(nonexistent) > 0:
        print(f'Nonexistent paths: {nonexistent}')
        return

    collation_staging_area = args.destdir / 'site' / 'images-staging'
    work_staging_dir = collation_staging_area / work_id
    work_staging_dir.mkdir(parents=True)

    pages_collated = 0
    for group in groups:
        pages_added = collate_from_paths(
            [item for item in group if item not in exclusions],
            work_staging_dir,
            pages_collated,
            exclusions,
        )
        if pages_added is None:
            print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
            pages_collated = None
            break
        pages_collated += pages_added

    if pages_collated:
        print(f'Collated {pages_collated} pages for {work_id}')
        work_staging_dir.rename(collation_dir)
    else:
        # Failure (None) or nothing found (0): remove staged links.
        for f in work_staging_dir.iterdir():
            f.unlink()
        work_staging_dir.rmdir()
        if pages_collated == 0:
            print(f'No files found for {work_id}')

    collation_staging_area.rmdir()
def fmt_size(s):
    """Format a (width, height) pair as 'WxHpx'."""
    (width, height) = s
    return f'{width}x{height}px'
def analyze(args):
    """Print every file of one extracted work with image-size information.

    Images show their dimensions; PDFs show an image count plus
    median/min/max embedded image sizes; anything else is listed bare.
    """
    extract_dir = args.destdir / 'extract'
    files = descendant_files_ignore(extract_dir / args.work_id, [])
    files.sort()

    for f in files:
        print(f'{relpath(f, extract_dir)}', end='')
        if is_image(f):
            size = standalone_image_size(f)
            print(f'\t{fmt_size(size)}')
        elif is_pdf(f):
            sizes = pdf_image_sizes(f)
            if len(sizes) == 0:
                print(f'\tContains no images')
            else:
                print(f'\t{len(sizes)} images, median {fmt_size(median(sizes))}, min {fmt_size(min(sizes))}, max {fmt_size(max(sizes))}')
        else:
            print()
def metadata(args):
    """Display one work's metadata, optionally setting its 'virtual' flag first."""
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    if args.virtual is not None:
        flag = 1 if args.virtual else 0
        cur.execute("UPDATE works SET virtual = ? WHERE id = ?", (flag, args.work_id))
        con.commit()

    row = cur.execute(
        "SELECT title, circle, date, description, series, virtual FROM works WHERE id = ?",
        (args.work_id,),
    ).fetchone()

    if row is None:
        print(f'Work id {args.work_id} not found!')
        return

    (title, circle, date, description, series, virtual) = row
    print(f'Work ID: {args.work_id}')
    print(f'Title: {title}')
    print(f'Circle: {circle}')
    print(f'Pub date: {date}')
    print(f'Description: {description}')
    print(f'Series: {series}')
    print(f'Virtual: {"Yes" if virtual == 1 else "No"}')
    con.close()
def copy_recursive(src, dest):
    """Copy the tree under *src* into *dest*, creating *dest* as needed.

    Real subdirectories are recursed into; everything else (plain files and
    symlinks, which copyfile dereferences) is copied as a file.
    """
    dest.mkdir(parents=True, exist_ok=True)
    for entry in src.iterdir():
        target = dest / entry.name
        if entry.is_dir() and not entry.is_symlink():
            copy_recursive(entry, target)
        else:
            shutil.copyfile(entry, target)
2024-01-23 15:54:17 -05:00
def generate ( args ) :
2024-01-22 07:01:41 -05:00
jenv = Environment (
2024-01-22 22:06:04 -05:00
loader = PackageLoader ( " dlibrary " ) ,
2024-01-22 07:01:41 -05:00
autoescape = select_autoescape ( )
)
viewer_template = jenv . get_template ( " viewer.html " )
2024-01-22 23:20:42 -05:00
list_template = jenv . get_template ( " list.html " )
2024-01-23 00:00:15 -05:00
categorization_template = jenv . get_template ( " categorization.html " )
2024-01-23 00:49:58 -05:00
work_template = jenv . get_template ( " work.html " )
2024-01-25 04:10:17 -05:00
index_template = jenv . get_template ( " index.html " )
2024-01-22 07:01:41 -05:00
con = sqlite3 . connect ( args . destdir / ' meta.db ' )
cur = con . cursor ( )
2024-02-06 08:29:52 -05:00
site_dir = args . destdir / ' site '
collated_work_ids = { p . name for p in ( site_dir / ' images ' ) . iterdir ( ) }
2024-01-22 07:01:41 -05:00
2024-01-22 23:20:42 -05:00
actual_series = { series for ( series , ) in cur . execute ( ' SELECT series FROM works GROUP BY series HAVING count(series) > 1 ' ) }
2024-01-22 07:01:41 -05:00
works = [ ]
for ( work_id , title , circle , date , description , series ) in cur . execute ( ' SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC ' ) . fetchall ( ) :
if work_id not in collated_work_ids :
continue
authors = [ author for ( author , ) in cur . execute ( ' SELECT author FROM authors WHERE work = ? ' , ( work_id , ) ) ]
tags = [ tag for ( tag , ) in cur . execute ( ' SELECT tag FROM tags WHERE work = ? ' , ( work_id , ) ) ]
2024-02-06 08:29:52 -05:00
images = [ path . name for path in ( site_dir / ' images ' / work_id ) . iterdir ( ) ]
images . sort ( )
try :
thumbnail_path = relpath ( next (
f for f in ( site_dir / ' thumbnails ' ) . iterdir ( ) if f . stem == work_id
) , site_dir )
except StopIteration :
thumbnail_path = f ' images/ { work_id } / { images [ 0 ] } '
2024-01-22 07:01:41 -05:00
work = {
' id ' : work_id ,
' title ' : title ,
' circle ' : circle ,
' date ' : date ,
' description ' : description ,
' series ' : series ,
' authors ' : authors ,
' tags ' : tags ,
2024-02-06 08:29:52 -05:00
' thumbnail_path ' : thumbnail_path ,
2024-01-22 07:01:41 -05:00
}
works . append ( work )
2024-02-06 08:29:52 -05:00
work_dir = site_dir / ' works ' / work_id
2024-01-23 00:49:58 -05:00
viewer_dir = work_dir / ' view '
viewer_dir . mkdir ( parents = True , exist_ok = True )
2024-01-22 07:01:41 -05:00
with open ( work_dir / ' index.html ' , ' w ' ) as f :
2024-01-23 00:49:58 -05:00
f . write ( work_template . render ( depth = 2 , work = work , title = title , images = images ) )
with open ( viewer_dir / ' index.html ' , ' w ' ) as f :
f . write ( viewer_template . render ( depth = 3 , work = work , title = title , images = images ) )
2024-01-22 07:01:41 -05:00
2024-01-23 00:55:06 -05:00
def make_categorization(categorization, query, work_filter, work_style_cards=False):
    """Render the index and per-category pages for one categorization axis.

    Parameters:
        categorization: name of the axis (e.g. 'authors', 'tags'); also the
            output subdirectory created under the site root.
        query: SQL returning a single column of distinct category values.
        work_filter: factory taking a category value and returning a
            predicate over work dicts.
        work_style_cards: passed through to the categorization template to
            control how sample cards are rendered.

    Uses `site_dir`, `cur`, `works`, `list_template` and
    `categorization_template` from the enclosing scope.
    """
    categorization_dir = site_dir / categorization

    cats = [cat for (cat,) in cur.execute(query)]
    cat_samples = {}
    for cat in cats:
        cat_works = list(filter(work_filter(cat), works))
        # First matching work (most recent, given `works` ordering) serves as
        # the category's sample card; None when the category is empty.
        cat_samples[cat] = cat_works[0] if cat_works else None

        # '/' in a category name would create nested directories; strip it.
        # Other awkward characters are left alone, hence "safe-ish".
        safeish_cat = cat.replace('/', '')
        cat_dir = categorization_dir / safeish_cat
        cat_dir.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly: category names and work metadata are often
        # Japanese, and open()'s default encoding is locale-dependent.
        with open(cat_dir / 'index.html', 'w', encoding='utf-8') as f:
            f.write(list_template.render(
                depth=2,
                works=cat_works,
                title=cat,
                categorization=categorization,
            ))

    categorization_dir.mkdir(parents=True, exist_ok=True)
    with open(categorization_dir / 'index.html', 'w', encoding='utf-8') as f:
        f.write(categorization_template.render(
            depth=1,
            categorization=categorization,
            categories=cats,
            samples=cat_samples,
            work_style_cards=work_style_cards,
        ))
# One page tree per categorization axis. The nested lambda is a factory:
# given a category value, it returns the per-work membership predicate.
make_categorization(
    'authors',
    'SELECT DISTINCT author FROM authors ORDER BY author',
    lambda author: lambda work: author in work['authors'],
)
make_categorization(
    'tags',
    'SELECT DISTINCT tag FROM tags ORDER BY tag',
    lambda tag: lambda work: tag in work['tags'],
)
make_categorization(
    'circles',
    'SELECT DISTINCT circle FROM works WHERE circle NOT NULL ORDER BY circle',
    lambda circle: lambda work: work['circle'] == circle,
)
make_categorization(
    'series',
    'SELECT DISTINCT series FROM works WHERE series NOT NULL ORDER BY series',
    lambda series: lambda work: work['series'] == series,
    work_style_cards=True,
)

# Copy the package's bundled static assets into the generated site.
with resources.as_file(resources.files("dlibrary")) as r:
    copy_recursive(r / 'static', site_dir / 'static')

# Top-level catalog page listing all collated works.
with open(site_dir / 'index.html', 'w') as f:
    f.write(index_template.render(depth=0, works=works))

con.close()
2024-01-22 02:16:06 -05:00
2024-01-23 15:54:17 -05:00
argparser = argparse . ArgumentParser (
prog = ' dlibrary ' ,
formatter_class = argparse . RawDescriptionHelpFormatter ,
description = textwrap . dedent ( """ \
Organize DRM - free works purchased from DLSite into a library
that can be viewed in a web browser .
Intended workflow :
- ` extract ` a collection of zipfiles downloaded from DLSite
into DLibrary ' s data directory, giving each work its own
subfolder .
- ` fetch ` metadata and thumbnail images for extracted works
from DLSite .
- ` collate ` and / or ` manual - collate ` extracted works ,
producing a single sequence of image files ( or symlinks
into the extracted data , when possible ) for each work .
- Manually adjust works ' `metadata` when necessary.
- ` generate ` a static website providing a catalog and viewer
for all collated works .
""" ),
)
2024-01-22 02:16:06 -05:00
argparser.add_argument(
    '-d', '--destdir',
    type=Path,
    default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')),
    help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)',
)

# Every invocation must name a subcommand; each subcommand binds its handler
# to `func` via set_defaults below.
subparsers = argparser.add_subparsers(title="subcommands", required=True)
2024-01-22 02:16:06 -05:00
2024-02-08 05:03:40 -05:00
# `extract` / `x` / `ex`: unpack downloaded zipfiles into the data directory.
parser_extract = subparsers.add_parser('extract', aliases=['x', 'ex'], help='extract zipfiles')
parser_extract.add_argument(
    '-r', '--remove',
    action='store_true',
    help='remove original zipfiles after extraction',
)
parser_extract.add_argument(
    'zipfiles',
    metavar='FILE',
    type=Path,
    nargs='+',
    help='zipfiles to extract',
)
parser_extract.set_defaults(func=extract)
2024-02-08 05:03:40 -05:00
# `fetch` / `f` / `fet`: download metadata and thumbnails for extracted works.
parser_fetch = subparsers.add_parser('fetch', aliases=['f', 'fet'], help='fetch metadata and thumbnails')
parser_fetch.add_argument(
    '-l', '--locale',
    type=str,
    default=os.getenv('DLIBRARY_LOCALE', 'en_US'),
    help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). '
          'May still fall back to Japanese if metadata in other languages is unavailable. '
          '(default: $DLIBRARY_LOCALE or en_US)'),
)
parser_fetch.set_defaults(func=fetch)
2024-01-23 15:54:17 -05:00
# `collate` / `c` / `co` / `col`: automatic collation with optional hints.
parser_collate = subparsers.add_parser(
    'collate',
    aliases=['c', 'co', 'col'],
    help='collate each work into a sequence of image files',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        For each extracted work that has not already been collated,
        DLibrary will attempt to intuit its structure as follows:

        - Enter the work's directory. If the directory contains
          nothing except a single subdirectory (ignoring a few types
          of files that are definitely not relevant), traverse
          downwards repeatedly.
        - If the current directory contains nothing except a single
          PDF (again, ignoring irrelevant files), attempt to extract
          a series of images from the PDF. This process expects that
          each page of the PDF consists of a single embedded image,
          which will be extracted at full resolution. Support for
          more complex PDFs is not yet implemented.
        - If the current directory contains nothing except image
          files, and the image files are named in a way that clearly
          indicates a complete numerical order (each filename
          consists of a shared prefix followed by a distinct
          number), symlink files in the inferred order.
        - Otherwise, skip processing this work for now.

        DLibrary can be given "collation hints" which provide
        alternative starting points for this search process. A hint
        is a path under $DLIBRARY_DIR/extract/[work id]/
        indicating a different directory or PDF file to begin the
        search process for that work, rather than starting at the
        top level of the extracted data. There can be at most one
        hint per work; for more complicated scenarios where a work
        includes multiple folders that need to be collated together,
        or where filenames do not clearly indicate an ordering, use
        `manual-collate` instead.
    """),
)
parser_collate.add_argument(
    'hints',
    metavar='PATH',
    type=Path,
    nargs='*',
    help='paths within extraction folders as collation hints'
)
parser_collate.set_defaults(func=collate)
2024-01-23 15:54:17 -05:00
# `manual-collate` / `mc` / `man` / `manual`: explicit, per-work collation.
parser_manual_collate = subparsers.add_parser(
    'manual-collate',
    aliases=['mc', 'man', 'manual'],
    help='collate a single work manually',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        Provide an expression or sequence of expressions specifying groups
        of paths to collate or skip. An expression can be:

        PATH
            A single path. If this is an image, it will be appended to
            the sequence of collated images; if this is a PDF, images will be
            extracted from it and concatenated to the sequence; if this is a
            directory, the contents of the directory will be collated based on
            the normal heuristics and concatenated to the sequence.

        ( PATH [PATH ...] )
            A group of paths contained in parentheses. You may need to escape
            the parentheses to avoid them getting parsed by your shell.
            All the paths in this group will be considered together, and
            collated based on the normal heuristics, regardless of what
            order the paths are provided in.

        ! PATH
        ! ( PATH [PATH ...] )
            A path or group of paths to exclude from collation. You may
            need to escape the !. If an excluded path appears within any
            of the other specified paths, it will be ignored.

        If the only expressions provided are negations, then auto-collation
        will start from the top level of the extracted work while excluding
        the negated paths.

        All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
        for the work being manually collated. `manual-collate` can
        only handle one work at a time.
    """),
)
parser_manual_collate.add_argument(
    'expression',
    nargs='+',
    help='expressions indicating paths to collate or skip',
)
parser_manual_collate.set_defaults(func=manual_collate)
2024-02-08 05:03:40 -05:00
# `analyze` / `a` / `an` / `anal`: inspect an extracted folder's structure.
parser_analyze = subparsers.add_parser('analyze', aliases=['a', 'an', 'anal'], help='analyze an extracted folder to assist in collation')
parser_analyze.add_argument('work_id')
parser_analyze.set_defaults(func=analyze)

# `metadata` / `m` / `me` / `meta`: view or edit a single work's metadata.
parser_metadata = subparsers.add_parser('metadata', aliases=['m', 'me', 'meta'], help='view or modify metadata for a work')
parser_metadata.add_argument('work_id')
parser_metadata.add_argument(
    '--virtual',
    # Tri-state: --virtual sets True, --no-virtual sets False, omitted is None.
    action=argparse.BooleanOptionalAction,
    help='set work as virtual',
)
parser_metadata.set_defaults(func=metadata)
2024-01-23 15:54:17 -05:00
# `generate` / `g` / `gen`: build the static site from collated works.
parser_generate = subparsers.add_parser(
    'generate',
    aliases=['g', 'gen'],
    help='generate HTML/CSS/JS for library site',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        The static site will be generated under $DLIBRARY_DIR/site/
        and can be served by pointing an HTTP server at that
        directory. Note that some files inside the static site
        hierarchy will be symlinks into $DLIBRARY_DIR/extract/
        outside the site hierarchy, so make sure your HTTP server
        will allow those symlinks to be read.
    """),
)
parser_generate.set_defaults(func=generate)
2024-01-22 02:16:06 -05:00
2024-01-22 22:06:04 -05:00
def main():
    """CLI entry point: parse arguments and dispatch to the chosen subcommand handler."""
    parsed_args = argparser.parse_args()
    parsed_args.func(parsed_args)


if __name__ == "__main__":
    main()