Compare commits


No commits in common. "ee9eba32872f003ae23c22c11a519693e24817cd" and "b7b989433a9cabf509d1f120363ebd02ff6f4f51" have entirely different histories.

3 changed files with 168 additions and 202 deletions

View file

@@ -3,7 +3,6 @@
 import argparse
 import asyncio
 import importlib_resources as resources
-from io import BytesIO
 from pathlib import Path
 import os
 from os.path import relpath, splitext
@@ -18,7 +17,6 @@ import zipfile
 from dlsite_async import DlsiteAPI
 import fitz
-from libsixel import *
 from PIL import Image
 from jinja2 import Environment, PackageLoader, select_autoescape
 import requests
@@ -79,8 +77,7 @@ IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']
 IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
 IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
-PDF_CONVERSION_DPI = 300
-PDF_PREVIEW_DPI = 72
+PDF_FALLBACK_DPI = 300
 IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)
@@ -217,52 +214,11 @@ def self_and_parents(path):
     return [path] + list(path.parents)
 
 def collate(args):
-    con = sqlite3.connect(args.destdir / 'meta.db')
-    cur = con.cursor()
     extraction_dir = args.destdir / 'extract'
-    def extracted_path_work_id(path):
-        trail = self_and_parents(Path(relpath(path, extraction_dir)))
-        if len(trail) < 2:
-            return None
-        result = trail[-2].name
-        if result == '..':
-            return None
-        return result
-    (raw_groups, raw_exclusions) = parse_expressions(args.expression)
-    specified_works = set()
-    works_groups = {}
-    for group in raw_groups:
-        if len(group) == 0:
-            continue
-        work_id = extracted_path_work_id(group[0])
-        if not work_id:
-            print(f'Group {group} contains paths outside an extracted work!')
-            exit(1)
-        if not all(extracted_path_work_id(item) == work_id for item in group[1:]):
-            print(f'Group {group} contains paths from multiple works!')
-            exit(1)
-        specified_works.add(work_id)
-        if work_id not in works_groups:
-            works_groups[work_id] = []
-        normalized_paths = [normalize_to(item, args.destdir) for item in group]
-        if not all(path.exists() for path in normalized_paths):
-            print(f'Group {group} contains nonexistent paths!')
-            exit(1)
-        works_groups[work_id].append(normalized_paths)
-    exclusions = []
-    for exclusion in raw_exclusions:
-        work_id = extracted_path_work_id(exclusion)
-        if not work_id:
-            print(f'Excluded path {exclusion} does not belong to an extracted work!')
-            exit(1)
-        specified_works.add(work_id)
-        normalized_path = normalize_to(exclusion, args.destdir)
-        if not normalized_path.exists():
-            print(f'Excluded path {exclusion} does not exist!')
-            exit(1)
-        exclusions.append(normalized_path)
+    hint_map = {self_and_parents(Path(relpath(hint, extraction_dir)))[-2].name: hint for hint in args.hints}
 
     collation_staging_area = args.destdir / 'site' / 'images-staging'
     collation_staging_area.mkdir(parents=True)
@@ -270,41 +226,21 @@ def collate(args):
     collation_area = args.destdir / 'site' / 'images'
     collation_area.mkdir(parents=True, exist_ok=True)
+    con = sqlite3.connect(args.destdir / 'meta.db')
+    cur = con.cursor()
     for work_path in extraction_dir.iterdir():
         work_id = work_path.name
-        if args.only_specified_works and work_id not in specified_works:
-            continue
         work_collation_dir = collation_area / work_id
         if work_collation_dir.exists():
-            if work_id not in specified_works:
-                continue
-            if len(list(work_collation_dir.iterdir())) > 0:
-                print(f'Collation directory for work {work_id} already exists!')
-                break
-            else:
-                work_collation_dir.rmdir()
+            continue
         virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
         if virtual == (1,):
-            if work_id in specified_works:
-                print(f'Work {work_id} is virtual!')
-                break
             continue
         work_staging_dir = collation_staging_area / work_id
-        collator = Collator(work_staging_dir, exclusions, args)
-        for group in works_groups.get(work_id, [[work_path]]):
-            collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
-            if not collation_result:
-                print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
-                break
+        collator = Collator(work_staging_dir, [], args)
+        collation_result = collator.collate_from_paths([hint_map.get(work_id, work_path)])
         if collation_result and collator.index > 0:
             print(f'Collated {collator.index} pages for {work_id}')
             work_staging_dir.rename(work_collation_dir)
@@ -317,7 +253,7 @@ def collate(args):
         if not collation_result:
             print(f'Unable to deduce file structure for {work_id}, skipping')
         elif collator.index == 0:
-            print(f'No files found for {work_id}, skipping')
+            print(f'{work_id} contains no files? skipping')
     collation_staging_area.rmdir()
     con.close()
@@ -530,84 +466,43 @@ def extract_image(pdf, xref):
     pix = fitz.Pixmap(pdf, xref)
     return { 'ext': 'png', 'image': pix.tobytes('png') }
 
-def display_sixel_page(page):
-    s = BytesIO()
-    image = Image.open(BytesIO(page.get_pixmap(dpi=PDF_PREVIEW_DPI).tobytes('png')))
-    width, height = image.size
-    try:
-        data = image.tobytes()
-    except NotImplementedError:
-        data = image.tostring()
-    output = sixel_output_new(lambda data, s: s.write(data), s)
-    try:
-        if image.mode == 'RGBA':
-            dither = sixel_dither_new(256)
-            sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGBA8888)
-        elif image.mode == 'RGB':
-            dither = sixel_dither_new(256)
-            sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGB888)
-        elif image.mode == 'P':
-            palette = image.getpalette()
-            dither = sixel_dither_new(256)
-            sixel_dither_set_palette(dither, palette)
-            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_PAL8)
-        elif image.mode == 'L':
-            dither = sixel_dither_get(SIXEL_BUILTIN_G8)
-            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G8)
-        elif image.mode == '1':
-            dither = sixel_dither_get(SIXEL_BUILTIN_G1)
-            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G1)
-        else:
-            raise RuntimeError('unexpected image mode')
-        try:
-            sixel_encode(data, width, height, 1, dither, output)
-            print(s.getvalue().decode('ascii'))
-        finally:
-            sixel_dither_unref(dither)
-    finally:
-        sixel_output_unref(output)
-
 def pdf_images(pdf, force=False):
     images_by_page = [(page.get_images(), is_single_image(page)) for page in pdf]
     if all(len(images) == 1 and single for (images, single) in images_by_page):
         return (extract_image(pdf, images[0][0]) for (images, _) in images_by_page)
     print("Checking PDF images the quick way failed, trying the slow way")
-    print(f'0/{pdf.page_count} pages processed...', end='')
-    image_extractors = []
-    for (idx, page) in enumerate(pdf):
-        page_images = page.get_image_info(xrefs=True)
-        if len(page_images) == 1 and page_images[0]['xref'] != 0:
-            xref = page_images[0]['xref']
-        else:
-            xref = None
-        if xref is not None and is_single_image(page):
-            image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
-        else:
-            print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
-            if force:
-                print(f'Converting page {idx+1}')
-                choice = 'c'
-            else:
-                shown = False
-                while True:
-                    choice = input(f'[N]ope out of this PDF / [c]onvert page to image{"" if xref is None else " / e[x]tract embedded image without text"}{"" if shown else " / [s]how page before deciding"}? [N/c{"" if xref is None else "/x"}{"" if shown else "/s"}] ')
-                    if not shown and choice != '' and choice[0].lower() == 's':
-                        display_sixel_page(page)
-                        shown = True
-                    else:
-                        break
-            if xref is not None and choice != '' and choice[0].lower() == 'x':
-                image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
-            elif choice != '' and choice[0].lower() == 'c':
-                image_extractors.append(lambda p=page: { 'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png') })
-            else:
-                return None
-        print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end=('' if idx+1 < pdf.page_count else '\n'))
-    return (extractor() for extractor in image_extractors)
+    def xref_or_image_generator():
+        xref_mode = not force
+        for (idx, page) in enumerate(pdf):
+            page_images = page.get_image_info(xrefs=True)
+            if len(page_images) == 1 and page_images[0]['xref'] != 0 and is_single_image(page):
+                xref = page_images[0]['xref']
+                if xref_mode:
+                    yield xref
+                else:
+                    yield extract_image(pdf, xref)
+            else:
+                print(f'\nPage {idx+1}: {len(page_images)} images, {len([img for img in page_images if img["xref"] == 0])} non-xref images, {len(relevant_blocks(page))} total relevant objects')
+                if xref_mode:
+                    raise ValueError
+                else:
+                    print(f'Generating pixmap for page {idx+1}')
+                    pix = page.get_pixmap(dpi=PDF_FALLBACK_DPI)
+                    yield { 'ext': 'png', 'image': pix.tobytes('png') }
+            print(f'\x1b[2K\r{idx+1}/{pdf.page_count} pages processed...', end='')
+        print('')
+    if force:
+        return xref_or_image_generator()
+    try:
+        xrefs = list(xref_or_image_generator())
+    except ValueError:
+        print('\nFailed')
+        return None
+    print('Success')
+    return (extract_image(pdf, xref) for xref in xrefs)
 
 def nfc(s):
     return unicodedata.normalize('NFC', s)
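The reworked pdf_images drops the interactive sixel preview entirely: the fast pass yields xrefs and aborts with ValueError on the first page that is not a single embedded image, and only the forced pass rasterizes pages at PDF_FALLBACK_DPI. At the command line the forced pass corresponds to the --force-convert-pdf flag wired up further down; a sketch of the retry flow (the dlibrary entry-point name, work id, and path here are hypothetical, not taken from this diff):

    # fast pass only; prints "Failed" if any page lacks a single
    # extractable embedded image
    dlibrary manual-collate extract/RJ123456/book.pdf
    # retry, rasterizing the stubborn pages at PDF_FALLBACK_DPI (300)
    dlibrary manual-collate --force-convert-pdf extract/RJ123456/book.pdf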
@@ -793,6 +688,9 @@ def superior_or_equal(a, b):
     return len(a) >= len(b) and all(a[i] >= b[i] for i in range(len(b)))
 
+def self_and_parents(path):
+    return [path] + list(path.parents)
+
 def parse_expressions(tokens):
     groups = []
     exclusions = []
@@ -829,6 +727,61 @@ def parse_group(tokens):
 def normalize_to(path, ref):
     return ref / Path(relpath(path, ref))
 
+def manual_collate(args):
+    (raw_groups, raw_exclusions) = parse_expressions(args.expression)
+    extraction_dir = args.destdir / 'extract'
+    sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group)
+    work_id = self_and_parents(Path(relpath(sample_path, extraction_dir)))[-2].name
+    exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
+    if raw_groups:
+        groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
+    else:
+        groups = [[extraction_dir / work_id]]
+
+    collation_area = args.destdir / 'site' / 'images'
+    collation_area.mkdir(parents=True, exist_ok=True)
+    work_collation_dir = collation_area / work_id
+    if work_collation_dir.exists():
+        if len(list(work_collation_dir.iterdir())) > 0:
+            print('Collation directory already exists!')
+            return
+        else:
+            work_collation_dir.rmdir()
+
+    nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
+    if len(nonexistent) > 0:
+        print(f'Nonexistent paths: {nonexistent}')
+        return
+
+    collation_staging_area = args.destdir / 'site' / 'images-staging'
+    work_staging_dir = collation_staging_area / work_id
+    work_staging_dir.mkdir(parents=True)
+
+    collator = Collator(work_staging_dir, exclusions, args)
+    for group in groups:
+        collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
+        if collation_result is None:
+            print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
+            break
+
+    if collation_result and collator.index > 0:
+        print(f'Collated {collator.index} pages for {work_id}')
+        work_staging_dir.rename(work_collation_dir)
+    else:
+        for f in work_staging_dir.iterdir():
+            f.unlink()
+        work_staging_dir.rmdir()
+
+    if collation_result and collator.index == 0:
+        print(f'No files found for {work_id}')
+
+    collation_staging_area.rmdir()
+
 def fmt_size(s):
     return f'{s[0]}x{s[1]}px'
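Read together with the expression grammar documented in the manual-collate help text below, manual_collate maps a command line onto groups and exclusions roughly like this (a sketch; the dlibrary entry-point name, work id, and file names are invented, and the parentheses and ! are escaped so the shell passes them through):

    dlibrary manual-collate \
        \( extract/RJ123456/omake extract/RJ123456/honpen \) \
        extract/RJ123456/extra.pdf \
        \! extract/RJ123456/omake/credits.png

Here the parenthesized pair forms one group ordered by the usual heuristics, the PDF is appended as a second group, and the excluded image is skipped wherever it appears.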
@@ -1069,9 +1022,9 @@ argparser = argparse.ArgumentParser(
           subfolder.
         - `fetch` metadata and thumbnail images for extracted works
          from DLSite.
-        - `collate` extracted works, producing a single sequence of
-          image files (or symlinks into the extracted data, when
-          possible) for each work.
+        - `collate` and/or `manual-collate` extracted works,
+          producing a single sequence of image files (or symlinks
+          into the extracted data, when possible) for each work.
         - Manually adjust works' `metadata` when necessary.
         - `generate` a static website providing a catalog and viewer
           for all collated works.
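For orientation, the retooled workflow above might run end to end like this (a sketch only: the dlibrary entry-point name and the zipfile path are assumptions, not taken from this diff):

    dlibrary extract ~/downloads/RJ123456.zip   # unpack into $DLIBRARY_DIR/extract/
    dlibrary fetch                              # metadata + thumbnails from DLSite
    dlibrary collate                            # auto-collate every uncollated work
    dlibrary generate                           # build the static catalog/viewer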
@@ -1094,7 +1047,7 @@ argparser.add_argument(
 )
 subparsers = argparser.add_subparsers(title="subcommands", required=True)
 
-parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract zipfiles')
+parser_extract = subparsers.add_parser('extract', aliases=['x', 'ex'], help='extract zipfiles')
 parser_extract.add_argument(
     '-r', '--remove',
     action='store_true',
@@ -1109,96 +1062,111 @@ parser_extract.add_argument(
 )
 parser_extract.set_defaults(func=extract)
 
-parser_fetch = subparsers.add_parser('fetch', aliases=['f'], help='fetch metadata and thumbnails')
+parser_fetch = subparsers.add_parser('fetch', aliases=['f', 'fet'], help='fetch metadata and thumbnails')
 parser_fetch.set_defaults(func=fetch)
 
 parser_collate = subparsers.add_parser(
     'collate',
-    aliases=['c'],
-    help='collate works into sequences of image files',
+    aliases=['c', 'co', 'col'],
+    help='collate each work into a sequence of image files',
     formatter_class=argparse.RawDescriptionHelpFormatter,
     description=textwrap.dedent("""\
         For each extracted work that has not already been collated,
-        DLibrary will attempt to intuit its structure and create
-        a single ordered list of image files in the site data
-        directory. Each image will either be a symlink to an image
-        file in the extraction folder, or a single page extracted
-        from a PDF file.
+        DLibrary will attempt to intuit its structure as follows:
 
-        DLibrary may fail to automatically collate a work if its
-        files and subdirectories are not named in a way that
-        indicates a clear linear ordering. In order to assist with
-        collation, you can provide a list of expressions specifying
-        where to start traversing the directory structure, what
-        files to include in what order, and/or what files to ignore
-        entirely.
+        - Enter the work's directory. If the directory contains
+          nothing except a single subdirectory (ignoring a few types
+          of files that are definitely not relevant), traverse
+          downwards repeatedly.
+        - If the current directory contains nothing except a single
+          PDF (again, ignoring irrelevant files), attempt to extract
+          a series of images from the PDF. This process expects that
+          each page of the PDF consists of a single embedded image,
+          which will be extracted at full resolution. Support for
+          more complex PDFs is not yet implemented.
+        - If the current directory contains nothing except image
+          files, and the image files are named in a way that clearly
+          indicates a complete numerical order (each filename
+          consists of a shared prefix followed by a distinct
+          number), symlink files in the inferred order.
+        - Otherwise, skip processing this work for now.
 
-        An expression can be:
+        DLibrary can be given "collation hints" which provide
+        alternative starting points for this search process. A hint
+        is a path under $DLIBRARY_DIR/extract/[work id]/
+        indicating a different directory or PDF file to begin the
+        search process for that work, rather than starting at the
+        top level of the extracted data. There can be at most one
+        hint per work; for more complicated scenarios where a work
+        includes multiple folders that need to be collated together,
+        or where filenames do not clearly indicate an ordering, use
+        `manual-collate` instead.
+    """),
+)
+parser_collate.add_argument(
+    'hints',
+    metavar='PATH',
+    type=Path,
+    nargs='*',
+    help='paths within extraction folders as collation hints'
+)
+parser_collate.set_defaults(func=collate, force_convert_pdf=False)
 
+parser_manual_collate = subparsers.add_parser(
+    'manual-collate',
+    aliases=['mc', 'man', 'manual'],
+    help='collate a single work manually',
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    description=textwrap.dedent("""\
+        Provide an expression or sequence of expressions specifying groups
+        of paths to collate or skip. An expression can be:
 
         PATH
             A single path. If this is an image, it will be appended to
-            the sequence of collated images for the work it belongs to;
-            if this is a PDF, images will be extracted from it and
-            concatenated to the sequence; if this is a directory, the
-            contents of the directory will be automatically collated
-            using DLibrary's default heuristics, and concatenated
-            to the sequence.
+            the sequence of collated images; if this is a PDF, images will be
+            extracted from it and concatenated to the sequence; if this is a
+            directory, the contents of the directory will be collated based on
+            the normal heuristics and concatenated to the sequence.
 
         ( PATH [PATH ...] )
             A group of paths contained in parentheses. You may need to escape
             the parentheses to avoid them getting parsed by your shell.
             All the paths in this group will be considered together, and
-            automatically collated using the default heuristics, regardless
-            of what order the paths are provided in.
+            collated based on the normal heuristics, regardless of what
+            order the paths are provided in.
 
         ! PATH
         ! ( PATH [PATH ...] )
             A path or group of paths to exclude from collation. You may
             need to escape the !. If an excluded path appears within any
-            of the other specified paths, it will be skipped by the collation
-            heuristics.
+            of the other specified paths, it will be ignored.
 
         If the only expressions provided are negations, then auto-collation
-        will start from the top level of the extracted work while skipping
-        the excluded paths.
+        will start from the top level of the extracted work while excluding
+        the negated paths.
 
         All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
-        for some not-yet-collated work. Paths belonging to multiple
-        different works can all be provided on the same command line, and
-        expressions will be clustered together by work id while otherwise
-        preserving the order they were provided in. A parenthesized group
-        expression must only contain paths belonging to a single work.
-
-        By default, DLibrary will attempt to collate every not-yet-collated
-        work (excluding "virtual" works), using the provided expressions
-        to assist in collation when available. The `-o` flag will direct
-        DLibrary to *only* collate works included in the provided expressions,
-        even if other uncollated works are present.
+        for the work being manually collated. `manual-collate` can
+        only handle one work at a time.
     """),
 )
-parser_collate.add_argument(
-    '-o', '--only-specified-works',
-    action='store_true',
-    help="only collate works that are explicitly specified",
-)
-parser_collate.add_argument(
+parser_manual_collate.add_argument(
     '--force-convert-pdf',
     action='store_true',
     help="convert a PDF page to a 300dpi image if there isn't a single image we can extract directly",
 )
-parser_collate.add_argument(
+parser_manual_collate.add_argument(
     'expression',
-    nargs='*',
+    nargs='+',
     help='expressions indicating paths to collate or skip',
 )
-parser_collate.set_defaults(func=collate)
+parser_manual_collate.set_defaults(func=manual_collate)
 
-parser_analyze = subparsers.add_parser('analyze', aliases=['a'], help='analyze an extracted folder to assist in collation')
+parser_analyze = subparsers.add_parser('analyze', aliases=['a', 'an', 'anal'], help='analyze an extracted folder to assist in collation')
 parser_analyze.add_argument('work_id')
 parser_analyze.set_defaults(func=analyze)
 
-parser_metadata = subparsers.add_parser('metadata', aliases=['m'], help='view or modify metadata for a work')
+parser_metadata = subparsers.add_parser('metadata', aliases=['m', 'me', 'meta'], help='view or modify metadata for a work')
 parser_metadata.add_argument('work_id')
 parser_metadata.add_argument(
     '--virtual',
@@ -1209,7 +1177,7 @@ parser_metadata.set_defaults(func=metadata)
 parser_generate = subparsers.add_parser(
     'generate',
-    aliases=['g'],
+    aliases=['g', 'gen'],
     help='generate HTML/CSS/JS for library site',
     formatter_class=argparse.RawDescriptionHelpFormatter,
     description=textwrap.dedent("""\
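The hint mechanism added to `collate` above can be exercised like so (a sketch; the dlibrary entry-point name, work ids, and paths are invented). Each hint redirects the structure search for its own work, while every other uncollated work is still attempted from its top level:

    # one hint per work: a nested image folder for one work,
    # a specific PDF for another
    dlibrary collate extract/RJ123456/images_hq extract/RJ654321/main.pdf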

View file

@@ -57,7 +57,6 @@
     jinja2
     importlib-resources
     setuptools
-    libsixel
   ];
   src = ./.;
 };

View file

@@ -11,7 +11,6 @@ dependencies = [
     "dlsite-async",
     "jinja2",
     "importlib_resources",
-    "libsixel",
 ]
 
 [project.scripts]