add support for explicitly specifying paths for collation
This commit is contained in:
parent
3714eecab9
commit
7680a174fc
151
dlibrary.py
151
dlibrary.py
|
@ -106,55 +106,12 @@ def fetch(args):
|
||||||
asyncio.run(fetch_async(args))
|
asyncio.run(fetch_async(args))
|
||||||
|
|
||||||
|
|
||||||
def collate(args):
|
def link_pdf(src, dest, start_index=0):
|
||||||
con = sqlite3.connect(args.destdir / 'meta.db')
|
|
||||||
cur = con.cursor()
|
|
||||||
|
|
||||||
for work_path in (args.destdir / 'extract').iterdir():
|
|
||||||
work_id = work_path.name
|
|
||||||
|
|
||||||
collation_dir = args.destdir / 'site' / 'works' / work_id
|
|
||||||
if collation_dir.exists():
|
|
||||||
continue
|
|
||||||
|
|
||||||
virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
|
|
||||||
if virtual == (1,):
|
|
||||||
continue
|
|
||||||
|
|
||||||
search_dir = work_path
|
|
||||||
while True:
|
|
||||||
entries = list(search_dir.iterdir())
|
|
||||||
if len(entries) == 1 and entries[0].is_dir():
|
|
||||||
search_dir = entries[0]
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
|
|
||||||
if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
|
|
||||||
print(f'Extracting images from {entries[0].name} for {work_id}')
|
|
||||||
link_pdf(entries[0], collation_dir)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if len(entries) == 0:
|
|
||||||
print(f'{work_id} contains no files? skipping')
|
|
||||||
continue
|
|
||||||
|
|
||||||
if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
|
|
||||||
ordering = complete_prefix_number_ordering(entries)
|
|
||||||
if ordering:
|
|
||||||
print(f'Symlinking image files for {work_id}')
|
|
||||||
link_ordered_files(ordering, collation_dir)
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(f'Unable to deduce file structure for {work_id}, skipping')
|
|
||||||
|
|
||||||
con.close()
|
|
||||||
|
|
||||||
def link_pdf(src, dest):
|
|
||||||
with fitz.open(src) as pdf:
|
with fitz.open(src) as pdf:
|
||||||
images_by_page = [page.get_images() for page in pdf]
|
images_by_page = [page.get_images() for page in pdf]
|
||||||
if all(len(images) == 1 for images in images_by_page):
|
if all(len(images) == 1 for images in images_by_page):
|
||||||
dest.mkdir(parents=True)
|
dest.mkdir(parents=True, exist_ok=True)
|
||||||
for (idx, images) in enumerate(images_by_page):
|
for (idx, images) in enumerate(images_by_page, start=start_index):
|
||||||
xref = images[0][0]
|
xref = images[0][0]
|
||||||
image = pdf.extract_image(xref)
|
image = pdf.extract_image(xref)
|
||||||
file_path = dest / f'{idx:04d}.{image["ext"]}'
|
file_path = dest / f'{idx:04d}.{image["ext"]}'
|
||||||
|
@ -184,14 +141,92 @@ def complete_prefix_number_ordering(entries):
|
||||||
return [e for (e, i) in entries_with_indices]
|
return [e for (e, i) in entries_with_indices]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def link_ordered_files(ordering, dest, start_index=0):
    """Create numbered symlinks in *dest* pointing at the files of *ordering*.

    The i-th file becomes ``dest/{start_index+i:04d}{ext}`` where ``ext`` is
    the source file's suffix, lowercased.  Links are relative so the tree
    stays valid if the destination root is moved as a whole.

    :param ordering: iterable of source file paths, already in page order
    :param dest: destination directory (created if missing)
    :param start_index: number assigned to the first link (default 0)
    """
    dest.mkdir(parents=True, exist_ok=True)

    for index, source in enumerate(ordering, start=start_index):
        link = dest / f'{index:04d}{source.suffix.lower()}'
        link.symlink_to(relpath(source, dest))
|
def collate(args):
    """Collate each extracted work into one ordered image sequence.

    For every work directory under ``<destdir>/extract``, tries to populate
    ``<destdir>/site/works/<work_id>`` with page-ordered image symlinks (or
    images pulled out of a lone PDF).  Works already collated, or flagged
    virtual in ``meta.db``, are skipped.  A path in ``args.hints`` pins the
    exact subdirectory or PDF to use for its work instead of the automatic
    search.
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    extraction_dir = args.destdir / 'extract'
    # Map work_id -> hint path.  The work id is the top-level component of
    # the hint relative to the extraction dir; hints must therefore live at
    # least one level below their work directory (a bare work-dir hint would
    # raise IndexError here).  NOTE(review): negative indexing of
    # Path.parents needs Python 3.10+ — confirm target version.
    hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints}

    for work_path in extraction_dir.iterdir():
        work_id = work_path.name

        collation_dir = args.destdir / 'site' / 'works' / work_id
        if collation_dir.exists():
            continue

        virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
        if virtual == (1,):
            continue

        if work_id in hint_map:
            # An explicit hint overrides the directory search below.
            hint = hint_map[work_id]
            entries = list(hint.iterdir()) if hint.is_dir() else [hint]
        else:
            # Descend through chains of single-subdirectory wrappers until
            # reaching a level with actual content.
            search_dir = work_path
            while True:
                entries = list(search_dir.iterdir())
                if len(entries) == 1 and entries[0].is_dir():
                    search_dir = entries[0]
                else:
                    break

        if not entries:
            print(f'{work_id} contains no files? skipping')
            continue

        if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
            print(f'Extracting images from {entries[0]} for {work_id}')
            link_pdf(entries[0], collation_dir)
            continue

        if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
            ordering = complete_prefix_number_ordering(entries)
            if ordering:
                print(f'Symlinking image files for {work_id}')
                link_ordered_files(ordering, collation_dir)
                continue

        print(f'Unable to deduce file structure for {work_id}, skipping')

    con.close()
|
def manual_collate(args):
    """Collate a single work from an explicitly supplied list of paths.

    Paths in ``args.paths`` are processed in order while a running page
    index advances: a directory contributes all of its image files
    (numeric-prefix ordering when deducible, lexicographic otherwise), a
    lone image file is linked as the next page, and a PDF has its pages
    extracted.  Processing stops at the first path of unknown type.
    """
    collation_dir = args.destdir / 'site' / 'works' / args.work_id
    if collation_dir.exists() and len(list(collation_dir.iterdir())) > 0:
        print(f'Collation directory already exists!')
        return
    collation_dir.mkdir(parents=True, exist_ok=True)

    index = 0  # next page number to assign
    for path in args.paths:
        if path.is_dir():
            images = [p for p in path.iterdir() if p.suffix.lower() in IMAGE_FILE_EXTENSIONS]
            ordering = complete_prefix_number_ordering(images)
            if ordering is None:
                # No numbering scheme fits; fall back to lexicographic order.
                ordering = images
                ordering.sort()
            link_ordered_files(ordering, collation_dir, start_index=index)
            index += len(ordering)
        elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
            link_ordered_files([path], collation_dir, start_index=index)
            index += 1
        elif path.suffix.lower() == ".pdf":
            link_pdf(path, collation_dir, start_index=index)
            # Re-open the PDF just to count its pages for the running index.
            with fitz.open(path) as pdf:
                index += pdf.page_count
        else:
            print(f'Unknown file type {path}, stopping')
            return
||||||
def metadata(args):
|
def metadata(args):
|
||||||
con = sqlite3.connect(args.destdir / 'meta.db')
|
con = sqlite3.connect(args.destdir / 'meta.db')
|
||||||
cur = con.cursor()
|
cur = con.cursor()
|
||||||
|
@ -251,8 +286,26 @@ parser_fetch = subparsers.add_parser('fetch', help='fetch metadata and thumbnail
|
||||||
parser_fetch.set_defaults(func=fetch)
|
parser_fetch.set_defaults(func=fetch)
|
||||||
|
|
||||||
parser_collate = subparsers.add_parser('collate', help='collate a single sequence of image files for each work')
|
parser_collate = subparsers.add_parser('collate', help='collate a single sequence of image files for each work')
|
||||||
|
parser_collate.add_argument(
|
||||||
|
'hints',
|
||||||
|
metavar='PATH',
|
||||||
|
type=Path,
|
||||||
|
nargs='*',
|
||||||
|
help='manually-specified paths of subdirectories or PDFs within extraction folders, at most one per work',
|
||||||
|
)
|
||||||
parser_collate.set_defaults(func=collate)
|
parser_collate.set_defaults(func=collate)
|
||||||
|
|
||||||
|
parser_manual_collate = subparsers.add_parser('manual-collate', help='collate a specific work manually, specifying all paths to include')
|
||||||
|
parser_manual_collate.add_argument('work_id')
|
||||||
|
parser_manual_collate.add_argument(
|
||||||
|
'paths',
|
||||||
|
metavar='PATH',
|
||||||
|
type=Path,
|
||||||
|
nargs='+',
|
||||||
|
help='paths of files (images to symlink, pdfs to extract) or directories (symlink all images in the directory, no recursion, best-effort sorting)'
|
||||||
|
)
|
||||||
|
parser_manual_collate.set_defaults(func=manual_collate)
|
||||||
|
|
||||||
parser_metadata = subparsers.add_parser('metadata', help='view or modify metadata for a work')
|
parser_metadata = subparsers.add_parser('metadata', help='view or modify metadata for a work')
|
||||||
parser_metadata.add_argument('work_id')
|
parser_metadata.add_argument('work_id')
|
||||||
parser_metadata.add_argument(
|
parser_metadata.add_argument(
|
||||||
|
|
Loading…
Reference in a new issue