Compare commits

..

No commits in common. "657ec65e4a3d788f42af8b5511789f0ead101e93" and "d66b467c5c037de659ffa7832fe30f6a5a03c0c1" have entirely different histories.

View file

@ -25,7 +25,7 @@ FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
FAKKU_ID_REGEX = re.compile('.*_FAKKU$') FAKKU_ID_REGEX = re.compile('.*_FAKKU$')
TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless') TEXTLESS_REGEX = re.compile('(台詞|セリフ)(な|無)し|notext|textless')
ALT_VERSIONS = ['褐色', '日焼け', 'pink'] ALT_VERSIONS = ['褐色', '日焼け']
IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff'] IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff']
@ -179,12 +179,12 @@ def image_xrefs(pdf):
print('\nSuccess') print('\nSuccess')
return xrefs return xrefs
def link_pdf(src, dest, start_index): def link_pdf(src, dest, start_index=0):
with fitz.open(src) as pdf: with fitz.open(src) as pdf:
xrefs = image_xrefs(pdf) xrefs = image_xrefs(pdf)
if xrefs is None: if xrefs is None:
print(f'Support for weirder PDFs not yet implemented, skipping {src}') print(f'Support for weirder PDFs not yet implemented, skipping {src}')
return None return
dest.mkdir(parents=True, exist_ok=True) dest.mkdir(parents=True, exist_ok=True)
for (idx, xref) in enumerate(xrefs, start=start_index): for (idx, xref) in enumerate(xrefs, start=start_index):
@ -193,8 +193,6 @@ def link_pdf(src, dest, start_index):
with open(file_path, 'wb') as f: with open(file_path, 'wb') as f:
f.write(image["image"]) f.write(image["image"])
return pdf.page_count
def complete_prefix_number_ordering(entries): def complete_prefix_number_ordering(entries):
if len(entries) == 1: if len(entries) == 1:
return entries return entries
@ -204,7 +202,7 @@ def complete_prefix_number_ordering(entries):
version = next(ver for ver in (ALT_VERSIONS + ['']) if ver in entry.name) version = next(ver for ver in (ALT_VERSIONS + ['']) if ver in entry.name)
entries_by_version.setdefault(version, []).append(entry) entries_by_version.setdefault(version, []).append(entry)
numberings_by_version = {ver: unique_hierarchical_prefix_numbering(entries_by_version[ver]) for ver in entries_by_version} numberings_by_version = {ver: prefix_numbering(entries_by_version[ver]) for ver in entries_by_version}
unified_indices = set() unified_indices = set()
for numbering in numberings_by_version.values(): for numbering in numberings_by_version.values():
@ -214,15 +212,8 @@ def complete_prefix_number_ordering(entries):
unified_indices = list(unified_indices) unified_indices = list(unified_indices)
unified_indices.sort() unified_indices.sort()
if len(unified_indices) > 1: if len(unified_indices) > 1 and min(unified_indices[i] - unified_indices[i-1] for i in range(1, len(unified_indices))) > 2:
for i in range(1, len(unified_indices)):
cur = unified_indices[i]
prev = unified_indices[i-1]
for level in range(min(len(cur), len(prev))):
if cur[level] != prev[level]:
if cur[level] - prev[level] > 2:
return None return None
break
versions = list(numberings_by_version.keys()) versions = list(numberings_by_version.keys())
versions.sort() versions.sort()
@ -240,57 +231,31 @@ def complete_prefix_number_ordering(entries):
for out_ver in outer_versions: for out_ver in outer_versions:
for i in unified_indices: for i in unified_indices:
for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])): for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])):
result += numberings_by_version[ver].get(i, []) entries_i_ver = numberings_by_version[ver].get(i, [])
if len(entries_i_ver) <= 1:
result += entries_i_ver
else:
return None
return result return result
def unique_hierarchical_prefix_numbering(entries, start_point=0): def prefix_numbering(entries):
matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name))) matches = reversed(list(NUMBER_REGEX.finditer(entries[0].name)))
for m in matches: for m in matches:
pos = m.start() pos = m.start()
if pos < start_point:
return None
prefix = entries[0].name[:pos] prefix = entries[0].name[:pos]
if all(e.name.startswith(prefix) for e in entries): if all(e.name.startswith(prefix) for e in entries):
numbering = {} entries_by_index = {}
for e in entries: for e in entries:
n = NUMBER_REGEX.match(e.name[pos:]) n = NUMBER_REGEX.match(e.name[pos:])
if n is None: if n is None:
return None return None
i = int(n.group()) i = int(n.group())
numbering.setdefault((i,), []).append(e) entries_by_index.setdefault(i, []).append(e)
return entries_by_index
indices = list(numbering.keys())
for idx in indices:
if len(numbering[idx]) > 1:
ents_idx = numbering.pop(idx)
next_layer_start = pos + NUMBER_REGEX.match(ents_idx[0].name[pos:]).end()
sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
if not sub_numbering:
return None
for sub_idx in sub_numbering:
numbering[(*idx, *sub_idx)] = sub_numbering[sub_idx]
return numbering
return None return None
def alphabetic_numbering(entries, start_point): def link_ordered_files(ordering, dest, start_index=0):
alphabetized = {}
for entry in entries:
ending = entry.stem[start_point:]
if len(ending) > 1:
return None
index = 0 if ending == '' else ord(ending.lower()) - ord('a')
if index in alphabetized:
return None
alphabetized[(index,)] = [entry]
indices = list(alphabetized.keys())
indices.sort()
if indices != [(i,) for i in range(len(indices))]:
return None
return alphabetized
def link_ordered_files(ordering, dest, start_index):
dest.mkdir(parents=True, exist_ok=True) dest.mkdir(parents=True, exist_ok=True)
for (idx, src_path) in enumerate(ordering, start=start_index): for (idx, src_path) in enumerate(ordering, start=start_index):
@ -311,9 +276,6 @@ def collate(args):
extraction_dir = args.destdir / 'extract' extraction_dir = args.destdir / 'extract'
hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints} hint_map = {hint.relative_to(extraction_dir).parents[-2].name: hint for hint in args.hints}
collation_staging_area = args.destdir / 'site' / 'images-staging'
collation_staging_area.mkdir(parents=True)
for work_path in extraction_dir.iterdir(): for work_path in extraction_dir.iterdir():
work_id = work_path.name work_id = work_path.name
@ -325,63 +287,50 @@ def collate(args):
if virtual == (1,): if virtual == (1,):
continue continue
work_staging_dir = collation_staging_area / work_id if work_id in hint_map:
hint = hint_map[work_id]
pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0) entries = [hint] if hint.is_file() else ls_ignore(hint)
if pages_collated:
print(f'Collated {pages_collated} pages for {work_id}')
work_staging_dir.rename(collation_dir)
else: else:
if work_staging_dir.is_dir(): search_dir = work_path
for f in work_staging_dir.iterdir(): while True:
f.unlink() entries = ls_ignore(search_dir)
work_staging_dir.rmdir() if len(entries) == 1 and entries[0].is_dir():
search_dir = entries[0]
else:
break
if pages_collated == 0: if len(entries) == 1 and entries[0].suffix.lower() == '.pdf':
print(f'Extracting images from {entries[0]} for {work_id}')
link_pdf(entries[0], collation_dir)
continue
if len(entries) == 0:
print(f'{work_id} contains no files? skipping') print(f'{work_id} contains no files? skipping')
elif pages_collated is None: continue
print(f'Unable to deduce file structure for {work_id}, skipping')
collation_staging_area.rmdir()
con.close()
def collate_from_paths(srcs, dest, start_index):
if len(srcs) == 1 and srcs[0].is_dir():
return collate_from_paths(ls_ignore(srcs[0]), dest, start_index)
if len(srcs) == 1 and srcs[0].suffix.lower() == '.pdf':
print(f'Extracting images from {srcs[0]}')
return link_pdf(srcs[0], dest, start_index)
if len(srcs) == 0:
return 0
if all(entry.is_file() and entry.suffix.lower() in IMAGE_FILE_EXTENSIONS for entry in entries):
ordering = complete_prefix_number_ordering(entries)
if not ordering:
with_text = [] with_text = []
textless = [] textless = []
for src in srcs: for entry in entries:
if TEXTLESS_REGEX.search(src.name): if TEXTLESS_REGEX.search(entry.name):
textless.append(src) textless.append(entry)
else: else:
with_text.append(src) with_text.append(entry)
if with_text and textless: if with_text and textless:
text_pages = collate_from_paths(with_text, dest, start_index) with_text_ordering = complete_prefix_number_ordering(with_text)
if text_pages is None: textless_ordering = complete_prefix_number_ordering(textless)
return None if with_text_ordering and textless_ordering:
textless_pages = collate_from_paths(textless, dest, start_index+text_pages) ordering = with_text_ordering + textless_ordering
if textless_pages is None:
return None
return text_pages + textless_pages
if all(src.is_file() and src.suffix.lower() in IMAGE_FILE_EXTENSIONS for src in srcs):
ordering = complete_prefix_number_ordering(srcs)
if ordering: if ordering:
print(f'Symlinking image files: {ordering[0]}...') print(f'Symlinking image files for {work_id}')
link_ordered_files(ordering, dest, start_index) link_ordered_files(ordering, collation_dir)
return len(ordering) continue
else:
return None
return None print(f'Unable to deduce file structure for {work_id}, skipping')
con.close()
def self_and_parents(path): def self_and_parents(path):
return [path] + list(path.parents) return [path] + list(path.parents)
@ -409,16 +358,15 @@ def manual_collate(args):
if ordering is None: if ordering is None:
ordering = entries ordering = entries
ordering.sort() ordering.sort()
link_ordered_files(ordering, collation_dir, index) link_ordered_files(ordering, collation_dir, start_index=index)
index += len(ordering) index += len(ordering)
elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS: elif path.suffix.lower() in IMAGE_FILE_EXTENSIONS:
link_ordered_files([path], collation_dir, index) link_ordered_files([path], collation_dir, start_index=index)
index += 1 index += 1
elif path.suffix.lower() == ".pdf": elif path.suffix.lower() == ".pdf":
pdf_page_count = link_pdf(path, collation_dir, index) link_pdf(path, collation_dir, start_index=index)
if pdf_page_count is None: with fitz.open(path) as pdf:
return index += pdf.page_count
index += pdf_page_count
else: else:
print(f'Unknown file type {path}, stopping') print(f'Unknown file type {path}, stopping')
return return