From ecc110460a1ac1bed638af60168e1a0d836914b9 Mon Sep 17 00:00:00 2001
From: xenofem
Date: Sun, 21 Jan 2024 17:45:56 -0500
Subject: [PATCH] enumerate and link files

---
 dlibrary.py | 99 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 93 insertions(+), 6 deletions(-)

diff --git a/dlibrary.py b/dlibrary.py
index 735a799..d59a2cb 100644
--- a/dlibrary.py
+++ b/dlibrary.py
@@ -3,16 +3,22 @@
 import asyncio
 import os
 import os.path
+import re
 import sqlite3
+from urllib.parse import urlparse
 import zipfile
 
 from dlsite_async import DlsiteAPI
+# import fitz
+import requests
 
 ZIP_DIR = "./zip"
 EXTRACT_DIR = "./extract"
 SITE_DIR = "./site"
 DB_PATH = "./dlibrary.db"
 
+NUMBER_REGEX = re.compile('[0-9]+')
+
 def open_zipfile_with_encoding(path):
     try:
         return zipfile.ZipFile(path, metadata_encoding="utf-8")
@@ -81,10 +87,91 @@ async def populate_db(refresh=False):
             [{ "tag": tag, "work": work_id } for tag in (metadata.genre or [])],
         )
         con.commit()
+    con.close()
 
-def process(work_id):
-    fetch_metadata(work_id)
-    get_thumbnail(work_id)
-    select_files(work_id)
-    extract_files(work_id)
-    
+def url_file_ext(url):
+    return os.path.splitext(urlparse(url).path)[1]
+
+def get_thumbnails(refresh=False):
+    con = sqlite3.connect(DB_PATH)
+    cur = con.cursor()
+
+    for (work_id, thumbnail_url) in cur.execute("SELECT id, thumbnail_url FROM works"):
+        if thumbnail_url.startswith('//'):
+            thumbnail_url = 'https:' + thumbnail_url
+        ext = url_file_ext(thumbnail_url)
+        dest_file = os.path.join(SITE_DIR, 'thumbnails', work_id + ext)
+        if not refresh:
+            if os.path.exists(dest_file):
+                print(f'Thumbnail for {work_id} is already cached, skipping')
+                continue
+
+        print(f'Downloading thumbnail for {work_id} from {thumbnail_url}')
+        with open(dest_file, 'wb') as fd:
+            with requests.get(thumbnail_url, stream=True) as r:
+                for chunk in r.iter_content(chunk_size=16384):
+                    fd.write(chunk)
+
+def link_files(work_id):
+    work_site_dir = os.path.join(SITE_DIR, "works", work_id)
+    work_images_dir = os.path.join(work_site_dir, "images")
+    os.makedirs(work_images_dir)
+
+    search_dir = os.path.join(EXTRACT_DIR, work_id)
+    while True:
+        entries = os.listdir(search_dir)
+        if len(entries) == 1:
+            entry_path = os.path.join(search_dir, entries[0])
+            if os.path.isdir(entry_path):
+                search_dir = entry_path
+                continue
+        break
+
+    if len(entries) == 1 and os.path.splitext(entry_path)[1] == ".pdf":
+        link_pdf(entry_path, work_images_dir)
+        return
+
+    if len(entries) == 0:
+        print(f'{work_id} contains no files? Skipping')
+        return
+
+    if all(os.path.isfile(os.path.join(search_dir, entry)) for entry in entries):
+        ordering = complete_prefix_number_ordering(entries)
+        if ordering:
+            link_ordered_files(search_dir, ordering, work_images_dir)
+            return
+
+    print(f'Unable to deduce file structure for {work_id}, skipping')
+
+def link_pdf(src, dest):
+    pass
+
+def complete_prefix_number_ordering(entries):
+    matches = reversed(list(NUMBER_REGEX.finditer(entries[0])))
+    for m in matches:
+        pos = m.start()
+        prefix = entries[0][:pos]
+        if all(e.startswith(prefix) for e in entries):
+            entries_with_indices = []
+            indices = set()
+            for e in entries:
+                n = NUMBER_REGEX.match(e[pos:])
+                if n is None:
+                    return None
+                i = int(n.group())
+                if i in indices:
+                    return None
+                indices.add(i)
+                entries_with_indices.append((e, i))
+            entries_with_indices.sort(key=lambda ei: ei[1])
+            return [e for (e, i) in entries_with_indices]
+    return None
+
+def link_ordered_files(srcdir, ordering, dest):
+    for (idx, item) in enumerate(ordering):
+        ext = os.path.splitext(item)[1]
+        target = os.path.join(dest, f'{idx:04d}{ext}')
+        os.link(os.path.join(srcdir, item), target)
+
+def gen_site():
+    pass
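
Note (not part of the patch): a minimal driver sketch showing one way the functions added here might be wired together. populate_db, get_thumbnails, link_files, and gen_site come from dlibrary.py; the link_all_files and main wrappers below are hypothetical and only assume the works table created by populate_db.

    import asyncio
    import sqlite3

    def link_all_files():
        # hypothetical helper: run link_files for every work recorded in the DB
        con = sqlite3.connect(DB_PATH)
        for (work_id,) in con.execute("SELECT id FROM works"):
            link_files(work_id)
        con.close()

    def main():
        asyncio.run(populate_db())  # fetch metadata into dlibrary.db
        get_thumbnails()            # cache cover images under SITE_DIR/thumbnails
        link_all_files()            # hard-link extracted pages into SITE_DIR/works
        gen_site()                  # still a stub in this patch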