From ecc110460a1ac1bed638af60168e1a0d836914b9 Mon Sep 17 00:00:00 2001
From: xenofem
Date: Sun, 21 Jan 2024 17:45:56 -0500
Subject: [PATCH] enumerate and link files

---
 dlibrary.py | 99 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 93 insertions(+), 6 deletions(-)

diff --git a/dlibrary.py b/dlibrary.py
index 735a799..d59a2cb 100644
--- a/dlibrary.py
+++ b/dlibrary.py
@@ -3,16 +3,22 @@
 import asyncio
 import os
 import os.path
+import re
 import sqlite3
+from urllib.parse import urlparse
 import zipfile
 
 from dlsite_async import DlsiteAPI
+# import fitz
+import requests
 
 ZIP_DIR = "./zip"
 EXTRACT_DIR = "./extract"
 SITE_DIR = "./site"
 DB_PATH = "./dlibrary.db"
 
+NUMBER_REGEX = re.compile('[0-9]+')
+
 def open_zipfile_with_encoding(path):
     try:
         return zipfile.ZipFile(path, metadata_encoding="utf-8")
@@ -81,10 +87,91 @@ async def populate_db(refresh=False):
             [{ "tag": tag, "work": work_id } for tag in (metadata.genre or [])],
         )
         con.commit()
+    con.close()
 
-def process(work_id):
-    fetch_metadata(work_id)
-    get_thumbnail(work_id)
-    select_files(work_id)
-    extract_files(work_id)
-    
+def url_file_ext(url):
+    return os.path.splitext(urlparse(url).path)[1]
+
+def get_thumbnails(refresh=False):
+    con = sqlite3.connect(DB_PATH)
+    cur = con.cursor()
+
+    for (work_id, thumbnail_url) in cur.execute("SELECT id, thumbnail_url FROM works"):
+        if thumbnail_url.startswith('//'):
+            thumbnail_url = 'https:' + thumbnail_url
+        ext = url_file_ext(thumbnail_url)
+        dest_file = os.path.join(SITE_DIR, 'thumbnails', work_id + ext)
+        if not refresh:
+            if os.path.exists(dest_file):
+                print(f'Thumbnail for {work_id} is already cached, skipping')
+                continue
+
+        print(f'Downloading thumbnail for {work_id} from {thumbnail_url}')
+        with open(dest_file, 'wb') as fd:
+            with requests.get(thumbnail_url, stream=True) as r:
+                for chunk in r.iter_content(chunk_size=16384):
+                    fd.write(chunk)
+
+def link_files(work_id):
+    work_site_dir = os.path.join(SITE_DIR, "works", work_id)
+    work_images_dir = os.path.join(work_site_dir, "images")
+    os.makedirs(work_images_dir)
+
+    search_dir = os.path.join(EXTRACT_DIR, work_id)
+    while True:
+        entries = os.listdir(search_dir)
+        if len(entries) == 1:
+            entry_path = os.path.join(search_dir, entries[0])
+            if os.path.isdir(entry_path):
+                search_dir = entry_path
+                continue
+        break
+
+    if len(entries) == 1 and os.path.splitext(entry_path)[1] == ".pdf":
+        link_pdf(entry_path, work_images_dir)
+        return
+
+    if len(entries) == 0:
+        print(f'{work_id} contains no files? Skipping')
+        return
+
+    if all(os.path.isfile(os.path.join(search_dir, entry)) for entry in entries):
+        ordering = complete_prefix_number_ordering(entries)
+        if ordering:
+            link_ordered_files(search_dir, ordering, work_images_dir)
+            return
+
+    print(f'Unable to deduce file structure for {work_id}, skipping')
+
+def link_pdf(src, dest):
+    pass
+
+def complete_prefix_number_ordering(entries):
+    matches = reversed(list(NUMBER_REGEX.finditer(entries[0])))
+    for m in matches:
+        pos = m.start()
+        prefix = entries[0][:pos]
+        if all(e.startswith(prefix) for e in entries):
+            entries_with_indices = []
+            indices = set()
+            for e in entries:
+                n = NUMBER_REGEX.match(e[pos:])
+                if n is None:
+                    return None
+                i = int(n.group())
+                if i in indices:
+                    return None
+                indices.add(i)
+                entries_with_indices.append((e, i))
+            entries_with_indices.sort(key=lambda ei: ei[1])
+            return [e for (e, i) in entries_with_indices]
+    return None
+
+def link_ordered_files(srcdir, ordering, dest):
+    for (idx, item) in enumerate(ordering):
+        ext = os.path.splitext(item)[1]
+        target = os.path.join(dest, f'{idx:04d}{ext}')
+        os.link(os.path.join(srcdir, item), target)
+
+def gen_site():
+    pass
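
Note (not part of the patch): a minimal driver sketch showing one way the functions added here might be wired together. populate_db, get_thumbnails, link_files, and gen_site come from dlibrary.py; the link_all_files and main wrappers below are hypothetical and only assume the works table created by populate_db.

    import asyncio
    import sqlite3

    def link_all_files():
        # hypothetical helper: run link_files for every work recorded in the DB
        con = sqlite3.connect(DB_PATH)
        for (work_id,) in con.execute("SELECT id FROM works"):
            link_files(work_id)
        con.close()

    def main():
        asyncio.run(populate_db())  # fetch metadata into dlibrary.db
        get_thumbnails()            # cache cover images under SITE_DIR/thumbnails
        link_all_files()            # hard-link extracted pages into SITE_DIR/works
        gen_site()                  # still a stub in this patch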