starting dlsite curator script

2024-01-21 05:13:09 -05:00 · 2024-01-21 05:13:09 -05:00 · 12529498a1
commit 12529498a1
2 changed files with 91 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+env
--- a/dlibrary.py
+++ b/dlibrary.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+
+import asyncio
+import os
+import os.path
+import sqlite3
+import zipfile
+
+from dlsite_async import DlsiteAPI
+
+# Directory that extract_all() scans for ``.zip`` archives.
+ZIP_DIR = "./zip"
+# Parent directory for extracted works: extract() creates one sub-directory
+# per archive here, and populate_db() lists it to obtain the work ids.
+EXTRACT_DIR = "./extract"
+# NOTE(review): not referenced by any code visible in this diff; presumably
+# the output directory for a generated site -- confirm.
+SITE_DIR = "./site"
+# Path of the SQLite database file opened by populate_db().
+DB_PATH = "./dlibrary.db"
+
+def open_zipfile_with_encoding(path):
+    """Open the zip archive at ``path``, trying several member-name encodings.
+
+    ``utf-8`` and then ``shift-jis`` are tried as ``metadata_encoding``; an
+    attempt that raises ``UnicodeDecodeError`` falls through to the next one.
+    The last attempt uses ``shift-jisx0213`` and lets any error propagate.
+
+    Returns the opened ``zipfile.ZipFile``; the caller is expected to close it
+    (for example by using it as a context manager).
+    """
+    for candidate in ("utf-8", "shift-jis"):
+        try:
+            archive = zipfile.ZipFile(path, metadata_encoding=candidate)
+        except UnicodeDecodeError:
+            continue
+        return archive
+
+    # Last resort: deliberately unguarded so a failure surfaces to the caller.
+    return zipfile.ZipFile(path, metadata_encoding="shift-jisx0213")
+
+def extract(zip_path, remove=False):
+    """Unpack one archive into its own directory under ``EXTRACT_DIR``.
+
+    The target directory is named after the archive's file name with its
+    extension stripped. It is created with ``os.makedirs`` without
+    ``exist_ok``, so an already existing target raises ``FileExistsError``.
+
+    If ``remove`` is true, the archive is deleted once extraction has finished.
+    """
+    archive_name = os.path.basename(zip_path)
+    work_id, _extension = os.path.splitext(archive_name)
+    destination = os.path.join(EXTRACT_DIR, work_id)
+    os.makedirs(destination)
+
+    with open_zipfile_with_encoding(zip_path) as archive:
+        archive.extractall(path=destination)
+
+    if not remove:
+        return
+    os.remove(zip_path)
+
+def extract_all(remove=False):
+    """Extract every entry of ``ZIP_DIR`` whose name ends in ``.zip``.
+
+    Each archive is announced on stdout and handed to ``extract``; ``remove``
+    is forwarded unchanged.
+    """
+    archive_names = [name for name in os.listdir(ZIP_DIR) if name.endswith('.zip')]
+    for name in archive_names:
+        print(f'Extracting {name}')
+        extract(os.path.join(ZIP_DIR, name), remove=remove)
+
+async def populate_db(refresh=False):
+    """Fetch metadata for every entry of ``EXTRACT_DIR`` and store it in SQLite.
+
+    Each directory entry name is used as a work id and looked up through
+    ``DlsiteAPI.get_work``. The result is written to the ``works``, ``authors``
+    and ``tags`` tables of the database at ``DB_PATH``, which are created on
+    first use. Changes are committed once per work.
+
+    Args:
+        refresh: When false (the default), works that already have a row in
+            ``works`` are skipped. When true, metadata is fetched again and
+            replaces whatever was stored for that work.
+    """
+    con = sqlite3.connect(DB_PATH)
+    try:
+        cur = con.cursor()
+
+        cur.execute("CREATE TABLE IF NOT EXISTS works(id TEXT PRIMARY KEY, title TEXT, circle TEXT, date TEXT, description TEXT, thumbnail_url TEXT)")
+        cur.execute("CREATE TABLE IF NOT EXISTS authors(author TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id))")
+        cur.execute("CREATE TABLE IF NOT EXISTS tags(tag TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id))")
+
+        async with DlsiteAPI() as api:
+            for work_id in os.listdir(EXTRACT_DIR):
+                if not refresh:
+                    res = cur.execute("SELECT id FROM works WHERE id = ?", (work_id,))
+                    if res.fetchone() is not None:
+                        print(f'Metadata for {work_id} is already cached, skipping')
+                        continue
+
+                print(f'Fetching metadata for {work_id}')
+                metadata = await api.get_work(work_id)
+
+                # When refreshing, the work may already be cached: drop its old
+                # author/tag rows so they are not duplicated, and replace the
+                # works row instead of violating the primary key on id.
+                cur.execute("DELETE FROM authors WHERE work = ?", (work_id,))
+                cur.execute("DELETE FROM tags WHERE work = ?", (work_id,))
+                cur.execute(
+                    "INSERT OR REPLACE INTO works VALUES(:id, :title, :circle, :date, :description, :thumbnail_url)",
+                    {
+                        "id": work_id,
+                        "title": metadata.work_name,
+                        "circle": metadata.circle,
+                        "date": metadata.regist_date.date().isoformat(),
+                        "description": metadata.description,
+                        "thumbnail_url": metadata.work_image,
+                    },
+                )
+                cur.executemany(
+                    "INSERT INTO authors VALUES(:author, :work)",
+                    [{ "author": author, "work": work_id } for author in (metadata.author or [])],
+                )
+                cur.executemany(
+                    "INSERT INTO tags VALUES(:tag, :work)",
+                    [{ "tag": tag, "work": work_id } for tag in (metadata.genre or [])],
+                )
+                con.commit()
+    finally:
+        # Always release the database handle; closing without a commit
+        # discards the changes of a work that failed part-way through.
+        con.close()
+
+def process(work_id):
+    """Run the per-work steps for ``work_id``, one after another.
+
+    NOTE(review): none of the four helpers called below are defined in the
+    visible part of this file, so calling this raises ``NameError`` unless
+    they are provided elsewhere -- confirm before relying on it.
+    """
+    fetch_metadata(work_id)
+    get_thumbnail(work_id)
+    select_files(work_id)
+    extract_files(work_id)
+