#!/usr/bin/env python3
"""Curator script for a local library of DLsite works.

Extracts zipped works from ZIP_DIR into per-work directories under
EXTRACT_DIR, and caches each work's DLsite metadata (title, circle,
authors, tags, ...) in a local sqlite database at DB_PATH.
"""

import asyncio
import os
import os.path
import sqlite3
import zipfile

from dlsite_async import DlsiteAPI

ZIP_DIR = "./zip"
EXTRACT_DIR = "./extract"
SITE_DIR = "./site"
DB_PATH = "./dlibrary.db"

# Member-name encodings to try, in order of preference. DLsite zips are
# frequently Shift-JIS rather than UTF-8.
_ZIP_NAME_ENCODINGS = ("utf-8", "shift-jis", "shift-jisx0213")


def open_zipfile_with_encoding(path):
    """Open the zip at *path*, guessing the filename encoding.

    Tries each encoding in _ZIP_NAME_ENCODINGS in turn; a
    UnicodeDecodeError from the last one propagates to the caller.
    """
    for encoding in _ZIP_NAME_ENCODINGS[:-1]:
        try:
            return zipfile.ZipFile(path, metadata_encoding=encoding)
        except UnicodeDecodeError:
            pass
    # Final attempt: let any decoding failure raise.
    return zipfile.ZipFile(path, metadata_encoding=_ZIP_NAME_ENCODINGS[-1])


def extract(zip_path, remove=False):
    """Extract one work's zip into EXTRACT_DIR/<work_id>/.

    The work id is the zip's basename without extension (DLsite ids like
    RJ123456). If *remove* is true, the zip is deleted after a
    successful extraction.
    """
    work_id = os.path.splitext(os.path.basename(zip_path))[0]
    work_extract_path = os.path.join(EXTRACT_DIR, work_id)
    # NOTE(review): no exist_ok — this intentionally(?) raises if the work
    # was already extracted, preventing a silent double-extract. Confirm
    # this is the desired re-run behavior.
    os.makedirs(work_extract_path)

    with open_zipfile_with_encoding(zip_path) as z:
        z.extractall(path=work_extract_path)

    if remove:
        os.remove(zip_path)


def extract_all(remove=False):
    """Extract every *.zip in ZIP_DIR; optionally delete each after extraction."""
    for f in os.listdir(ZIP_DIR):
        if f.endswith('.zip'):
            print(f'Extracting {f}')
            extract(os.path.join(ZIP_DIR, f), remove=remove)


async def populate_db(refresh=False):
    """Fetch DLsite metadata for every extracted work and cache it in sqlite.

    Works already present in the `works` table are skipped unless
    *refresh* is true. Commits after each work so progress survives an
    interruption; the connection is always closed.
    """
    con = sqlite3.connect(DB_PATH)
    try:
        cur = con.cursor()

        # NOTE(review): FOREIGN KEY clauses are declared but sqlite does not
        # enforce them without `PRAGMA foreign_keys = ON` — confirm whether
        # enforcement is wanted.
        cur.execute("CREATE TABLE IF NOT EXISTS works(id TEXT PRIMARY KEY, title TEXT, circle TEXT, date TEXT, description TEXT, thumbnail_url TEXT)")
        cur.execute("CREATE TABLE IF NOT EXISTS authors(author TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id))")
        cur.execute("CREATE TABLE IF NOT EXISTS tags(tag TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id))")

        async with DlsiteAPI() as api:
            for work_id in os.listdir(EXTRACT_DIR):
                if not refresh:
                    res = cur.execute("SELECT id FROM works WHERE id = ?", (work_id,))
                    if res.fetchone() is not None:
                        print(f'Metadata for {work_id} is already cached, skipping')
                        continue

                print(f'Fetching metadata for {work_id}')
                metadata = await api.get_work(work_id)
                cur.execute(
                    "INSERT INTO works VALUES(:id, :title, :circle, :date, :description, :thumbnail_url)",
                    {
                        "id": work_id,
                        "title": metadata.work_name,
                        "circle": metadata.circle,
                        "date": metadata.regist_date.date().isoformat(),
                        "description": metadata.description,
                        "thumbnail_url": metadata.work_image,
                    },
                )
                # author/genre may be None on some works; treat as empty.
                cur.executemany(
                    "INSERT INTO authors VALUES(:author, :work)",
                    [{ "author": author, "work": work_id } for author in (metadata.author or [])],
                )
                cur.executemany(
                    "INSERT INTO tags VALUES(:tag, :work)",
                    [{ "tag": tag, "work": work_id } for tag in (metadata.genre or [])],
                )
                # Commit per work so a crash keeps everything fetched so far.
                con.commit()
    finally:
        con.close()


def process(work_id):
    """Run the full pipeline for a single work.

    NOTE(review): work-in-progress stub — fetch_metadata, get_thumbnail,
    select_files, and extract_files are not defined anywhere in this file
    yet; calling this will raise NameError until they are implemented.
    """
    fetch_metadata(work_id)
    get_thumbnail(work_id)
    select_files(work_id)
    extract_files(work_id)