2024-01-21 05:13:09 -05:00
#!/usr/bin/env python3
2024-01-22 02:16:06 -05:00
import argparse
2024-01-21 05:13:09 -05:00
import asyncio
2024-01-22 22:06:04 -05:00
import importlib_resources as resources
2024-01-22 02:16:06 -05:00
from pathlib import Path
2024-02-06 23:01:59 -05:00
import os
2024-01-22 02:16:06 -05:00
from os . path import relpath , splitext
2024-01-21 17:45:56 -05:00
import re
2024-02-07 00:24:30 -05:00
import readline
2024-01-22 07:01:41 -05:00
import shutil
2024-01-21 05:13:09 -05:00
import sqlite3
2024-01-23 15:54:17 -05:00
import textwrap
2024-01-21 17:45:56 -05:00
from urllib . parse import urlparse
2024-01-21 05:13:09 -05:00
import zipfile
from dlsite_async import DlsiteAPI
2024-01-22 02:16:06 -05:00
import fitz
2024-02-06 23:01:59 -05:00
from PIL import Image
2024-01-22 22:06:04 -05:00
from jinja2 import Environment , PackageLoader , select_autoescape
2024-01-21 17:45:56 -05:00
import requests
2024-01-21 05:13:09 -05:00
2024-02-08 04:54:36 -05:00
# Matches runs of ASCII or fullwidth digits; used to find numeric page
# prefixes when deducing page order.
NUMBER_REGEX = re.compile('[0-9０-９]+')

# Work-ID formats for the storefronts we recognize.
DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$')
FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
FAKKU_ID_REGEX = re.compile('.*_FAKKU$')

# Filename keywords used to recognize and order alternate page sets
# during collation (textless pages, epilogues, hi-res dirs, covers).
TEXTLESS_REGEX = re.compile('(台詞|セリフ|せりふ|テキスト|文字)((な|無)し|抜き)|notext|textless', re.IGNORECASE)
EPILOGUE_REGEX = re.compile('after|後日談', re.IGNORECASE)
HI_RES_REGEX = re.compile('高解像度', re.IGNORECASE)
FRONT_COVER_REGEX = re.compile('(^|[^裏])表紙|cover|hyoushi', re.IGNORECASE)
BACK_COVER_REGEX = re.compile('裏表紙', re.IGNORECASE)

# Keywords marking "alternate version" page sets (skin tone, hair color,
# etc.); each entry maps to one bit of a version code so variants can be
# grouped and interleaved consistently.
ALT_VERSIONS = [
    '褐色',
    '日焼け',
    'pink',
    '金髪',
    '白肌',
    'うつろ目',
    'dark skin',
    'ラバー',
    'ゾンビ肌',
    'マスク',
    'アヘ顔',
]

# File extensions treated as page images.
IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']

# Files and extensions skipped entirely when walking extracted works.
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store']
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']
def open_zipfile_with_encoding ( path ) :
try :
return zipfile . ZipFile ( path , metadata_encoding = " utf-8 " )
except UnicodeDecodeError :
pass
try :
return zipfile . ZipFile ( path , metadata_encoding = " shift-jis " )
except UnicodeDecodeError :
pass
return zipfile . ZipFile ( path , metadata_encoding = " shift-jisx0213 " )
2024-01-22 02:16:06 -05:00
def extract(args):
    """Unpack each zip in args.zipfiles into <destdir>/extract/<zip stem>/.

    Deletes the source archive afterwards when args.remove is set.
    """
    for zip_path in args.zipfiles:
        work_id = zip_path.stem
        target = args.destdir / 'extract' / work_id
        # Deliberately not exist_ok: refuses to re-extract over a work.
        target.mkdir(parents=True)

        print(f'Extracting {zip_path} to {target}')

        with open_zipfile_with_encoding(zip_path) as archive:
            archive.extractall(path=target)

        if args.remove:
            zip_path.unlink()
def manual_input_metadata(work_id):
    """Prompt interactively for a work's metadata.

    Returns a dict shaped like the DB row, plus 'authors' and 'tags' lists
    (which fetch_async pops off before inserting).
    """
    print(f"Don't know how to fetch metadata for {work_id}, input manually:")

    def _csv_list(prompt):
        # Split a comma-separated answer into stripped, non-empty items.
        return [part.strip() for part in input(prompt).split(',') if part.strip()]

    return {
        "id": work_id,
        "title": input('Title: '),
        "circle": input('Circle [None]: ') or None,
        "authors": _csv_list('Authors (comma-separated): '),
        "tags": _csv_list('Tags (comma-separated): '),
        "date": input('Pub date (yyyy-mm-dd): '),
        "description": input('Description: '),
        "series": input('Series [None]: ') or None,
    }
async def fetch_async(args):
    """Fetch metadata for every extracted work not yet present in meta.db.

    Creates the schema on first run. DLSite ids (RJ/BJ...) are fetched
    automatically; all other ids fall back to manual entry, with per-store
    thumbnail handling. Thumbnails are downloaded into site/thumbnails.
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    cur.execute("CREATE TABLE IF NOT EXISTS works(id TEXT PRIMARY KEY, title TEXT, circle TEXT, date TEXT, description TEXT, series TEXT, virtual INT)")
    cur.execute("CREATE TABLE IF NOT EXISTS authors(author TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id), PRIMARY KEY(author, work))")
    cur.execute("CREATE TABLE IF NOT EXISTS tags(tag TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id), PRIMARY KEY(tag, work))")

    thumbnails_dir = args.destdir / 'site' / 'thumbnails'
    thumbnails_dir.mkdir(parents=True, exist_ok=True)

    async with DlsiteAPI(locale=args.locale) as api:
        for work_path in (args.destdir / 'extract').iterdir():
            work_id = work_path.name

            # Skip works whose metadata was already stored by a previous run.
            res = cur.execute("SELECT id FROM works WHERE id = ?", (work_id,))
            if res.fetchone() is not None:
                continue

            if DLSITE_ID_REGEX.fullmatch(work_id):
                print(f'Fetching DLSite metadata for {work_id}')
                dlsite_metadata = await api.get_work(work_id)
                db_row = {
                    "id": work_id,
                    "title": dlsite_metadata.work_name,
                    "circle": dlsite_metadata.circle,
                    "date": dlsite_metadata.regist_date.date().isoformat(),
                    "description": dlsite_metadata.description,
                    "series": dlsite_metadata.series,
                }
                authors = dlsite_metadata.author or []
                tags = dlsite_metadata.genre or []
                thumbnail_url = dlsite_metadata.work_image
                if thumbnail_url.startswith('//'):
                    # DLSite returns protocol-relative image URLs.
                    thumbnail_url = 'https:' + thumbnail_url
            else:
                db_row = manual_input_metadata(work_id)
                # The works table doesn't hold these; they go in their own tables.
                authors = db_row.pop('authors')
                tags = db_row.pop('tags')
                if FANZA_ID_REGEX.fullmatch(work_id):
                    # FANZA cover images live at a predictable URL.
                    thumbnail_url = f'https://doujin-assets.dmm.co.jp/digital/comic/{work_id}/{work_id}pl.jpg'
                elif FAKKU_ID_REGEX.fullmatch(work_id):
                    # No predictable FAKKU cover URL; fall back to first page later.
                    thumbnail_url = None
                else:
                    thumbnail_url = input('Thumbnail image URL [default: first page]: ')

            cur.execute(
                "INSERT INTO works(id, title, circle, date, description, series) VALUES(:id, :title, :circle, :date, :description, :series)",
                db_row,
            )
            cur.executemany(
                "INSERT INTO authors VALUES(:author, :work)",
                [{"author": author, "work": work_id} for author in authors],
            )
            cur.executemany(
                "INSERT INTO tags VALUES(:tag, :work)",
                [{"tag": tag, "work": work_id} for tag in tags],
            )

            if thumbnail_url:
                ext = url_file_ext(thumbnail_url)
                dest_file = thumbnails_dir / (work_id + ext)
                print(f'Downloading thumbnail for {work_id} from {thumbnail_url}')
                with open(dest_file, 'wb') as fd:
                    with requests.get(thumbnail_url, stream=True) as r:
                        for chunk in r.iter_content(chunk_size=16384):
                            fd.write(chunk)

            # NOTE(review): committing per work so progress survives an
            # interrupted run — confirm commit placement against history.
            con.commit()

    con.close()
def url_file_ext(url):
    """Return the file extension (dot included) of a URL's path component."""
    path_part = urlparse(url).path
    return splitext(path_part)[1]
def fetch(args):
    """Synchronous CLI entry point wrapping fetch_async."""
    coroutine = fetch_async(args)
    asyncio.run(coroutine)
def image_xrefs(pdf):
    """Return one image xref per page of *pdf*, or None if any page differs.

    Fast path: page.get_images() (from the page's resource dicts). If any
    page doesn't list exactly one image, fall back to the much slower
    per-page rendering scan, which also rules out inline (xref 0) images.
    """
    images_by_page = [page.get_images() for page in pdf]
    if all(len(images) == 1 for images in images_by_page):
        # get_images() tuples start with the xref.
        return [images[0][0] for images in images_by_page]

    print("Checking PDF images the quick way failed, trying the slow way")
    xrefs = []
    for (idx, page) in enumerate(pdf):
        # \x1b[2K\r: clear the line and rewrite the progress counter in place.
        print(f'\x1b[2K\r{idx}/{pdf.page_count} pages processed...', end='')
        images = page.get_image_info(xrefs=True)
        if len(images) != 1 or images[0]['xref'] == 0:
            print('\nFailed')
            return None
        xrefs.append(images[0]['xref'])
    print('\nSuccess')
    return xrefs
def link_pdf(src, dest, start_index):
    """Extract one image per page of PDF *src* into *dest* as numbered files.

    Files are named NNNN.<ext> starting at *start_index*. Returns the page
    count on success, or None when the PDF isn't a simple one-image-per-page
    scan (in which case nothing is written).
    """
    with fitz.open(src) as pdf:
        xrefs = image_xrefs(pdf)
        if xrefs is None:
            print(f'Support for weirder PDFs not yet implemented, skipping {src}')
            return None

        dest.mkdir(parents=True, exist_ok=True)
        for (idx, xref) in enumerate(xrefs, start=start_index):
            image = pdf.extract_image(xref)
            file_path = dest / f'{idx:04d}.{image["ext"]}'
            with open(file_path, 'wb') as f:
                f.write(image["image"])

        return pdf.page_count
def complete_prefix_number_ordering(entries):
    """Deduce a full page ordering for *entries* (paths), or None on failure.

    Entries are first partitioned by "alternate version" (a bitmask over
    ALT_VERSIONS keywords found in the filename), each partition is numbered
    hierarchically by its numeric prefixes, and the per-version numberings
    are interleaved so matching page indices stay adjacent.
    """
    if len(entries) == 1:
        return entries

    # Partition into alternate-version groups keyed by keyword bitmask.
    entries_by_version = {}
    for entry in entries:
        version_code = 0
        for (i, version) in enumerate(ALT_VERSIONS):
            if version in entry.name:
                version_code |= (1 << i)
        entries_by_version.setdefault(version_code, []).append(entry)

    numberings_by_version = {ver: unique_hierarchical_prefix_numbering(entries_by_version[ver]) for ver in entries_by_version}

    # Merge all index tuples; any version that couldn't be numbered sinks
    # the whole ordering.
    unified_indices = set()
    for numbering in numberings_by_version.values():
        if numbering is None:
            return None
        unified_indices |= set(numbering.keys())
    unified_indices.discard(None)
    unified_indices = list(unified_indices)
    unified_indices.sort()

    # Reject orderings with big jumps at any level — likely mis-detected
    # numbering rather than real page numbers.
    min_delta_by_level = {}
    if len(unified_indices) > 1:
        for i in range(1, len(unified_indices)):
            cur = unified_indices[i]
            prev = unified_indices[i - 1]
            for level in range(min(len(cur), len(prev))):
                if cur[level] != prev[level]:
                    delta = cur[level] - prev[level]
                    min_delta_by_level[level] = min(min_delta_by_level.get(level, delta), delta)
    if any(delta > 2 for delta in min_delta_by_level.values()):
        return None

    # None holds un-numbered singletons; they sort to the end of each pass.
    unified_indices.append(None)

    # Small versions ("inner") are interleaved page-by-page with the primary
    # version; near-complete versions ("outer") are appended as whole runs.
    versions = list(numberings_by_version.keys())
    versions.sort()
    version_lengths = {ver: len(numberings_by_version[ver]) for ver in numberings_by_version}
    inner_versions = []
    outer_versions = [versions[0]]
    for ver in versions[1:]:
        if version_lengths[ver] >= version_lengths[versions[0]] - 2:
            outer_versions.append(ver)
        else:
            inner_versions.append(ver)

    result = []
    for out_ver in outer_versions:
        for i in unified_indices:
            for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])):
                result += numberings_by_version[ver].get(i, [])
    return result
def unique_hierarchical_prefix_numbering(entries, start_point=0):
    """Map index tuples to entries based on numeric prefixes in filenames.

    Finds a number position (at or after *start_point*) shared by all
    entries and keys each entry by the number found there. Collisions
    recurse into deeper number positions (or an alphabetic suffix),
    producing multi-level tuples. Returns {index_tuple: [entry, ...]} or
    None when no consistent numbering exists. A single un-numbered entry
    maps to {None: entries}.
    """
    if len(entries) == 1 and not NUMBER_REGEX.search(entries[0].name):
        return {None: entries}

    # Try candidate number positions right-to-left in the longest name.
    longest_entry = max(entries, key=lambda e: len(e.name))
    matches = reversed(list(NUMBER_REGEX.finditer(longest_entry.name)))
    for m in matches:
        pos = m.start()
        if pos < start_point:
            # All remaining candidates would be before start_point too.
            return None
        prefix = longest_entry.name[:pos]
        # Every entry must share the prefix (or be a pure-prefix entry,
        # e.g. "page.jpg" alongside "page2.jpg" — it gets index 0).
        if all(e.name.startswith(prefix) or prefix.startswith(e.stem) for e in entries):
            numbering = {}
            for e in entries:
                if pos >= len(e.stem):
                    i = 0  # prefix-only entry: treat as page 0
                else:
                    n = NUMBER_REGEX.match(e.name[pos:])
                    if n is None:
                        return None
                    # int() accepts fullwidth digits too.
                    i = int(n.group())
                numbering.setdefault((i,), []).append(e)

            # Resolve collisions by recursing past this number.
            indices = list(numbering.keys())
            for idx in indices:
                if len(numbering[idx]) > 1:
                    ents_idx = numbering.pop(idx)
                    longest = max(ents_idx, key=lambda e: len(e.name))
                    next_layer_start = pos + NUMBER_REGEX.match(longest.name[pos:]).end()
                    sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
                    if not sub_numbering:
                        return None
                    for sub_idx in sub_numbering:
                        numbering[(*idx, *sub_idx)] = sub_numbering[sub_idx]
            return numbering

    return None
def alphabetic_numbering(entries, start_point):
    """Number entries whose stems end in '', 'a', 'b', ... after start_point.

    Returns {(0,): [entry], (1,): [entry], ...} keyed so '' maps to 0 and
    'a' to 1, or None when the suffixes are longer than one character,
    collide, or don't form an unbroken run starting at the empty suffix.
    """
    by_index = {}
    for item in entries:
        suffix = item.stem[start_point:]
        if len(suffix) > 1:
            # More than one trailing character — not a simple alphabetic tag.
            return None
        key = (0,) if suffix == '' else (ord(suffix.lower()) - ord('a') + 1,)
        if key in by_index:
            return None  # duplicate suffix
        by_index[key] = [item]

    expected = [(i,) for i in range(len(by_index))]
    if sorted(by_index.keys()) != expected:
        return None
    return by_index
def link_ordered_files(ordering, dest, start_index):
    """Create numbered symlinks in *dest* for the files in *ordering*.

    Links are named NNNN.<ext> starting at *start_index* and point at the
    sources via paths relative to *dest*.
    """
    dest.mkdir(parents=True, exist_ok=True)

    for (page_number, source) in enumerate(ordering, start=start_index):
        suffix = source.suffix.lower()
        link_path = dest / f'{page_number:04d}{suffix}'
        link_path.symlink_to(relpath(source, dest))
def check_extension(path, exts):
    """True when *path*'s extension (lowercased) is one of *exts*."""
    suffix = path.suffix.lower()
    return suffix in exts
def is_pdf(path):
    """True when *path* has a .pdf extension (case-insensitive)."""
    return path.suffix.lower() == '.pdf'
def is_image(path):
    """True when *path* has one of the recognized page-image extensions."""
    return check_extension(path, IMAGE_FILE_EXTENSIONS)
def ignoreable(path):
    """True for OS/metadata junk and non-page file types that we skip."""
    if path.name in IGNOREABLE_FILES:
        return True
    return check_extension(path, IGNOREABLE_EXTENSIONS)
def ls_ignore(directory, exclude):
    """List *directory*, dropping ignoreable entries and excluded paths."""
    listing = []
    for entry in directory.iterdir():
        if ignoreable(entry) or entry in exclude:
            continue
        listing.append(entry)
    return listing
def descendant_files_ignore(path, exclude):
    """Recursively collect files under *path*, honoring ignore/exclude rules.

    A file argument is returned as a one-element list.
    """
    if path.is_file():
        return [path]

    files = []
    for child in ls_ignore(path, exclude):
        if child.is_dir():
            files += descendant_files_ignore(child, exclude)
        else:
            files.append(child)
    return files
def collate(args):
    """Collate every extracted, un-collated, non-virtual work into site/images.

    Each work is assembled in a staging directory first and only renamed
    into place on success, so a failed attempt leaves no partial output.
    args.hints maps specific works to an alternate source root inside the
    extraction tree.
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    extraction_dir = args.destdir / 'extract'
    # parents[-2] of the path relative to extract/ is the top-level work dir.
    hint_map = {Path(relpath(hint, extraction_dir)).parents[-2].name: hint for hint in args.hints}

    collation_staging_area = args.destdir / 'site' / 'images-staging'
    collation_staging_area.mkdir(parents=True)

    for work_path in extraction_dir.iterdir():
        work_id = work_path.name

        collation_dir = args.destdir / 'site' / 'images' / work_id
        if collation_dir.exists():
            continue  # already collated

        # Virtual works (e.g. series containers) have no pages of their own.
        virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
        if virtual == (1,):
            continue

        work_staging_dir = collation_staging_area / work_id

        pages_collated = collate_from_paths([hint_map.get(work_id, work_path)], work_staging_dir, 0, [])
        if pages_collated:
            print(f'Collated {pages_collated} pages for {work_id}')
            work_staging_dir.rename(collation_dir)
        else:
            # Failure or empty: clean up any partially-staged links.
            if work_staging_dir.is_dir():
                for f in work_staging_dir.iterdir():
                    f.unlink()
                work_staging_dir.rmdir()

            if pages_collated == 0:
                print(f'{work_id} contains no files? skipping')
            elif pages_collated is None:
                print(f'Unable to deduce file structure for {work_id}, skipping')

    collation_staging_area.rmdir()
    con.close()
def try_collate_split_regex(srcs, dest, start_index, exclude, earlier=None, later=None):
    """Try splitting *srcs* into early/middle/late groups by filename regex.

    Entries matching *earlier* are collated first, entries matching *later*
    last, everything else in between. Returns False when the regexes don't
    actually split anything (at most one non-empty group), None when any
    sub-collation fails, otherwise the total page count collated.
    """
    early_group = []
    middle_group = []
    late_group = []
    for src in srcs:
        if earlier and earlier.search(src.name):
            early_group.append(src)
        elif later and later.search(src.name):
            late_group.append(src)
        else:
            middle_group.append(src)

    populated = sum(1 for group in (early_group, middle_group, late_group) if group)
    if populated <= 1:
        return False

    # Collate each group in order, threading the page index through.
    total_pages = 0
    for group in (early_group, middle_group, late_group):
        pages = collate_from_paths(group, dest, start_index + total_pages, exclude)
        if pages is None:
            return None
        total_pages += pages

    return total_pages
def standalone_image_size(filepath):
    """Return (width, height) of an image file, via PIL."""
    with Image.open(filepath) as im:
        return im.size
def pdf_image_sizes(filepath):
    """Return [(width, height), ...] for each distinct image xref in a PDF."""
    sizes_by_xref = {}

    with fitz.open(filepath) as pdf:
        for page in pdf:
            for (xref, _, width, height, *_) in page.get_images():
                if xref in sizes_by_xref:
                    continue  # the same image may be referenced by many pages
                sizes_by_xref[xref] = (width, height)

    return list(sizes_by_xref.values())
def median(items):
    """Return the middle element of *items* (upper middle for even lengths).

    Returns None for an empty list. Works on any mutually-comparable items
    (here: (width, height) tuples). Fixed to use sorted() instead of
    list.sort() so the caller's list is no longer mutated as a side effect.
    """
    if not items:
        return None
    ordered = sorted(items)
    return ordered[len(ordered) // 2]
def superior_or_equal(a, b):
    """True when *a* is at least as long as *b* and >= componentwise on b's length."""
    if len(a) < len(b):
        return False
    return all(x >= y for (x, y) in zip(a, b))
def try_collate_images_vs_pdf(srcs, dest, start_index, exclude):
    """Pick between a PDF source and loose images covering the same pages.

    Applies when exactly one src looks PDF-related (containing exactly one
    PDF) and every other descendant file is an image. Whichever side has
    the larger median image size wins and is collated. Returns False when
    the situation doesn't apply, otherwise the collation result.
    """
    pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
    if len(pdfs) != 1:
        return False
    outer_pdf = pdfs[0]

    # outer_pdf may be a directory wrapping the actual PDF file.
    inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, exclude) if is_pdf(f)]
    if len(inner_pdfs) != 1:
        return False
    inner_pdf = inner_pdfs[0]

    non_pdf_srcs = [src for src in srcs if src != outer_pdf]
    images = []
    non_images = []
    descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, exclude)]
    for f in descendant_files:
        if is_image(f):
            images.append(f)
        else:
            non_images.append(f)
            break  # one non-image is enough to disqualify

    if len(non_images) != 0 or len(images) == 0:
        return False

    pdf_sizes = pdf_image_sizes(inner_pdf)
    standalone_sizes = [standalone_image_size(f) for f in images]

    # The two sides should cover roughly the same number of pages.
    if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
        return False

    median_pdf_size = median(pdf_sizes)
    median_standalone_size = median(standalone_sizes)
    if not (median_pdf_size and median_standalone_size):
        return False

    # Collate whichever source has the (weakly) larger typical image.
    if superior_or_equal(median_standalone_size, median_pdf_size):
        return collate_from_paths(non_pdf_srcs, dest, start_index, exclude)
    elif superior_or_equal(median_pdf_size, median_standalone_size):
        return collate_from_paths([outer_pdf], dest, start_index, exclude)
    else:
        return False
def collate_from_paths(srcs, dest, start_index, exclude):
    """Recursively collate page images from *srcs* into *dest*.

    Returns the number of pages collated, 0 for an empty source list, or
    None when no heuristic can figure out the layout. Strategies tried in
    order: descend into a lone directory, extract a lone PDF, prefer the
    hi-res of two equivalent dirs, split by textless/cover/epilogue
    keywords, order loose images by numeric prefix, and arbitrate between
    a PDF and loose images.
    """
    if len(srcs) == 1 and srcs[0].is_dir():
        # A single directory: recurse into its (non-ignored) contents.
        return collate_from_paths(ls_ignore(srcs[0], exclude), dest, start_index, exclude)

    if len(srcs) == 1 and is_pdf(srcs[0]):
        print(f'Extracting images from {srcs[0]}')
        return link_pdf(srcs[0], dest, start_index)

    if len(srcs) == 0:
        return 0

    # Two dirs where one is marked hi-res and both have the same file count:
    # assume they're the same pages and keep only the hi-res set.
    if len(srcs) == 2 and all(src.is_dir() for src in srcs):
        hi_res_dirs = [src for src in srcs if HI_RES_REGEX.search(src.name)]
        if len(hi_res_dirs) == 1:
            hi_res_dir = hi_res_dirs[0]
            lo_res_dir = next(src for src in srcs if src != hi_res_dir)
            if len(descendant_files_ignore(lo_res_dir, exclude)) == len(descendant_files_ignore(hi_res_dir, exclude)):
                return collate_from_paths([hi_res_dir], dest, start_index, exclude)

    # Keyword-based splits; each returns False when it doesn't apply.
    textless_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=TEXTLESS_REGEX)
    if textless_split != False:
        return textless_split

    cover_split = try_collate_split_regex(srcs, dest, start_index, exclude, earlier=FRONT_COVER_REGEX, later=BACK_COVER_REGEX)
    if cover_split != False:
        return cover_split

    epilogue_split = try_collate_split_regex(srcs, dest, start_index, exclude, later=EPILOGUE_REGEX)
    if epilogue_split != False:
        return epilogue_split

    # All loose images: order them by their numeric/alphabetic prefixes.
    if all(src.is_file() and is_image(src) for src in srcs):
        ordering = complete_prefix_number_ordering(srcs)
        if ordering:
            print(f'Symlinking image files: {ordering[0]}...')
            link_ordered_files(ordering, dest, start_index)
            return len(ordering)
        else:
            return None

    images_vs_pdf = try_collate_images_vs_pdf(srcs, dest, start_index, exclude)
    if images_vs_pdf != False:
        return images_vs_pdf

    return None
def self_and_parents(path):
    """Return [path, parent, grandparent, ..., outermost ancestor]."""
    return [path, *path.parents]
def parse_expressions(tokens):
    """Parse a flat token list into (groups, exclusions).

    '(' ... ')' forms a group, '!' marks the next token/group as excluded,
    and any other token becomes a singleton group. Consumes *tokens*.
    """
    groups = []
    exclusions = []
    while tokens:
        head = tokens.pop(0)
        if head == '!':
            exclusions += parse_exclusion(tokens)
            continue
        groups.append(parse_group(tokens) if head == '(' else [head])
    return (groups, exclusions)
def parse_exclusion(tokens):
    """Parse the operand of '!': a parenthesized group or a single token."""
    head = tokens.pop(0)
    return parse_group(tokens) if head == '(' else [head]
def parse_group(tokens):
    """Consume tokens up to (and including) the matching ')'; return the contents."""
    items = []
    while (head := tokens.pop(0)) != ')':
        items.append(head)
    return items
def normalize_to(path, ref):
    """Re-anchor *path* at *ref* so equivalent spellings compare equal."""
    relative = relpath(path, ref)
    return ref / Path(relative)
def manual_collate(args):
    """Collate one work from a user-supplied grouping expression.

    args.expression is a token list parsed into ordered groups and
    exclusions; each group is collated in sequence into the work's staging
    directory, which is renamed into site/images/<work_id> on success. The
    work id is deduced from the first path mentioned in the expression.
    """
    (raw_groups, raw_exclusions) = parse_expressions(args.expression)

    extraction_dir = args.destdir / 'extract'
    # Any path from the expression identifies the work: its top-level
    # directory under extract/ is the work id.
    sample_path = next(path for group in (raw_groups + [raw_exclusions]) for path in group)
    work_id = Path(relpath(sample_path, extraction_dir)).parents[-2].name

    # Normalize all paths so membership tests against exclusions work.
    exclusions = [normalize_to(item, args.destdir) for item in raw_exclusions]
    if raw_groups:
        groups = [[normalize_to(item, args.destdir) for item in group] for group in raw_groups]
    else:
        # Exclusions only: collate the whole work minus the excluded paths.
        groups = [[extraction_dir / work_id]]

    collation_dir = args.destdir / 'site' / 'images' / work_id
    if collation_dir.exists():
        if len(list(collation_dir.iterdir())) > 0:
            print(f'Collation directory already exists!')
            return
        else:
            collation_dir.rmdir()  # empty leftover; safe to redo

    nonexistent = [path for group in (groups + [exclusions]) for path in group if not path.exists()]
    if len(nonexistent) > 0:
        print(f'Nonexistent paths: {nonexistent}')
        return

    collation_staging_area = args.destdir / 'site' / 'images-staging'
    work_staging_dir = collation_staging_area / work_id
    work_staging_dir.mkdir(parents=True)

    pages_collated = 0
    for group in groups:
        pages_added = collate_from_paths(
            [item for item in group if item not in exclusions],
            work_staging_dir,
            pages_collated,
            exclusions,
        )
        if pages_added is None:
            print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
            pages_collated = None
            break
        pages_collated += pages_added

    if pages_collated:
        print(f'Collated {pages_collated} pages for {work_id}')
        work_staging_dir.rename(collation_dir)
    else:
        # Failure (None) or nothing found (0): remove staged links.
        for f in work_staging_dir.iterdir():
            f.unlink()
        work_staging_dir.rmdir()
        if pages_collated == 0:
            print(f'No files found for {work_id}')

    collation_staging_area.rmdir()
def fmt_size(s):
    """Format a (width, height) pair as 'WxHpx'."""
    (width, height) = s
    return f'{width}x{height}px'
def analyze(args):
    """Print every file of one extracted work with image-size information.

    Images show their dimensions; PDFs show an image count plus
    median/min/max embedded image sizes; anything else is listed bare.
    """
    extract_dir = args.destdir / 'extract'
    files = descendant_files_ignore(extract_dir / args.work_id, [])
    files.sort()

    for f in files:
        print(f'{relpath(f, extract_dir)}', end='')
        if is_image(f):
            size = standalone_image_size(f)
            print(f'\t{fmt_size(size)}')
        elif is_pdf(f):
            sizes = pdf_image_sizes(f)
            if len(sizes) == 0:
                print(f'\tContains no images')
            else:
                print(f'\t{len(sizes)} images, median {fmt_size(median(sizes))}, min {fmt_size(min(sizes))}, max {fmt_size(max(sizes))}')
        else:
            print()
def metadata(args):
    """Display one work's metadata, optionally setting its 'virtual' flag first."""
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    if args.virtual is not None:
        flag = 1 if args.virtual else 0
        cur.execute("UPDATE works SET virtual = ? WHERE id = ?", (flag, args.work_id))
        con.commit()

    row = cur.execute(
        "SELECT title, circle, date, description, series, virtual FROM works WHERE id = ?",
        (args.work_id,),
    ).fetchone()

    if row is None:
        print(f'Work id {args.work_id} not found!')
        return

    (title, circle, date, description, series, virtual) = row
    print(f'Work ID: {args.work_id}')
    print(f'Title: {title}')
    print(f'Circle: {circle}')
    print(f'Pub date: {date}')
    print(f'Description: {description}')
    print(f'Series: {series}')
    print(f'Virtual: {"Yes" if virtual == 1 else "No"}')
    con.close()
def copy_recursive(src, dest):
    """Copy the tree under *src* into *dest*, creating *dest* as needed.

    Real subdirectories are recursed into; everything else (plain files and
    symlinks, which copyfile dereferences) is copied as a file.
    """
    dest.mkdir(parents=True, exist_ok=True)
    for entry in src.iterdir():
        target = dest / entry.name
        if entry.is_dir() and not entry.is_symlink():
            copy_recursive(entry, target)
        else:
            shutil.copyfile(entry, target)
2024-01-23 15:54:17 -05:00
def generate ( args ) :
2024-01-22 07:01:41 -05:00
jenv = Environment (
2024-01-22 22:06:04 -05:00
loader = PackageLoader ( " dlibrary " ) ,
2024-01-22 07:01:41 -05:00
autoescape = select_autoescape ( )
)
viewer_template = jenv . get_template ( " viewer.html " )
2024-01-22 23:20:42 -05:00
list_template = jenv . get_template ( " list.html " )
2024-01-23 00:00:15 -05:00
categorization_template = jenv . get_template ( " categorization.html " )
2024-01-23 00:49:58 -05:00
work_template = jenv . get_template ( " work.html " )
2024-01-25 04:10:17 -05:00
index_template = jenv . get_template ( " index.html " )
2024-01-22 07:01:41 -05:00
con = sqlite3 . connect ( args . destdir / ' meta.db ' )
cur = con . cursor ( )
2024-02-06 08:29:52 -05:00
site_dir = args . destdir / ' site '
collated_work_ids = { p . name for p in ( site_dir / ' images ' ) . iterdir ( ) }
2024-01-22 07:01:41 -05:00
2024-01-22 23:20:42 -05:00
actual_series = { series for ( series , ) in cur . execute ( ' SELECT series FROM works GROUP BY series HAVING count(series) > 1 ' ) }
2024-01-22 07:01:41 -05:00
works = [ ]
for ( work_id , title , circle , date , description , series ) in cur . execute ( ' SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC ' ) . fetchall ( ) :
if work_id not in collated_work_ids :
continue
authors = [ author for ( author , ) in cur . execute ( ' SELECT author FROM authors WHERE work = ? ' , ( work_id , ) ) ]
tags = [ tag for ( tag , ) in cur . execute ( ' SELECT tag FROM tags WHERE work = ? ' , ( work_id , ) ) ]
2024-02-06 08:29:52 -05:00
images = [ path . name for path in ( site_dir / ' images ' / work_id ) . iterdir ( ) ]
images . sort ( )
try :
thumbnail_path = relpath ( next (
f for f in ( site_dir / ' thumbnails ' ) . iterdir ( ) if f . stem == work_id
) , site_dir )
except StopIteration :
thumbnail_path = f ' images/ { work_id } / { images [ 0 ] } '
2024-01-22 07:01:41 -05:00
work = {
' id ' : work_id ,
' title ' : title ,
' circle ' : circle ,
' date ' : date ,
' description ' : description ,
' series ' : series ,
' authors ' : authors ,
' tags ' : tags ,
2024-02-06 08:29:52 -05:00
' thumbnail_path ' : thumbnail_path ,
2024-01-22 07:01:41 -05:00
}
works . append ( work )
2024-02-06 08:29:52 -05:00
work_dir = site_dir / ' works ' / work_id
2024-01-23 00:49:58 -05:00
viewer_dir = work_dir / ' view '
viewer_dir . mkdir ( parents = True , exist_ok = True )
2024-01-22 07:01:41 -05:00
with open ( work_dir / ' index.html ' , ' w ' ) as f :
2024-01-23 00:49:58 -05:00
f . write ( work_template . render ( depth = 2 , work = work , title = title , images = images ) )
with open ( viewer_dir / ' index.html ' , ' w ' ) as f :
f . write ( viewer_template . render ( depth = 3 , work = work , title = title , images = images ) )
2024-01-22 07:01:41 -05:00
2024-01-23 00:55:06 -05:00
def make_categorization(categorization, query, work_filter, work_style_cards=False):
    """Render the index and per-category pages for one categorization axis.

    Parameters:
        categorization: name of the axis (e.g. 'authors', 'tags'); also the
            output subdirectory created under the site root.
        query: SQL returning a single column of distinct category values.
        work_filter: factory taking a category value and returning a
            predicate over work dicts.
        work_style_cards: passed through to the categorization template to
            control how sample cards are rendered.

    Uses `site_dir`, `cur`, `works`, `list_template` and
    `categorization_template` from the enclosing scope.
    """
    categorization_dir = site_dir / categorization

    cats = [cat for (cat,) in cur.execute(query)]
    cat_samples = {}
    for cat in cats:
        cat_works = list(filter(work_filter(cat), works))
        # First matching work (most recent, given `works` ordering) serves as
        # the category's sample card; None when the category is empty.
        cat_samples[cat] = cat_works[0] if cat_works else None

        # '/' in a category name would create nested directories; strip it.
        # Other awkward characters are left alone, hence "safe-ish".
        safeish_cat = cat.replace('/', '')
        cat_dir = categorization_dir / safeish_cat
        cat_dir.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly: category names and work metadata are often
        # Japanese, and open()'s default encoding is locale-dependent.
        with open(cat_dir / 'index.html', 'w', encoding='utf-8') as f:
            f.write(list_template.render(
                depth=2,
                works=cat_works,
                title=cat,
                categorization=categorization,
            ))

    categorization_dir.mkdir(parents=True, exist_ok=True)
    with open(categorization_dir / 'index.html', 'w', encoding='utf-8') as f:
        f.write(categorization_template.render(
            depth=1,
            categorization=categorization,
            categories=cats,
            samples=cat_samples,
            work_style_cards=work_style_cards,
        ))
# One page tree per categorization axis. The nested lambda is a factory:
# given a category value, it returns the per-work membership predicate.
make_categorization(
    'authors',
    'SELECT DISTINCT author FROM authors ORDER BY author',
    lambda author: lambda work: author in work['authors'],
)
make_categorization(
    'tags',
    'SELECT DISTINCT tag FROM tags ORDER BY tag',
    lambda tag: lambda work: tag in work['tags'],
)
make_categorization(
    'circles',
    'SELECT DISTINCT circle FROM works WHERE circle NOT NULL ORDER BY circle',
    lambda circle: lambda work: work['circle'] == circle,
)
make_categorization(
    'series',
    'SELECT DISTINCT series FROM works WHERE series NOT NULL ORDER BY series',
    lambda series: lambda work: work['series'] == series,
    work_style_cards=True,
)

# Copy the package's bundled static assets into the generated site.
with resources.as_file(resources.files("dlibrary")) as r:
    copy_recursive(r / 'static', site_dir / 'static')

# Top-level catalog page listing all collated works.
with open(site_dir / 'index.html', 'w') as f:
    f.write(index_template.render(depth=0, works=works))

con.close()
2024-01-22 02:16:06 -05:00
2024-01-23 15:54:17 -05:00
argparser = argparse . ArgumentParser (
prog = ' dlibrary ' ,
formatter_class = argparse . RawDescriptionHelpFormatter ,
description = textwrap . dedent ( """ \
Organize DRM - free works purchased from DLSite into a library
that can be viewed in a web browser .
Intended workflow :
- ` extract ` a collection of zipfiles downloaded from DLSite
into DLibrary ' s data directory, giving each work its own
subfolder .
- ` fetch ` metadata and thumbnail images for extracted works
from DLSite .
- ` collate ` and / or ` manual - collate ` extracted works ,
producing a single sequence of image files ( or symlinks
into the extracted data , when possible ) for each work .
- Manually adjust works ' `metadata` when necessary.
- ` generate ` a static website providing a catalog and viewer
for all collated works .
""" ),
)
2024-01-22 02:16:06 -05:00
argparser.add_argument(
    '-d', '--destdir',
    type=Path,
    default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')),
    help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)',
)

# Every invocation must name a subcommand; each subcommand binds its handler
# to `func` via set_defaults below.
subparsers = argparser.add_subparsers(title="subcommands", required=True)
2024-01-22 02:16:06 -05:00
2024-02-08 05:03:40 -05:00
# `extract` / `x` / `ex`: unpack downloaded zipfiles into the data directory.
parser_extract = subparsers.add_parser('extract', aliases=['x', 'ex'], help='extract zipfiles')
parser_extract.add_argument(
    '-r', '--remove',
    action='store_true',
    help='remove original zipfiles after extraction',
)
parser_extract.add_argument(
    'zipfiles',
    metavar='FILE',
    type=Path,
    nargs='+',
    help='zipfiles to extract',
)
parser_extract.set_defaults(func=extract)
2024-02-08 05:03:40 -05:00
# `fetch` / `f` / `fet`: download metadata and thumbnails for extracted works.
parser_fetch = subparsers.add_parser('fetch', aliases=['f', 'fet'], help='fetch metadata and thumbnails')
parser_fetch.add_argument(
    '-l', '--locale',
    type=str,
    default=os.getenv('DLIBRARY_LOCALE', 'en_US'),
    help=('locale to use when requesting metadata (e.g. "ja_JP", "en_US"). '
          'May still fall back to Japanese if metadata in other languages is unavailable. '
          '(default: $DLIBRARY_LOCALE or en_US)'),
)
parser_fetch.set_defaults(func=fetch)
2024-01-23 15:54:17 -05:00
# `collate` / `c` / `co` / `col`: automatic collation with optional hints.
parser_collate = subparsers.add_parser(
    'collate',
    aliases=['c', 'co', 'col'],
    help='collate each work into a sequence of image files',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        For each extracted work that has not already been collated,
        DLibrary will attempt to intuit its structure as follows:

        - Enter the work's directory. If the directory contains
          nothing except a single subdirectory (ignoring a few types
          of files that are definitely not relevant), traverse
          downwards repeatedly.
        - If the current directory contains nothing except a single
          PDF (again, ignoring irrelevant files), attempt to extract
          a series of images from the PDF. This process expects that
          each page of the PDF consists of a single embedded image,
          which will be extracted at full resolution. Support for
          more complex PDFs is not yet implemented.
        - If the current directory contains nothing except image
          files, and the image files are named in a way that clearly
          indicates a complete numerical order (each filename
          consists of a shared prefix followed by a distinct
          number), symlink files in the inferred order.
        - Otherwise, skip processing this work for now.

        DLibrary can be given "collation hints" which provide
        alternative starting points for this search process. A hint
        is a path under $DLIBRARY_DIR/extract/[work id]/
        indicating a different directory or PDF file to begin the
        search process for that work, rather than starting at the
        top level of the extracted data. There can be at most one
        hint per work; for more complicated scenarios where a work
        includes multiple folders that need to be collated together,
        or where filenames do not clearly indicate an ordering, use
        `manual-collate` instead.
    """),
)
parser_collate.add_argument(
    'hints',
    metavar='PATH',
    type=Path,
    nargs='*',
    help='paths within extraction folders as collation hints'
)
parser_collate.set_defaults(func=collate)
2024-01-23 15:54:17 -05:00
# `manual-collate` / `mc` / `man` / `manual`: explicit, per-work collation.
parser_manual_collate = subparsers.add_parser(
    'manual-collate',
    aliases=['mc', 'man', 'manual'],
    help='collate a single work manually',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        Provide an expression or sequence of expressions specifying groups
        of paths to collate or skip. An expression can be:

        PATH
            A single path. If this is an image, it will be appended to
            the sequence of collated images; if this is a PDF, images will be
            extracted from it and concatenated to the sequence; if this is a
            directory, the contents of the directory will be collated based on
            the normal heuristics and concatenated to the sequence.

        ( PATH [PATH ...] )
            A group of paths contained in parentheses. You may need to escape
            the parentheses to avoid them getting parsed by your shell.
            All the paths in this group will be considered together, and
            collated based on the normal heuristics, regardless of what
            order the paths are provided in.

        ! PATH
        ! ( PATH [PATH ...] )
            A path or group of paths to exclude from collation. You may
            need to escape the !. If an excluded path appears within any
            of the other specified paths, it will be ignored.

        If the only expressions provided are negations, then auto-collation
        will start from the top level of the extracted work while excluding
        the negated paths.

        All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
        for the work being manually collated. `manual-collate` can
        only handle one work at a time.
    """),
)
parser_manual_collate.add_argument(
    'expression',
    nargs='+',
    help='expressions indicating paths to collate or skip',
)
parser_manual_collate.set_defaults(func=manual_collate)
2024-02-08 05:03:40 -05:00
# `analyze` / `a` / `an` / `anal`: inspect an extracted folder's structure.
parser_analyze = subparsers.add_parser('analyze', aliases=['a', 'an', 'anal'], help='analyze an extracted folder to assist in collation')
parser_analyze.add_argument('work_id')
parser_analyze.set_defaults(func=analyze)

# `metadata` / `m` / `me` / `meta`: view or edit a single work's metadata.
parser_metadata = subparsers.add_parser('metadata', aliases=['m', 'me', 'meta'], help='view or modify metadata for a work')
parser_metadata.add_argument('work_id')
parser_metadata.add_argument(
    '--virtual',
    # Tri-state: --virtual sets True, --no-virtual sets False, omitted is None.
    action=argparse.BooleanOptionalAction,
    help='set work as virtual',
)
parser_metadata.set_defaults(func=metadata)
2024-01-23 15:54:17 -05:00
# `generate` / `g` / `gen`: build the static site from collated works.
parser_generate = subparsers.add_parser(
    'generate',
    aliases=['g', 'gen'],
    help='generate HTML/CSS/JS for library site',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        The static site will be generated under $DLIBRARY_DIR/site/
        and can be served by pointing an HTTP server at that
        directory. Note that some files inside the static site
        hierarchy will be symlinks into $DLIBRARY_DIR/extract/
        outside the site hierarchy, so make sure your HTTP server
        will allow those symlinks to be read.
    """),
)
parser_generate.set_defaults(func=generate)
2024-01-22 02:16:06 -05:00
2024-01-22 22:06:04 -05:00
def main():
    """CLI entry point: parse arguments and dispatch to the chosen subcommand handler."""
    parsed_args = argparser.parse_args()
    parsed_args.func(parsed_args)


if __name__ == "__main__":
    main()