2024-01-21 05:13:09 -05:00
#!/usr/bin/env python3
2024-01-22 02:16:06 -05:00
import argparse
2024-01-21 05:13:09 -05:00
import asyncio
2024-01-22 22:06:04 -05:00
import importlib_resources as resources
2024-03-12 04:35:09 -04:00
from io import BytesIO
2024-01-22 02:16:06 -05:00
from pathlib import Path
2024-02-06 23:01:59 -05:00
import os
2024-04-01 21:50:25 -04:00
from os . path import relpath , splitext , join
2024-03-19 15:12:49 -04:00
import random
2024-01-21 17:45:56 -05:00
import re
2024-02-07 00:24:30 -05:00
import readline
2024-01-22 07:01:41 -05:00
import shutil
2024-01-21 05:13:09 -05:00
import sqlite3
2024-04-01 21:50:25 -04:00
import stat
2024-01-23 15:54:17 -05:00
import textwrap
2024-02-16 16:08:56 -05:00
import unicodedata
2024-01-21 17:45:56 -05:00
from urllib . parse import urlparse
2024-01-21 05:13:09 -05:00
import zipfile
2024-04-01 22:47:07 -04:00
import dlsite_async
2024-01-22 02:16:06 -05:00
import fitz
2024-03-12 04:35:09 -04:00
from libsixel import *
2024-02-06 23:01:59 -05:00
from PIL import Image
2024-01-22 22:06:04 -05:00
from jinja2 import Environment , PackageLoader , select_autoescape
2024-04-02 14:24:42 -04:00
import pyuca
2024-03-15 16:00:37 -04:00
import rarfile
2024-01-21 17:45:56 -05:00
import requests
2024-01-21 05:13:09 -05:00
2024-02-08 04:54:36 -05:00
# --- Module-level constants -----------------------------------------------

# Runs of ASCII or full-width (zenkaku) digits, used for numeric ordering of
# page filenames.  (NOTE(review): full-width range reconstructed from a
# mangled source line — confirm it reads [0-9０-９]+.)
NUMBER_REGEX = re.compile('[0-9０-９]+')

# Work-ID formats for the supported storefronts.
DLSITE_ID_REGEX = re.compile('^[BR]J[0-9]+$')
FANZA_ID_REGEX = re.compile('^d_[0-9]+$')
FAKKU_ID_REGEX = re.compile('.*_FAKKU$')

# Terms marking the higher-quality variant of a set of pages:
# "high resolution" / "original size" / "large size".
HI_RES_REGEX = re.compile('高解像度|原寸|大サイズ', re.I)
# "No screentone (effect)" / "grayscale" vs. the toned variant.
NO_TONE_REGEX = re.compile('トーン(効果)?[な無]し|グレースケール', re.I)
TONE_REGEX = re.compile('トーン(版|(効果)?[有あ]り)', re.I)
# Color vs. monochrome variants.
COLOR_REGEX = re.compile('カラー', re.I)
MONOCHROME_REGEX = re.compile('モノクロ', re.I)

# Ordered preference rules: a directory matching 'better' (or, failing that,
# not matching 'worse') is kept over its sibling.
IMAGE_QUALITY_REGEXES = [
    {'better': HI_RES_REGEX},
    {'better': NO_TONE_REGEX, 'worse': TONE_REGEX},
    {'better': COLOR_REGEX, 'worse': MONOCHROME_REGEX},
]

# Directory names that are literal pixel resolutions, e.g. "1280x720".
IMAGE_RESOLUTION_REGEX = re.compile('^(?P<x>[0-9]+)x(?P<y>[0-9]+)$')

# Locale code -> terms identifying a language-specific variant directory.
LANGUAGE_REGEXES = {
    'en_US': re.compile('english|英語', re.I),
    'ja_JP': re.compile('日本語', re.I),
    'zh_CN': re.compile('(^|[^體])中文|中国語', re.I),
    'zh_TW': re.compile('繁體中文', re.I),
    'ko_KR': re.compile('한국어', re.I),
}

# Variant pages without dialogue/text ("textless") or sound effects.
TEXTLESS_REGEX = re.compile('(台詞|セリフ|せりふ|テキスト|文字|文章)((な|無)し|抜き)|notext|textless', re.I)
FXLESS_REGEX = re.compile('効果音(な|無)し', re.I)

# Front cover (表紙) vs. back cover (裏表紙), in Japanese, romaji, and
# English; the lookarounds keep "back cover" / "hyoushi-ura" spellings from
# matching the front-cover pattern and vice versa.
FRONT_COVER_REGEX = re.compile('(?<!裏)表紙(?!裏)|(?<!back[-_ ])(?<!back)cover|(?<!ura[-_ ])(?<!ura)hyou?sh?i(?![-_ ]?ura)', re.I)
BACK_COVER_REGEX = re.compile('裏表紙|hyou?sh?i[-_ ]?ura|ura[-_ ]?hyou?sh?i', re.I)

# Extra material: design/character sheets, bonus items, posters.
BONUS_REGEX = re.compile('設定|キャラ|特典|ポスター|bonus', re.I)
# Afterword / epilogue / omake pages.
EPILOGUE_REGEX = re.compile('after|後日談|おまけ', re.I)

# Section-split rules: files matching 'earlier' collate before unmatched
# files, files matching 'later' collate after them.
SPLITS = [
    {'later': TEXTLESS_REGEX},
    {'later': FXLESS_REGEX},
    {'earlier': FRONT_COVER_REGEX, 'later': BACK_COVER_REGEX},
    {'later': BONUS_REGEX},
    {'later': EPILOGUE_REGEX},
]

# Filename substrings marking alternate-version pages (skin tone, hair
# color, expression, ...); pages are bucketed per combination of markers.
ALT_VERSIONS = [
    '褐色',
    '日焼け',
    'pink',
    '金髪',
    '白肌',
    'うつろ目',
    'dark skin',
    'ラバー',
    'ゾンビ肌',
    'マスク',
    'アヘ顔',
]

IMAGE_FILE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.tiff', '.bmp']

# Junk files/extensions that may appear inside archives and are never pages.
IGNOREABLE_FILES = ['Thumbs.db', '__MACOSX', '.DS_Store', 'desktop.ini']
IGNOREABLE_EXTENSIONS = ['.txt', '.html', '.htm', '.psd', '.mp4']

# Rasterization DPI: high for permanent page conversion, low for quick
# terminal previews.
PDF_CONVERSION_DPI = 300
PDF_PREVIEW_DPI = 72

# Text blocks emitted by the TCPDF generator are boilerplate, not content.
IRRELEVANT_PDF_BLOCK_REGEX = re.compile(r'\bTCPDF\b', re.I)

# Self-extracting multipart RAR sets: the head volume is ".part1.exe", the
# remaining volumes are ".partN.rar" with N != 1.
MULTIPART_RAR_HEAD_REGEX = re.compile(r'^(.+)\.part0*1\.exe$', re.I)
MULTIPART_RAR_TAIL_REGEX = re.compile(r'^(.+)\.part0*([^1]|[^0].+)\.rar$', re.I)

# PDF content-stream operators: "/Name Do" draws a referenced xobject;
# BI/ID/EI bracket an inline image.
PDF_REFERENCED_IMAGE_REGEX = re.compile(r'(^|(?<=\s))/(?P<ref_name>\S+)\s+Do($|(?=\s))')
PDF_INLINE_IMAGE_REGEX = re.compile(r'(^|\s)(BI|ID|EI)($|\s)')

SUGGESTED_WORKS_COUNT = 10

# Permission bits applied to collated output: read-only files and
# read+traverse (execute) directories.
READONLY_FILE = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
READONLY_DIR = READONLY_FILE | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
2024-03-15 16:50:56 -04:00
# Global verbosity flag; flipped on elsewhere (e.g. by a --debug option).
debug_mode = False


def debug(s):
    """Print *s* to stdout, but only when the debug_mode flag is set."""
    if not debug_mode:
        return
    print(s)
2024-01-21 05:13:09 -05:00
def open_zipfile_with_encoding ( path ) :
2024-03-15 16:00:37 -04:00
for enc in [ " utf-8 " , " shift-jis " , " shift-jisx0213 " ] :
try :
return zipfile . ZipFile ( path , metadata_encoding = enc )
except UnicodeDecodeError :
pass
2024-01-21 05:13:09 -05:00
2024-03-15 16:00:37 -04:00
print ( f ' { path } contains filenames with unknown character encoding! ' )
exit ( 1 )
2024-01-21 05:13:09 -05:00
2024-03-15 16:00:37 -04:00
def open_rarfile_with_encoding(path):
    """Open the RAR archive at *path*, probing several filename encodings.

    Unlike zipfile, rarfile does not raise on a bad charset — it
    substitutes U+FFFD replacement characters into member names — so we
    accept the first charset whose names contain no replacement
    character.  Prints a message and exits if every candidate fails.
    """
    for enc in ["utf-8", "shift-jis", "shift-jisx0213"]:
        rf = rarfile.RarFile(path, charset=enc)
        # '\ufffd' is the Unicode replacement character rarfile inserts
        # for bytes that did not decode under this charset.
        if all('\ufffd' not in info.filename for info in rf.infolist()):
            return rf

    print(f'{path} contains filenames with unknown character encoding!')
    exit(1)
2024-01-21 05:13:09 -05:00
2024-04-01 21:50:25 -04:00
def readonly(path):
    """Recursively mark every file and directory under *path* read-only.

    Walks bottom-up (topdown=False) so each directory is chmodded only
    after its contents, while it is still writable/traversable.
    """
    for parentdir, dirs, files in os.walk(path, topdown=False):
        for f in files:
            # NOTE(review): follow_symlinks=False needs platform lchmod
            # support (see os.supports_follow_symlinks) — confirm this
            # works on the deployment OS.
            os.chmod(join(parentdir, f), READONLY_FILE, follow_symlinks=False)
        os.chmod(parentdir, READONLY_DIR, follow_symlinks=False)
2024-01-22 02:16:06 -05:00
def extract(args):
    """Extract each archive in args.archives into destdir/extract/<work_id>.

    Supports plain .zip files and multipart self-extracting RAR sets
    (a ".part1.exe" head plus ".partN.rar" tail volumes; tails are
    handled via their head and otherwise ignored).  Extracted trees are
    made read-only.  With args.remove the source archives are deleted
    after successful extraction; with args.auto the fetch stage is
    chained when nothing was skipped.
    """
    # Resolved paths of everything named on the command line, used below to
    # verify that all volumes of a multipart RAR set were explicitly given.
    absolute_archive_paths = set(path.resolve(strict=True) for path in args.archives)
    any_skipped = False
    for archive_path in args.archives:
        if archive_path.suffix.lower() == '.zip':
            work_id = archive_path.stem
            work_extract_path = args.destdir / 'extract' / work_id
            print(f'Extracting {archive_path} to {work_extract_path}')

            with open_zipfile_with_encoding(archive_path) as z:
                # mkdir without exist_ok: refuses to clobber a previous
                # extraction of the same work.
                work_extract_path.mkdir(parents=True)
                z.extractall(path=work_extract_path)

            readonly(work_extract_path)

            if args.remove:
                archive_path.unlink()

        elif rar_match := MULTIPART_RAR_HEAD_REGEX.fullmatch(archive_path.name):
            work_id = rar_match.group(1)
            work_extract_path = args.destdir / 'extract' / work_id
            print(f'Extracting multipart RAR archive beginning with {archive_path} to {work_extract_path}')
            with open_rarfile_with_encoding(archive_path) as r:
                volumes = [Path(vol).resolve(strict=True) for vol in r.volumelist()]
                if any(vol not in absolute_archive_paths for vol in volumes):
                    print(f'Multipart RAR archive starting with {archive_path} contains volume files not listed on command-line, skipping')
                    any_skipped = True
                    continue
                work_extract_path.mkdir(parents=True)
                r.extractall(path=work_extract_path)

            readonly(work_extract_path)

            if args.remove:
                for vol in volumes:
                    vol.unlink()
        elif MULTIPART_RAR_TAIL_REGEX.fullmatch(archive_path.name):
            # Tail volumes are extracted along with their head; nothing to do.
            pass
        else:
            print(f'Unknown archive file type {archive_path}, skipping')
            any_skipped = True
    if args.auto and not any_skipped:
        # Chain into the fetch stage using that stage's default arguments.
        parser_fetch.parse_args(args=[], namespace=args)
        fetch(args)
2024-01-21 05:13:09 -05:00
2024-01-29 04:11:55 -05:00
def manual_input_metadata(work_id):
    """Interactively prompt for metadata of a work we cannot scrape.

    Returns a dict shaped like the automatic fetchers' output:
    id/title/circle/authors/tags/date/description/series, where circle
    and series fall back to None and authors/tags are lists.
    """
    print(f"Don't know how to fetch metadata for {work_id}, input manually:")

    def comma_separated(prompt):
        # Split the response on commas, trimming whitespace and dropping
        # empty entries.
        return [part.strip() for part in input(prompt).split(',') if part.strip()]

    return {
        "id": work_id,
        "title": input('Title: '),
        "circle": input('Circle [None]: ') or None,
        "authors": comma_separated('Authors (comma-separated): '),
        "tags": comma_separated('Tags (comma-separated): '),
        "date": input('Pub date (yyyy-mm-dd): '),
        "description": input('Description: '),
        "series": input('Series [None]: ') or None,
    }
2024-01-22 02:16:06 -05:00
async def fetch_async(args):
    """Fetch metadata and thumbnails for every extracted work missing them.

    Creates the sqlite schema on first run.  DLSite IDs are fetched via
    the API; FANZA/FAKKU/unknown IDs fall back to manual entry.  Returns
    True if any work was skipped with a warning, so the caller can decide
    whether to auto-chain the next pipeline stage.
    """
    any_warnings = False
    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS works(id TEXT PRIMARY KEY, title TEXT, circle TEXT, date TEXT, description TEXT, series TEXT, virtual INT)")
    cur.execute("CREATE TABLE IF NOT EXISTS authors(author TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id), PRIMARY KEY(author, work))")
    cur.execute("CREATE TABLE IF NOT EXISTS tags(tag TEXT, work TEXT, FOREIGN KEY(work) REFERENCES works(id), PRIMARY KEY(tag, work))")

    thumbnails_dir = args.destdir / 'site' / 'thumbnails'
    thumbnails_dir.mkdir(parents=True, exist_ok=True)
    async with dlsite_async.DlsiteAPI(locale=args.locale) as api:
        for work_path in (args.destdir / 'extract').iterdir():
            work_id = work_path.name
            # Works that already have a metadata row are done.
            res = cur.execute("SELECT id FROM works WHERE id = ?", (work_id,))
            if res.fetchone() is not None:
                continue

            if DLSITE_ID_REGEX.fullmatch(work_id):
                print(f'Fetching DLSite metadata for {work_id}')
                dlsite_metadata = await api.get_work(work_id)
                if dlsite_metadata.work_type not in [dlsite_async.WorkType.MANGA, dlsite_async.WorkType.CG_ILLUSTRATIONS]:
                    print(f'Work {work_id} is not a manga or CG set, skipping')
                    any_warnings = True
                    continue
                db_row = {
                    "id": work_id,
                    "title": dlsite_metadata.work_name,
                    "circle": dlsite_metadata.circle,
                    "date": dlsite_metadata.regist_date.date().isoformat(),
                    "description": dlsite_metadata.description,
                    "series": dlsite_metadata.series,
                }
                authors = dlsite_metadata.author or []
                tags = dlsite_metadata.genre or []
                thumbnail_url = dlsite_metadata.work_image
                # DLSite returns protocol-relative image URLs.
                if thumbnail_url.startswith('//'):
                    thumbnail_url = 'https:' + thumbnail_url
            else:
                db_row = manual_input_metadata(work_id)
                # authors/tags live in their own tables, not the works row.
                authors = db_row.pop('authors')
                tags = db_row.pop('tags')
                if FANZA_ID_REGEX.fullmatch(work_id):
                    # FANZA cover art lives at a predictable URL whose path
                    # segment depends on the work type; probe each candidate.
                    candidate_urls = [
                        f'https://doujin-assets.dmm.co.jp/digital/{work_type}/{work_id}/{work_id}pl.jpg'
                        for work_type in ['comic', 'cg']
                    ]
                    thumbnail_url = None
                    for url in candidate_urls:
                        h = requests.head(url, allow_redirects=False)
                        if h.status_code == 200:
                            thumbnail_url = url
                            break
                elif FAKKU_ID_REGEX.fullmatch(work_id):
                    # No predictable thumbnail; the site falls back to the
                    # first collated page.
                    thumbnail_url = None
                else:
                    thumbnail_url = input('Thumbnail image URL [default: first page]: ')

            cur.execute(
                "INSERT INTO works(id, title, circle, date, description, series) VALUES(:id, :title, :circle, :date, :description, :series)",
                db_row,
            )
            cur.executemany(
                "INSERT INTO authors VALUES(:author, :work)",
                [{"author": author, "work": work_id} for author in authors],
            )
            cur.executemany(
                "INSERT INTO tags VALUES(:tag, :work)",
                [{"tag": tag, "work": work_id} for tag in tags],
            )

            if thumbnail_url:
                ext = url_file_ext(thumbnail_url)
                dest_file = thumbnails_dir / (work_id + ext)
                print(f'Downloading thumbnail for {work_id} from {thumbnail_url}')
                with open(dest_file, 'wb') as fd:
                    with requests.get(thumbnail_url, stream=True) as r:
                        for chunk in r.iter_content(chunk_size=16384):
                            fd.write(chunk)

            # Commit per work so an interrupted run keeps its progress.
            con.commit()

    con.close()
    return any_warnings
2024-01-21 17:45:56 -05:00
def url_file_ext(url):
    """Return the file extension (dot included) of *url*'s path component.

    Query strings and fragments are ignored; '' when there is no extension.
    """
    path_component = urlparse(url).path
    return splitext(path_component)[1]
2024-01-21 17:45:56 -05:00
2024-01-22 02:16:06 -05:00
def fetch(args):
    """CLI entry point: fetch metadata/thumbnails, then optionally collate.

    Runs the async fetcher to completion; when args.auto is set and no
    warnings occurred, re-parses the collate stage's defaults into the
    same namespace and chains into collate().
    """
    any_warnings = asyncio.run(fetch_async(args))
    if args.auto and not any_warnings:
        parser_collate.parse_args(args=[], namespace=args)
        collate(args)
2024-01-21 17:45:56 -05:00
2024-03-02 18:10:22 -05:00
def self_and_parents(path):
    """Return *path* followed by all of its ancestors, nearest first."""
    return [path, *path.parents]
2024-03-01 23:43:38 -05:00
def collate(args):
    """Collate extracted works into flat, ordered page directories.

    Command-line expressions may pre-group or exclude specific extracted
    paths; everything else is handled by the Collator heuristics.  Pages
    are staged under site/images-staging/<work_id> and renamed into
    site/images/<work_id> only on success.  With args.auto and no
    warnings, chains into the generate stage.
    """
    extraction_dir = args.destdir / 'extract'

    def extracted_path_work_id(path):
        # Map a path under extract/ back to the top-level work-ID
        # directory containing it; None if the path lies outside.
        trail = self_and_parents(Path(relpath(path, extraction_dir)))
        if len(trail) < 2:
            return None
        result = trail[-2].name
        if result == '..':
            return None
        return result

    (raw_groups, raw_exclusions) = parse_expressions(args.expression)

    # Validate groups: every path in a group must exist and belong to the
    # same work; remember which works were explicitly mentioned.
    specified_works = set()
    works_groups = {}
    for group in raw_groups:
        if len(group) == 0:
            continue
        work_id = extracted_path_work_id(group[0])
        if not work_id:
            print(f'Group {group} contains paths outside an extracted work!')
            exit(1)
        if not all(extracted_path_work_id(item) == work_id for item in group[1:]):
            print(f'Group {group} contains paths from multiple works!')
            exit(1)
        specified_works.add(work_id)
        if work_id not in works_groups:
            works_groups[work_id] = []
        normalized_paths = [normalize_to(item, args.destdir) for item in group]
        if not all(path.exists() for path in normalized_paths):
            print(f'Group {group} contains nonexistent paths!')
            exit(1)
        works_groups[work_id].append(normalized_paths)

    # Validate exclusions the same way.
    exclusions = []
    for exclusion in raw_exclusions:
        work_id = extracted_path_work_id(exclusion)
        if not work_id:
            print(f'Excluded path {exclusion} does not belong to an extracted work!')
            exit(1)
        specified_works.add(work_id)
        normalized_path = normalize_to(exclusion, args.destdir)
        if not normalized_path.exists():
            print(f'Excluded path {exclusion} does not exist!')
            exit(1)
        exclusions.append(normalized_path)

    # Staging area mkdir is strict (no exist_ok): a leftover staging dir
    # from a crashed run must be inspected, not silently reused.
    collation_staging_area = args.destdir / 'site' / 'images-staging'
    collation_staging_area.mkdir(parents=True)
    collation_area = args.destdir / 'site' / 'images'
    collation_area.mkdir(parents=True, exist_ok=True)

    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    any_warnings = False
    for work_path in extraction_dir.iterdir():
        work_id = work_path.name
        if args.only_specified_works and work_id not in specified_works:
            continue
        work_collation_dir = collation_area / work_id
        if work_collation_dir.exists():
            if work_id not in specified_works:
                continue
            # Explicitly-specified works may recollate over an empty dir
            # only; a populated dir is a hard warning.
            if len(list(work_collation_dir.iterdir())) > 0:
                print(f'Collation directory for work {work_id} already exists!')
                any_warnings = True
                break
            else:
                work_collation_dir.rmdir()

        # Virtual works (containers for a series) and works without
        # metadata rows are not collated.
        virtual = cur.execute("SELECT virtual FROM works WHERE id = ?", (work_id,)).fetchone()
        if virtual in [(1,), None]:
            if work_id in specified_works:
                print(f'Work {work_id} {"is virtual" if virtual == (1,) else "has no metadata"}!')
                any_warnings = True
                break
            continue

        work_staging_dir = collation_staging_area / work_id

        collator = Collator(work_staging_dir, exclusions, args)
        for group in works_groups.get(work_id, [[work_path]]):
            collation_result = collator.collate_from_paths([item for item in group if item not in exclusions])
            if not collation_result:
                print(f'Unable to deduce file structure for {work_id} subgroup {[str(path) for path in group]}')
                break

        if collation_result and collator.index > 0:
            print(f'Collated {collator.index} pages for {work_id}')
            # Atomic publish: staging dir becomes the live collation dir.
            work_staging_dir.rename(work_collation_dir)
        else:
            # Clean up partial staging output before reporting.
            if work_staging_dir.is_dir():
                for f in work_staging_dir.iterdir():
                    f.unlink()
                work_staging_dir.rmdir()
            if not collation_result:
                print(f'Unable to deduce file structure for {work_id}, skipping')
            elif collator.index == 0:
                print(f'No files found for {work_id}, skipping')

            any_warnings = True

    collation_staging_area.rmdir()
    con.close()
    if args.auto and not any_warnings:
        parser_generate.parse_args(args=[], namespace=args)
        generate(args)
2024-03-01 23:43:38 -05:00
class Collator:
    """Deduces page order for one work and links pages into a staging dir.

    collate_from_paths() recursively applies a cascade of heuristics
    (locale selection, resolution/quality preference, PDF-vs-loose-images
    comparison, section splits, numeric filename ordering).  Heuristic
    helpers return False when they do not apply, None on hard failure,
    and a truthy result on success; self.index counts pages linked so far
    and doubles as the next page number.
    """

    def __init__(self, dest, exclude, args):
        # dest: staging directory to populate; exclude: normalized paths
        # to skip everywhere; args: parsed CLI namespace (locale,
        # pdf_strategy, ...).
        self.dest = dest
        self.exclude = exclude
        self.args = args
        self.index = 0

    def collate_from_paths(self, srcs):
        """Collate the files under *srcs*; True on success, None on failure."""
        # Drop sources with no non-excluded files anywhere beneath them.
        srcs = [src for src in srcs if len(descendant_files_ignore(src, self.exclude)) > 0]

        # A lone directory collates as its contents; a lone PDF as its pages.
        if len(srcs) == 1 and srcs[0].is_dir():
            return self.collate_from_paths(ls_ignore(srcs[0], self.exclude))

        if len(srcs) == 1 and is_pdf(srcs[0]):
            print(f'Extracting images from {srcs[0]}')
            return self.link_pdf(srcs[0])

        if len(srcs) == 0:
            return True

        debug(f'Auto-collating {srcs}')

        # Prefer the variant matching the configured locale, if present.
        select_language = self.try_collate_select_language(srcs)
        if select_language is not False:
            return select_language

        dirs = [src for src in srcs if src.is_dir()]
        non_dirs = [src for src in srcs if not src.is_dir()]
        # Exactly two dirs with equal file counts are likely two quality
        # variants of the same page set; keep the better one.
        if len(dirs) == 2 and len(descendant_files_ignore(dirs[0], self.exclude)) == len(descendant_files_ignore(dirs[1], self.exclude)):
            debug(f'Checking for image quality references between dirs {dirs[0]} and {dirs[1]}')
            resolution_matches = [IMAGE_RESOLUTION_REGEX.match(nname(src)) for src in dirs]
            if all(resolution_matches):
                debug(f'Directory names are resolutions')
                pairs = [(int(m.group('x')), int(m.group('y'))) for m in resolution_matches]
                for i in range(2):
                    # Keep the dir that is strictly larger in both axes.
                    if pairs[i][0] > pairs[1 - i][0] and pairs[i][1] > pairs[1 - i][1]:
                        return self.collate_from_paths([dirs[i]] + non_dirs)
            debug(f'Checking image quality regexes')
            for quality in IMAGE_QUALITY_REGEXES:
                def a_not_b(a, b, src):
                    # Prefer an explicit 'a' marker; otherwise accept the
                    # absence of the opposing 'b' marker.
                    if a in quality:
                        return quality[a].search(nname(src))
                    else:
                        return not quality[b].search(nname(src))
                better_dirs = [src for src in dirs if a_not_b('better', 'worse', src)]
                worse_dirs = [src for src in dirs if a_not_b('worse', 'better', src)]
                if len(better_dirs) == 1 and len(worse_dirs) == 1 and better_dirs[0] != worse_dirs[0]:
                    return self.collate_from_paths(better_dirs + non_dirs)

        images_vs_pdf = self.try_collate_images_vs_pdf(srcs)
        if images_vs_pdf is not False:
            return images_vs_pdf

        for regexes in SPLITS:
            split_attempt = self.try_collate_split_regex(srcs, **regexes)
            if split_attempt is not False:
                return split_attempt

        # Last resort: a flat set of images ordered by filename numbering.
        if all(src.is_file() and is_image(src) for src in srcs):
            ordering = complete_prefix_number_ordering(srcs)
            if ordering:
                print(f'Symlinking image files: {ordering[0]}...')
                return self.link_ordered_files(ordering)
            else:
                return None

        return None

    def link_pdf(self, src):
        """Extract the page images of the PDF at *src* into self.dest."""
        with fitz.open(src) as pdf:
            images = pdf_images(pdf, self.args.pdf_strategy)
            if images is None:
                print(f'Failed to enumerate page images in PDF {src}')
                return None

            self.dest.mkdir(parents=True, exist_ok=True)
            print(f'0 pages collated...', end='')
            for (idx, image) in enumerate(images, start=self.index):
                file_path = self.dest / f'{idx:04d}.{image["ext"]}'
                with open(file_path, 'wb') as f:
                    f.write(image["image"])
                # \x1b[2K\r: erase the line and rewrite the progress count.
                print(f'\x1b[2K\r{idx + 1 - self.index} pages collated...', end='')
            print()
            self.index += pdf.page_count

        return True

    def link_ordered_files(self, ordering):
        """Symlink *ordering* into self.dest under 4-digit page numbers."""
        self.dest.mkdir(parents=True, exist_ok=True)

        for (idx, src_path) in enumerate(ordering, start=self.index):
            ext = src_path.suffix.lower()
            link_path = self.dest / f'{idx:04d}{ext}'
            # Relative symlinks keep the library relocatable.
            link_path.symlink_to(relpath(src_path, self.dest))
        self.index += len(ordering)

        return True

    def try_collate_split_regex(self, srcs, earlier=None, later=None):
        """Partition *srcs* into early/middle/late sections by name regex.

        False when the partition leaves everything in one bucket;
        otherwise collates each non-empty section in order (None if any
        section fails, True when all succeed).
        """
        early_srcs = []
        middle_srcs = []
        late_srcs = []
        for src in srcs:
            if earlier and earlier.search(nname(src)):
                early_srcs.append(src)
            elif later and later.search(nname(src)):
                late_srcs.append(src)
            else:
                middle_srcs.append(src)

        if sum(1 for l in [early_srcs, middle_srcs, late_srcs] if l) <= 1:
            return False

        early_page_collation = self.collate_from_paths(early_srcs)
        if early_page_collation is None:
            return None

        middle_page_collation = self.collate_from_paths(middle_srcs)
        if middle_page_collation is None:
            return None

        late_page_collation = self.collate_from_paths(late_srcs)
        if late_page_collation is None:
            return None

        return True

    def try_collate_images_vs_pdf(self, srcs):
        """When sources are one PDF plus loose images of the same pages,
        keep whichever version has the better image sizes.

        False when the precondition does not hold (not exactly one PDF,
        non-image loose files, unavailable sizes, ...).
        """
        pdfs = [src for src in srcs if 'pdf' in src.name.lower()]
        if len(pdfs) != 1:
            return False
        outer_pdf = pdfs[0]

        inner_pdfs = [f for f in descendant_files_ignore(outer_pdf, self.exclude) if is_pdf(f)]
        if len(inner_pdfs) != 1:
            return False
        inner_pdf = inner_pdfs[0]

        non_pdf_srcs = [src for src in srcs if src != outer_pdf]
        images = []
        non_images = []
        descendant_files = [f for src in non_pdf_srcs for f in descendant_files_ignore(src, self.exclude)]
        for f in descendant_files:
            if is_image(f):
                images.append(f)
            else:
                non_images.append(f)
                # One non-image is enough to disqualify; stop scanning.
                break

        if len(non_images) != 0 or len(images) == 0:
            return False

        debug(f'Comparing PDF {inner_pdf} and images {images}')

        pdf_sizes = pdf_image_sizes(inner_pdf)
        standalone_sizes = [standalone_image_size(f) for f in images]

        median_pdf_size = median(pdf_sizes)
        median_standalone_size = median(standalone_sizes)
        if not (median_pdf_size and median_standalone_size):
            return False

        debug(f'PDF: {len(pdf_sizes)} images, {median_pdf_size}; standalone: {len(standalone_sizes)} images, median {median_standalone_size}')

        if abs(len(pdf_sizes) - len(standalone_sizes)) > 2:
            # Counts differ substantially — the PDF may slice each page
            # into horizontal strips.  Accept the loose images if, after
            # adjusting for strip height, the counts line up.
            with fitz.open(inner_pdf) as pdf:
                pdf_page_count = len(pdf)
            height_adjusted_pdf_image_count = (
                len(pdf_sizes) *
                mean([size[1] for size in pdf_sizes]) / mean([size[1] for size in standalone_sizes])
            )
            if (
                abs(pdf_page_count - len(standalone_sizes)) <= 2 and
                len(pdf_sizes) > len(standalone_sizes) and
                median_pdf_size[0] == median_standalone_size[0] and
                abs(height_adjusted_pdf_image_count - len(standalone_sizes)) <= 2
            ):
                return self.collate_from_paths(non_pdf_srcs)
            else:
                return False

        if superior_or_equal(median_standalone_size, median_pdf_size):
            return self.collate_from_paths(non_pdf_srcs)
        elif superior_or_equal(median_pdf_size, median_standalone_size):
            return self.collate_from_paths([outer_pdf])
        else:
            return False

    def try_collate_select_language(self, srcs):
        """When every source is language-tagged, keep the configured locale.

        False when the heuristic does not apply: unknown locale, some
        source lacks a language tag, or the filter would keep everything
        or nothing.
        """
        if self.args.locale not in LANGUAGE_REGEXES:
            return False
        if not all(any(lang.search(nname(src)) for lang in LANGUAGE_REGEXES.values()) for src in srcs):
            return False

        srcs_matching_language = [src for src in srcs if LANGUAGE_REGEXES[self.args.locale].search(nname(src))]
        if len(srcs_matching_language) == len(srcs) or len(srcs_matching_language) == 0:
            return False

        return self.collate_from_paths(srcs_matching_language)
2024-03-11 12:08:08 -04:00
def block_is_image(block):
    """True when a PyMuPDF text-page block tuple is an image block (type 1)."""
    block_type = block[6]
    return block_type == 1
def block_text(block):
    """Return the text payload (element 4) of a PyMuPDF block tuple."""
    (payload,) = block[4:5]
    return payload
def block_relevant(block):
    """A block matters unless its text is generator boilerplate (TCPDF)."""
    if block_is_image(block):
        return True
    return not IRRELEVANT_PDF_BLOCK_REGEX.search(block_text(block))
def relevant_blocks(page):
    """Return the page's text/image blocks with boilerplate filtered out."""
    return [candidate for candidate in page.get_text('blocks') if block_relevant(candidate)]
def is_single_image(page):
    """True when the page holds exactly one relevant block and it is an image."""
    blocks = relevant_blocks(page)
    if len(blocks) != 1:
        return False
    return block_is_image(blocks[0])
2024-03-02 18:27:15 -05:00
2024-03-03 01:47:29 -05:00
def extract_image(pdf, xref):
    """Extract image *xref* from *pdf*, re-encoding to PNG if needed.

    Returns a dict with 'ext' and 'image' (raw bytes), matching the
    shape of fitz's Document.extract_image().  Formats outside
    IMAGE_FILE_EXTENSIONS (e.g. jb2, jpx) are rasterized via Pixmap.
    """
    image = pdf.extract_image(xref)
    if f'.{image["ext"]}' in IMAGE_FILE_EXTENSIONS:
        return image
    print(f'Converting image from {image["ext"]} to png')
    pix = fitz.Pixmap(pdf, xref)
    return {'ext': 'png', 'image': pix.tobytes('png')}
2024-03-16 01:51:01 -04:00
def get_displayed_image_xref(page):
    """Return the xref of the single image the page's content stream draws.

    Scans the raw content streams for "/Name Do" xobject draws.  Returns
    None when the page uses inline images, draws zero or multiple
    xobjects, or the drawn name cannot be matched to exactly one image.
    """
    ref_names = []
    for content_xref in page.get_contents():
        # Content streams are mostly ASCII operators; undecodable bytes
        # are replaced rather than raising.
        content = page.parent.xref_stream(content_xref).decode('ascii', 'replace')
        if PDF_INLINE_IMAGE_REGEX.search(content):
            debug('Inline image detected')
            return None
        for m in PDF_REFERENCED_IMAGE_REGEX.finditer(content):
            ref_names.append(m.group('ref_name'))

    if len(ref_names) == 0:
        debug('Page does not reference any xobjects')
        return None
    if len(ref_names) > 1:
        debug(f'Page references multiple xobjects: {ref_names}')
        return None

    # page.get_images() rows carry the xref at index 0 and the reference
    # name at index 7.
    image_xrefs = [image[0] for image in page.get_images() if image[7] == ref_names[0]]
    if len(image_xrefs) == 1:
        return image_xrefs[0]

    if len(image_xrefs) == 0:
        debug(f'No images found matching ref name {ref_names[0]}')
    else:
        debug(f"Multiple images found matching ref name {ref_names[0]}, that probably shouldn't happen")
    return None
2024-04-04 08:15:18 -04:00
def display_sixel_pixmap(pixmap_bytes):
    """Render PNG-encoded *pixmap_bytes* in the terminal as sixel graphics.

    Picks a libsixel dither/pixel format based on the decoded Pillow
    image mode; raises RuntimeError on an unexpected mode.  The libsixel
    handles are reference-counted, hence the nested finally/unref calls.
    """
    s = BytesIO()
    image = Image.open(BytesIO(pixmap_bytes))
    width, height = image.size
    try:
        data = image.tobytes()
    except NotImplementedError:
        # Very old Pillow releases only provide tostring().
        data = image.tostring()
    output = sixel_output_new(lambda data, s: s.write(data), s)
    try:
        if image.mode == 'RGBA':
            dither = sixel_dither_new(256)
            sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGBA8888)
        elif image.mode == 'RGB':
            dither = sixel_dither_new(256)
            sixel_dither_initialize(dither, data, width, height, SIXEL_PIXELFORMAT_RGB888)
        elif image.mode == 'P':
            # Palettized image: hand libsixel the palette directly.
            palette = image.getpalette()
            dither = sixel_dither_new(256)
            sixel_dither_set_palette(dither, palette)
            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_PAL8)
        elif image.mode == 'L':
            dither = sixel_dither_get(SIXEL_BUILTIN_G8)
            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G8)
        elif image.mode == '1':
            dither = sixel_dither_get(SIXEL_BUILTIN_G1)
            sixel_dither_set_pixelformat(dither, SIXEL_PIXELFORMAT_G1)
        else:
            raise RuntimeError('unexpected image mode')
        try:
            sixel_encode(data, width, height, 1, dither, output)
            print(s.getvalue().decode('ascii'))
        finally:
            sixel_dither_unref(dither)
    finally:
        sixel_output_unref(output)
2024-03-12 15:50:12 -04:00
def pdf_images(pdf, strategy):
    """Yield one {'ext', 'image'} dict per usable page of *pdf*.

    Pages consisting of a single image are extracted losslessly.  For
    anything else, *strategy* (or an interactive prompt when the preset
    does not apply) decides: [n]ope out (abort, returns None), [c]onvert
    the page to PNG, e[x]tract the lone image anyway, [d]rop the page,
    or [s]how a sixel preview before choosing.  Returns a lazy generator
    of extraction results, or None when aborted.
    """
    print(f'0/{pdf.page_count} pages analyzed...', end='')
    image_extractors = []
    for (idx, page) in enumerate(pdf):
        xref = get_displayed_image_xref(page)
        if xref is not None and is_single_image(page):
            # Bind loop values as lambda defaults so extraction is
            # deferred without late-binding surprises.
            image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
        else:
            page_images = page.get_image_info()
            print(f'\nPage {idx + 1}: {len(page_images)} images, {len(relevant_blocks(page))} total relevant objects')
            choice = strategy
            while True:
                if choice.lower().startswith('n'):
                    return None
                if choice.lower().startswith('c'):
                    # Only announce when the preset strategy acted silently.
                    if choice == strategy:
                        print(f'Converting page {idx + 1}')
                    image_extractors.append(lambda p=page: {'ext': 'png', 'image': p.get_pixmap(dpi=PDF_CONVERSION_DPI).tobytes('png')})
                    break
                if xref is not None and (choice.lower().startswith('x') or choice.lower() == 'extract'):
                    if choice == strategy:
                        print(f'Extracting image from page {idx + 1} without text')
                    image_extractors.append(lambda p=pdf, x=xref: extract_image(p, x))
                    break
                if choice.lower().startswith('d'):
                    if choice == strategy:
                        print(f'Dropping page {idx + 1}')
                    break
                if choice.lower().startswith('s'):
                    # Preview the rasterized page, and (when available)
                    # the embedded image by itself at half size.
                    display_sixel_pixmap(page.get_pixmap(dpi=PDF_PREVIEW_DPI).tobytes('png'))
                    if xref is not None:
                        pixmap = fitz.Pixmap(pdf, xref)
                        pixmap.shrink(2)
                        display_sixel_pixmap(pixmap.tobytes('png'))
                choice = input(f'[N]ope out / [c]onvert page{"" if xref is None else " / e[x]tract image"} / [d]rop page / [s]how page? [n/c{"" if xref is None else "/x"}/d/s] ')
        # \x1b[2K\r: erase the progress line and rewrite it in place.
        print(f'\x1b[2K\r{idx + 1}/{pdf.page_count} pages analyzed...', end=('' if idx + 1 < pdf.page_count else '\n'))

    return (extractor() for extractor in image_extractors)
2024-01-22 10:36:20 -05:00
2024-02-16 16:08:56 -05:00
def nfc(s):
    """Return *s* normalized to Unicode NFC (canonical composition)."""
    composed = unicodedata.normalize('NFC', s)
    return composed
def nname(entry):
    """NFC-normalized name of a directory entry (Path-like with a .name)."""
    return nfc(entry.name)
2024-01-21 17:45:56 -05:00
def complete_prefix_number_ordering(entries):
    """Try to derive a single linear ordering for *entries* based on
    numeric prefixes and "alternate version" markers in their names.

    Returns the fully ordered list of entries, or None when no confident
    ordering could be derived.
    """
    if len(entries) == 1:
        return entries

    # Partition entries by which ALT_VERSIONS markers appear in their name.
    # ALT_VERSIONS is a module-level list defined elsewhere in this file;
    # each marker contributes one bit to the version code.
    entries_by_version = {}
    for entry in entries:
        version_code = 0
        for (i, version) in enumerate(ALT_VERSIONS):
            if version in nname(entry):
                version_code |= (1 << i)
        entries_by_version.setdefault(version_code, []).append(entry)

    # Each version bucket must independently admit a hierarchical numbering.
    numberings_by_version = {ver: unique_hierarchical_prefix_numbering(entries_by_version[ver]) for ver in entries_by_version}

    # Union of all index tuples seen across versions.
    unified_indices = set()
    for numbering in numberings_by_version.values():
        if numbering is None:
            return None
        unified_indices |= set(numbering.keys())
    unified_indices.discard(None)
    unified_indices = list(unified_indices)
    unified_indices.sort()

    # Reject orderings with large gaps between consecutive indices at any
    # hierarchy level — big jumps suggest the numbers are not page/part
    # numbers at all.
    min_delta_by_level = {}
    if len(unified_indices) > 1:
        for i in range(1, len(unified_indices)):
            cur = unified_indices[i]
            prev = unified_indices[i - 1]
            for level in range(min(len(cur), len(prev))):
                # NOTE(review): the 0 -> 5 jump is deliberately tolerated
                # (not counted as a delta) — presumably a common numbering
                # idiom in the source data; confirm against real inputs.
                if cur[level] != prev[level] and not (cur[level] == 5 and prev[level] == 0):
                    delta = cur[level] - prev[level]
                    min_delta_by_level[level] = min(min_delta_by_level.get(level, delta), delta)
    if any(delta > 2 for delta in min_delta_by_level.values()):
        return None

    # None key = single unnumbered entry buckets; order them last.
    unified_indices.append(None)

    # Versions with roughly as many entries as the base version are
    # "outer" (full alternate runs, concatenated one after another);
    # smaller versions are "inner" (interleaved within the base run).
    versions = list(numberings_by_version.keys())
    versions.sort()
    version_lengths = {ver: len(numberings_by_version[ver]) for ver in numberings_by_version}
    inner_versions = []
    outer_versions = [versions[0]]
    for ver in versions[1:]:
        if version_lengths[ver] >= version_lengths[versions[0]] - 2:
            outer_versions.append(ver)
        else:
            inner_versions.append(ver)
    result = []
    for out_ver in outer_versions:
        for i in unified_indices:
            for ver in ([out_ver] + (inner_versions if out_ver == versions[0] else [])):
                result += numberings_by_version[ver].get(i, [])
    return result
2024-02-06 11:02:08 -05:00
def unique_hierarchical_prefix_numbering(entries, start_point=0):
    """Assign each entry a unique tuple index based on numbers appearing
    after a shared name prefix, recursing into deeper number groups when
    a single index is shared by several entries.

    Returns a dict mapping index tuples -> [entry], {None: entries} for a
    single unnumbered entry, or None when no unique numbering exists.
    Numbers located before *start_point* in the name are not considered.
    """
    # A lone entry without any number needs no numbering.
    if len(entries) == 1 and not NUMBER_REGEX.search(nname(entries[0])):
        return {None: entries}

    debug(f'Finding unique hierarchical prefix ordering from start point {start_point} for {entries}')

    # Candidate prefixes come from the longest name; try the rightmost
    # number position first and work backwards.
    longest_entry = max(entries, key=lambda e: len(nname(e)))
    matches = reversed(list(NUMBER_REGEX.finditer(nname(longest_entry))))
    for m in matches:
        pos = m.start()
        if pos < start_point:
            # All remaining candidates are before the allowed region.
            return None
        prefix = nname(longest_entry)[:pos]
        debug(f'Checking prefix {prefix}')
        # Every entry must either share the prefix, or be a pure prefix of
        # it (e.g. a cover file named exactly like the common stem).
        if all(nname(e).startswith(prefix) or prefix.startswith(nfc(e.stem)) for e in entries):
            numbering = {}
            for e in entries:
                if pos >= len(nfc(e.stem)):
                    # Name ends before the number position: treat as index 0.
                    i = 0
                else:
                    n = NUMBER_REGEX.match(nname(e)[pos:])
                    if n is None:
                        return None
                    i = int(n.group())
                numbering.setdefault((i,), []).append(e)
            # Resolve collisions by recursing on the text after the number.
            indices = list(numbering.keys())
            for idx in indices:
                if len(numbering[idx]) > 1:
                    ents_idx = numbering.pop(idx)
                    debug(f'Index {idx} has multiple entries')
                    longest = max(ents_idx, key=lambda e: len(nname(e)))
                    next_match = NUMBER_REGEX.match(nname(longest)[pos:])
                    if not next_match:
                        return None
                    next_layer_start = pos + next_match.end()
                    # Fall back to single-letter suffixes (a, b, c...) when
                    # no deeper numeric layer exists.
                    sub_numbering = unique_hierarchical_prefix_numbering(ents_idx, start_point=next_layer_start) or alphabetic_numbering(ents_idx, next_layer_start)
                    if not sub_numbering:
                        return None
                    for sub_idx in sub_numbering:
                        numbering[(*idx, *sub_idx)] = sub_numbering[sub_idx]
            return numbering

    return None
2024-02-06 12:19:11 -05:00
def alphabetic_numbering(entries, start_point):
    """Number entries by a single trailing letter after *start_point*:
    no letter -> 0, 'a'/'A' -> 1, ... 'z' -> 26.

    Returns a dict mapping (index,) -> [entry], or None when any ending is
    longer than one character, not a letter, or duplicated.
    """
    debug(f'Finding alphabetic numbering from start point {start_point} for {entries}')
    alphabetized = {}
    for entry in entries:
        # NOTE(review): strip set reconstructed as '-_()' from mangled
        # source — confirm whether a space character belongs in it.
        ending = nfc(entry.stem)[start_point:].strip('-_()')
        debug(f'{entry} has ending {ending}')
        if len(ending) > 1:
            debug('Ending is more than one character, giving up')
            return None
        index = 0 if ending == '' else ord(ending.lower()) - ord('a') + 1
        if index < 0 or index > 26:
            debug('Ending is not a letter, giving up')
            return None
        if (index,) in alphabetized:
            debug(f'Index value {index} is already present, giving up')
            return None
        alphabetized[(index,)] = [entry]
    return alphabetized
2024-02-07 19:18:19 -05:00
def check_extension(path, exts):
    """Case-insensitively test whether *path*'s suffix is one of *exts*."""
    suffix = path.suffix.lower()
    return any(suffix == ext for ext in exts)
def is_pdf(path):
    """True when *path* has a .pdf extension (case-insensitive)."""
    return check_extension(path, ['.pdf'])
2024-02-07 19:18:19 -05:00
def is_image(path):
    """True when *path* has one of the recognized image extensions
    (IMAGE_FILE_EXTENSIONS is defined elsewhere in this file)."""
    return check_extension(path, IMAGE_FILE_EXTENSIONS)
2024-02-07 19:18:19 -05:00
2024-02-06 23:01:59 -05:00
def ignoreable(path):
    """True for files to skip entirely, matched by exact name or extension
    (IGNOREABLE_FILES / IGNOREABLE_EXTENSIONS are module-level constants)."""
    return path.name in IGNOREABLE_FILES or check_extension(path, IGNOREABLE_EXTENSIONS)
2024-02-06 23:01:59 -05:00
2024-02-07 22:32:31 -05:00
def ls_ignore(directory, exclude):
    """List *directory*'s children, dropping ignoreable entries and any
    path present in the *exclude* collection."""
    listed = []
    for child in directory.iterdir():
        if ignoreable(child) or child in exclude:
            continue
        listed.append(child)
    return listed
2024-02-07 22:32:31 -05:00
def descendant_files_ignore(path, exclude):
    """Recursively collect all files under *path*, honoring the same
    ignore rules and *exclude* list as ls_ignore.

    A file argument is returned as a one-element list.
    """
    if path.is_file():
        return [path]

    collected = []
    for child in ls_ignore(path, exclude):
        if child.is_dir():
            collected += descendant_files_ignore(child, exclude)
        else:
            collected.append(child)
    return collected
def standalone_image_size(filepath):
    """(width, height) of a standalone image file, read via Pillow."""
    with Image.open(filepath) as im:
        return im.size
def pdf_image_sizes(filepath):
    """List of (width, height) for each distinct image embedded in a PDF.

    Images are deduplicated by xref, so one reused across pages counts once.
    """
    sizes_by_xref = {}
    with fitz.open(filepath) as pdf:
        for page in pdf:
            for (xref, _, width, height, *_) in page.get_images():
                if xref in sizes_by_xref:
                    continue
                sizes_by_xref[xref] = (width, height)
    return list(sizes_by_xref.values())
def median(items):
    """Return the upper median of *items*, or None when empty.

    Fix: the original sorted the caller's list in place as a hidden side
    effect; this version sorts a copy and leaves the argument untouched.
    """
    if not items:
        return None
    return sorted(items)[len(items) // 2]
2024-02-15 19:32:50 -05:00
def mean(items):
    """Arithmetic mean of *items*, or None for an empty sequence."""
    if not items:
        return None
    return sum(items) / len(items)
2024-02-07 19:11:37 -05:00
def superior_or_equal(a, b):
    """True when *a* is at least as long as *b* and elementwise >= over
    b's length."""
    if len(a) < len(b):
        return False
    return all(x >= y for (x, y) in zip(a, b))
2024-01-22 03:49:00 -05:00
2024-02-07 22:32:31 -05:00
def parse_expressions(tokens):
    """Split a flat token stream into (inclusion groups, exclusions).

    Consumes *tokens* destructively. '!' introduces an exclusion; '(' opens
    a parenthesized group; any other token is its own one-element group.
    """
    include_groups = []
    excluded = []
    while tokens:
        tok = tokens.pop(0)
        if tok == '!':
            excluded.extend(parse_exclusion(tokens))
        elif tok == '(':
            include_groups.append(parse_group(tokens))
        else:
            include_groups.append([tok])
    return (include_groups, excluded)
def parse_exclusion(tokens):
    """Consume one exclusion: a parenthesized group or a single token."""
    tok = tokens.pop(0)
    return parse_group(tokens) if tok == '(' else [tok]
def parse_group(tokens):
    """Consume tokens up to (and including) the matching ')' and return
    the tokens inside the group."""
    members = []
    while (tok := tokens.pop(0)) != ')':
        members.append(tok)
    return members
def normalize_to(path, ref):
    """Re-express *path* as *ref* joined with the relative path from
    *ref* to *path* (no symlink/.. resolution is performed)."""
    rel = relpath(path, ref)
    return ref / Path(rel)
2024-01-22 03:49:00 -05:00
2024-02-06 23:52:59 -05:00
def fmt_size(s):
    """Format a (width, height) pair as 'WxHpx'."""
    (w, h) = s
    return f'{w}x{h}px'
def analyze(args):
    """Print every file under the work's extraction folder, annotated with
    image dimensions (standalone images) or embedded-image statistics
    (PDFs), to help a human decide how to collate the work."""
    extract_dir = args.destdir / 'extract'
    files = descendant_files_ignore(extract_dir / args.work_id, [])
    files.sort()
    for f in files:
        print(f'{relpath(f, extract_dir)}', end='')
        if is_image(f):
            size = standalone_image_size(f)
            print(f'\t{fmt_size(size)}')
        elif is_pdf(f):
            sizes = pdf_image_sizes(f)
            if len(sizes) == 0:
                print('\tContains no images')
            else:
                print(f'\t{len(sizes)} images, median {fmt_size(median(sizes))}, min {fmt_size(min(sizes))}, max {fmt_size(max(sizes))}')
        else:
            # Neither image nor PDF: just end the line.
            print()
2024-01-22 02:16:06 -05:00
def metadata(args):
    """Show, and optionally update, the stored metadata for one work.

    When --virtual/--no-virtual was given (args.virtual is not None), the
    work's `virtual` flag is updated before the record is displayed.

    Fix: the original leaked the sqlite connection on the early "not found"
    return path; the connection is now always closed via try/finally.
    """
    con = sqlite3.connect(args.destdir / 'meta.db')
    try:
        cur = con.cursor()
        if args.virtual is not None:
            # BooleanOptionalAction: True/False when given, None when omitted.
            cur.execute("UPDATE works SET virtual = ? WHERE id = ?", (1 if args.virtual else 0, args.work_id))
            con.commit()
        res = cur.execute(
            "SELECT title, circle, date, description, series, virtual FROM works WHERE id = ?",
            (args.work_id,),
        ).fetchone()
        if res is None:
            print(f'Work id {args.work_id} not found!')
            return
        (title, circle, date, description, series, virtual) = res
        print(f'Work ID: {args.work_id}')
        print(f'Title: {title}')
        print(f'Circle: {circle}')
        print(f'Pub date: {date}')
        print(f'Description: {description}')
        print(f'Series: {series}')
        print(f'Virtual: {"Yes" if virtual == 1 else "No"}')
    finally:
        con.close()
2024-01-22 02:16:06 -05:00
2024-01-23 17:35:01 -05:00
def copy_recursive(src, dest):
    """Recursively copy the contents of *src* into *dest*, creating
    *dest* as needed.

    Real subdirectories are descended into; everything else (including
    symlinked directories) is copied as file contents, since copyfile
    follows symlinks.
    """
    dest.mkdir(parents=True, exist_ok=True)
    for child in src.iterdir():
        target = dest / child.name
        if child.is_dir() and not child.is_symlink():
            copy_recursive(child, target)
        else:
            shutil.copyfile(child, target)
2024-01-22 07:01:41 -05:00
2024-03-03 02:56:46 -05:00
# Cache of computed scores, keyed by the canonically-ordered string pair.
memoized_similarities = {}

def similarity(a, b):
    """Similarity score between two strings.

    A common substring of length k found at each step contributes k*k; the
    score is the maximum over all ways of matching successive prefixes of
    the shorter string inside the longer one (or skipping characters).
    """
    # Canonicalize argument order so each pair has one memo key.
    if len(a) < len(b) or (len(a) == len(b) and a < b):
        (shorter, longer) = (a, b)
    else:
        (shorter, longer) = (b, a)
    if not shorter:
        return 0
    key = (shorter, longer)
    if key in memoized_similarities:
        return memoized_similarities[key]

    # Baseline option: drop shorter's first character entirely.
    best = similarity(shorter[1:], longer)
    # Try matching each prefix of the shorter string inside the longer one,
    # scoring i*i for a length-i match plus the best score of the remainders.
    for i in range(1, len(shorter) + 1):
        found_at = longer.find(shorter[:i])
        if found_at == -1:
            break
        candidate = i * i + similarity(shorter[i:], longer[found_at + i:])
        if candidate > best:
            best = candidate

    memoized_similarities[key] = best
    return best
2024-03-19 15:12:49 -04:00
def top(items, n, key, overflow=0):
    """Return up to n (+overflow) items with the highest key() scores.

    Ties at the score cutoff are kept during scanning; items within the
    same score tier end up in random order (shuffle + stable sort), and
    at most n + overflow items are returned.
    """
    ranked = []  # (item, score) pairs, kept sorted by descending score
    for candidate in items:
        score = key(candidate)
        if len(ranked) < n or score >= ranked[-1][1]:
            # Insert before the first entry this score ties or beats.
            insert_at = len(ranked)
            for pos in range(len(ranked)):
                if score >= ranked[pos][1]:
                    insert_at = pos
                    break
            ranked.insert(insert_at, (candidate, score))
            # Trim the tail only when it falls strictly below the cutoff,
            # so ties at position n survive for the overflow allowance.
            while len(ranked) > n and ranked[-1][1] < ranked[n - 1][1]:
                ranked.pop()
    # shuffle followed by stable sort to randomly shuffle within each score tier
    random.shuffle(ranked)
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return [item for (item, _) in ranked[:n + overflow]]
2024-03-03 02:56:46 -05:00
2024-01-23 15:54:17 -05:00
def generate(args):
    """Render the whole static site under <destdir>/site: a page and image
    viewer per work, categorization indexes (authors/tags/circles/series),
    packaged static assets, and the top-level index.

    Only works that already have a collated image folder are included.
    """
    jenv = Environment(
        loader=PackageLoader("dlibrary"),
        autoescape=select_autoescape()
    )
    viewer_template = jenv.get_template("viewer.html")
    list_template = jenv.get_template("list.html")
    categorization_template = jenv.get_template("categorization.html")
    work_template = jenv.get_template("work.html")
    index_template = jenv.get_template("index.html")

    con = sqlite3.connect(args.destdir / 'meta.db')
    cur = con.cursor()

    site_dir = args.destdir / 'site'
    # A work counts as collated iff it has an images/<id>/ directory.
    collated_work_ids = {p.name for p in (site_dir / 'images').iterdir()}

    works = []
    for (work_id, title, circle, date, description, series) in cur.execute('SELECT id, title, circle, date, description, series FROM works ORDER BY date DESC').fetchall():
        if work_id not in collated_work_ids:
            continue
        authors = [author for (author,) in cur.execute('SELECT author FROM authors WHERE work = ?', (work_id,))]
        tags = [tag for (tag,) in cur.execute('SELECT tag FROM tags WHERE work = ?', (work_id,))]
        images = [path.name for path in (site_dir / 'images' / work_id).iterdir()]
        images.sort()
        try:
            # Prefer a fetched thumbnail file; fall back to the first page.
            thumbnail_path = relpath(next(
                f for f in (site_dir / 'thumbnails').iterdir() if f.stem == work_id
            ), site_dir)
        except StopIteration:
            thumbnail_path = f'images/{work_id}/{images[0]}'
        work = {
            'id': work_id,
            'title': title,
            'circle': circle,
            'date': date,
            'description': description,
            'series': series,
            'authors': authors,
            'tags': tags,
            'thumbnail_path': thumbnail_path,
            'images': images,
        }
        works.append(work)

    for (idx, work) in enumerate(works):
        # Rank other works for the "suggested" section: same series wins
        # (score -1 beats typical similarity floor), then title similarity;
        # the work itself scores lowest so it is never suggested.
        def suggestion_priority(other_work):
            if other_work is work:
                return -2
            if work['series'] and work['series'] == other_work['series']:
                return -1
            return similarity(work['title'], other_work['title'])
        suggested = top(works, SUGGESTED_WORKS_COUNT, suggestion_priority)

        work_dir = site_dir / 'works' / work['id']
        viewer_dir = work_dir / 'view'
        viewer_dir.mkdir(parents=True, exist_ok=True)
        with open(work_dir / 'index.html', 'w') as f:
            f.write(work_template.render(depth=2, work=work, title=work['title'], suggested=suggested))
        with open(viewer_dir / 'index.html', 'w') as f:
            f.write(viewer_template.render(depth=3, work=work, title=work['title']))

        # One-line progress indicator, rewritten in place.
        print(f'\x1b[2K\r{idx+1}/{len(works)} works processed...', end=('' if idx + 1 < len(works) else '\n'))

    # Unicode-collation sort key so category names sort naturally.
    uca = pyuca.Collator().sort_key

    def make_categorization(categorization, query, work_filter, work_style_cards=False):
        # Emit one listing page per category plus an index of all
        # categories, each with a sample work for its card.
        categorization_dir = site_dir / categorization

        cats = sorted((cat for (cat,) in cur.execute(query)), key=uca)
        cat_samples = {}
        for cat in cats:
            cat_works = list(filter(work_filter(cat), works))
            cat_samples[cat] = cat_works[0] if len(cat_works) > 0 else None
            # Category names may contain '/', which would break paths.
            safeish_cat = cat.replace('/', '')
            cat_dir = categorization_dir / safeish_cat
            cat_dir.mkdir(parents=True, exist_ok=True)
            with open(cat_dir / 'index.html', 'w') as f:
                f.write(list_template.render(
                    depth=2,
                    works=cat_works,
                    title=cat,
                    categorization=categorization,
                ))
        categorization_dir.mkdir(parents=True, exist_ok=True)
        with open(categorization_dir / 'index.html', 'w') as f:
            f.write(categorization_template.render(
                depth=1,
                categorization=categorization,
                categories=cats,
                samples=cat_samples,
                work_style_cards=work_style_cards,
            ))

    make_categorization(
        'authors',
        'SELECT DISTINCT author FROM authors',
        lambda author: lambda work: author in work['authors'],
    )
    make_categorization(
        'tags',
        'SELECT DISTINCT tag FROM tags',
        lambda tag: lambda work: tag in work['tags'],
    )
    make_categorization(
        'circles',
        'SELECT DISTINCT circle FROM works WHERE circle NOT NULL',
        lambda circle: lambda work: work['circle'] == circle,
    )
    make_categorization(
        'series',
        'SELECT DISTINCT series FROM works WHERE series NOT NULL',
        lambda series: lambda work: work['series'] == series,
        work_style_cards=True,
    )

    # Copy the packaged static assets (CSS/JS/etc.) into the site tree.
    with resources.as_file(resources.files("dlibrary")) as r:
        copy_recursive(r / 'static', site_dir / 'static')

    with open(site_dir / 'index.html', 'w') as f:
        f.write(index_template.render(depth=0, works=works))
    con.close()
2024-01-22 02:16:06 -05:00
2024-01-23 15:54:17 -05:00
# Top-level CLI parser; subcommands are registered on `subparsers` below.
argparser = argparse.ArgumentParser(
    prog='dlibrary',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        Organize DRM-free works purchased from DLSite into a library
        that can be viewed in a web browser.

        Intended workflow:
        - `extract` a collection of archive files into DLibrary's data
          directory, automatically giving each work its own subfolder.
        - `fetch` metadata and thumbnail images for extracted works
          from DLSite.
        - `collate` extracted works, producing a single sequence of
          image files (or symlinks into the extracted data, when
          possible) for each work.
        - Manually adjust works' `metadata` when necessary.
        - `generate` a static website providing a catalog and viewer
          for all collated works.
    """),
)

argparser.add_argument(
    '-d', '--destdir',
    type=Path,
    default=Path(os.getenv('DLIBRARY_DIR', './dlibrary')),
    help='directory to store dlibrary content and metadata to (default: $DLIBRARY_DIR or ./dlibrary)',
)
argparser.add_argument(
    '-D', '--debug',
    action='store_true',
    help='print out debugging info',
)
argparser.add_argument(
    '-l', '--locale',
    type=str,
    default=os.getenv('DLIBRARY_LOCALE', 'en_US'),
    help=('preferred locale for requesting metadata and collating (e.g. "ja_JP", "en_US"). '
          'May still fall back to Japanese if other languages are unavailable. '
          '(default: $DLIBRARY_LOCALE or en_US)'),
)
argparser.add_argument(
    '-a', '--auto',
    action='store_true',
    help='automatically continue the extract->fetch->collate->generate pipeline starting from whatever subcommand is being run',
)

subparsers = argparser.add_subparsers(title="subcommands", required=True)
2024-01-22 02:16:06 -05:00
2024-03-15 16:00:37 -04:00
# `extract` subcommand: unpack archives into the extraction directory.
parser_extract = subparsers.add_parser('extract', aliases=['x'], help='extract archive files')
parser_extract.add_argument(
    '-r', '--remove',
    action='store_true',
    help='remove original archive files after extraction',
)
parser_extract.add_argument(
    'archives',
    metavar='FILE',
    type=Path,
    nargs='+',
    help='archive files to extract',
)
parser_extract.set_defaults(func=extract)
2024-03-12 02:16:47 -04:00
# `fetch` subcommand: download metadata/thumbnails; takes no extra options.
parser_fetch = subparsers.add_parser('fetch', aliases=['f'], help='fetch metadata and thumbnails')
parser_fetch.set_defaults(func=fetch)
2024-01-23 15:54:17 -05:00
# `collate` subcommand: turn extracted works into ordered image sequences.
parser_collate = subparsers.add_parser(
    'collate',
    aliases=['c'],
    help='collate works into sequences of image files',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        For each extracted work that has not already been collated,
        DLibrary will attempt to intuit its structure and create
        a single ordered list of image files in the site data
        directory. Each image will either be a symlink to an image
        file in the extraction folder, or a single page extracted
        from a PDF file.

        DLibrary may fail to automatically collate a work if its
        files and subdirectories are not named in a way that
        indicates a clear linear ordering. In order to assist with
        collation, you can provide a list of expressions specifying
        where to start traversing the directory structure, what
        files to include in what order, and/or what files to ignore
        entirely.

        An expression can be:

        PATH
        A single path. If this is an image, it will be appended to
        the sequence of collated images for the work it belongs to;
        if this is a PDF, images will be extracted from it and
        concatenated to the sequence; if this is a directory, the
        contents of the directory will be automatically collated
        using DLibrary's default heuristics, and concatenated
        to the sequence.

        ( PATH [PATH ...] )
        A group of paths contained in parentheses. You may need to escape
        the parentheses to avoid them getting parsed by your shell.
        All the paths in this group will be considered together, and
        automatically collated using the default heuristics, regardless
        of what order the paths are provided in.

        ! PATH
        ! ( PATH [PATH ...] )
        A path or group of paths to exclude from collation. You may
        need to escape the !. If an excluded path appears within any
        of the other specified paths, it will be skipped by the collation
        heuristics.

        If the only expressions provided are negations, then auto-collation
        will start from the top level of the extracted work while skipping
        the excluded paths.

        All provided paths must be under $DLIBRARY_DIR/extract/[work id]/
        for some not-yet-collated work. Paths belonging to multiple
        different works can all be provided on the same command line, and
        expressions will be clustered together by work id while otherwise
        preserving the order they were provided in. A parenthesized group
        expression must only contain paths belonging to a single work.

        By default, DLibrary will attempt to collate every not-yet-collated
        work (excluding "virtual" works), using the provided expressions
        to assist in collation when available. The `-o` flag will direct
        DLibrary to *only* collate works included in the provided expressions,
        even if other uncollated works are present.
    """),
)

parser_collate.add_argument(
    '-o', '--only-specified-works',
    action='store_true',
    help="only collate works that are explicitly specified",
)
parser_collate.add_argument(
    '-p', '--pdf-strategy',
    choices=[
        'ask', '?',
        'show-ask', 's',
        'convert', 'c',
        'extract', 'x',
        'drop', 'd',
        'nope', 'n'
    ],
    default='show-ask',
    help="how to handle PDF pages that aren't a single image with no text",
)
parser_collate.add_argument(
    'expression',
    nargs='*',
    help='expressions indicating paths to collate or skip',
)
parser_collate.set_defaults(func=collate)
2024-01-22 03:49:00 -05:00
2024-03-12 02:16:47 -04:00
# `analyze` subcommand: inspect one extracted work's files and image sizes.
parser_analyze = subparsers.add_parser('analyze', aliases=['a'], help='analyze an extracted folder to assist in collation')
parser_analyze.add_argument('work_id')
parser_analyze.set_defaults(func=analyze)
2024-03-12 02:16:47 -04:00
# `metadata` subcommand: show or modify a single work's stored metadata.
parser_metadata = subparsers.add_parser('metadata', aliases=['m'], help='view or modify metadata for a work')
parser_metadata.add_argument('work_id')
parser_metadata.add_argument(
    '--virtual',
    # Yields True (--virtual), False (--no-virtual), or None when omitted.
    action=argparse.BooleanOptionalAction,
    help='set work as virtual',
)
parser_metadata.set_defaults(func=metadata)
2024-01-23 15:54:17 -05:00
# `generate` subcommand: build the static HTML site from collated works.
parser_generate = subparsers.add_parser(
    'generate',
    aliases=['g'],
    help='generate HTML/CSS/JS for library site',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
        The static site will be generated under $DLIBRARY_DIR/site/
        and can be served by pointing an HTTP server at that
        directory. Note that some files inside the static site
        hierarchy will be symlinks into $DLIBRARY_DIR/extract/
        outside the site hierarchy, so make sure your HTTP server
        will allow those symlinks to be read.
    """),
)
parser_generate.set_defaults(func=generate)
2024-01-22 02:16:06 -05:00
2024-01-22 22:06:04 -05:00
def main():
    """CLI entry point: parse arguments, record the debug flag, and
    dispatch to the selected subcommand's handler."""
    global debug_mode
    parsed = argparser.parse_args()
    debug_mode = parsed.debug
    parsed.func(parsed)
2024-01-22 22:06:04 -05:00
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()