2022년 10월 20일 목요일

[python] WolfCom Crawling

import requests
import bs4, codecs
import os
import io
 
# verify=False is used on every request below; silence the resulting warnings.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
# Only image URLs ending in this extension are downloaded (None would accept all).
image_ext = 'jpg'
# Browser-like User-Agent so the site serves normal pages.
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}
 
def getFile(url):
    """Parse a local HTML file (the *url* argument is a file path) into a BeautifulSoup tree."""
    with codecs.open(url, 'r', encoding='utf8') as handle:
        markup = handle.read()
    return bs4.BeautifulSoup(markup, 'html.parser')
 
def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and parse the body as HTML.

    Fix: the default for *headers* was a shared mutable dict ``{}`` — the
    classic Python pitfall. ``None`` behaves identically for requests
    ("send no extra headers") without the shared-state risk.
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')
 
def urlToFile(url, file_name, referer=None):
    """Download *url* to *file_name*, sending *referer* as the Referer header.

    Fix: the original wrote ``Referer`` into the shared module-level
    ``request_headers`` dict, so one download's referer leaked into every
    subsequent request made with that dict. A per-call copy keeps the
    global headers untouched.
    """
    headers = dict(request_headers)
    headers['Referer'] = referer
    resp = requests.get(url, verify=False, headers=headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)
    
def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs*, mutating it in place.

    Fix: the original used a list comprehension purely for its side
    effects; a plain loop states the intent.
    """
    for element in bs(tag):
        element.extract()
 
def getWolfCom( comicsUrl, baseUrl, baseDir):
    """Crawl one WolfCom comic: download every episode's images under baseDir.

    comicsUrl: the comic's episode-list page; baseUrl: site root used to
    resolve relative links; baseDir: local root that gets one sub-folder
    per comic and one per episode inside it.
    """
    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    elist = doc.select("div.box > div.group.left-box > div.webtoon-bbs-list.bbs-list > ul > li")
    print(title)
    
    # One folder per comic; ':' is not allowed in Windows path components.
    new_dir = os.path.join(baseDir, title.replace(":","_"))
    if not os.path.isdir(new_dir): os.mkdir(new_dir)
 
    # d counts down from 999; it was used to prefix episode folders (see the
    # commented-out sub_dir line) so newer episodes sorted first.
    d = 999
    for e in elist:
        a = e.find('a',"view_open",href=True)
        if not a: continue
        url = baseUrl + a['href']
 
        doc = getUrl(url, request_headers)
        title = doc.find("title").text
        imgs = doc.select("section.webtoon-body div.group.image-view img")
        print(title)
        
        #sub_dir = os.path.join(new_dir, str(d) + '_' + title.replace(":","_"))
        sub_dir = os.path.join(new_dir, title.replace(":","_"))
        if not os.path.isdir(sub_dir): os.mkdir(sub_dir)
 
        k = 1;
        for img in imgs:
            img_url = img.get('src')
            if not img_url: continue
            # image_ext is 'jpg' at module level, so only .jpg URLs pass.
            if image_ext == None or img_url.endswith(image_ext):
                if( not img_url.startswith("http") ):
                    img_url = baseUrl + img_url
                file_name = "img_%04d.jpg" % k
                # comicsUrl is sent as Referer — presumably the image host
                # checks it; confirm before removing.
                urlToFile( img_url, os.path.join( sub_dir, file_name), comicsUrl)
                print( img_url + " -> " + file_name )
                k = k + 1
        d = d - 1

def getMultipleWolfCom(url):
    """Crawl a single WolfCom comic list page into the default output folder."""
    site_root = "https://wfwf164.com"
    output_dir = "D:/Temp2/"
    getWolfCom(url, site_root, output_dir)
        
if __name__ == "__main__":
    # Comic list pages to crawl; the query strings are percent-encoded
    # (EUC-KR, judging by the %Bx/%Cx bytes — confirm) comic titles.
    urls = [
        "https://wfwf164.com/list?toon=585&title=%B9%DD%C1%DF%B7%C2%BC%D2%B3%E0",
        "https://wfwf164.com/list?toon=1114&title=%BF%C1%C5%BE%C0%C7%C0%FC%BC%B3",
        "https://wfwf164.com/list?toon=1387&title=%B3%CA%C5%AC%B0%C9KNUCKLEGIRL",
        ] 
    iurl = "https://wfwf164.com"
    bdir = "D:/Temp2/"        
    for url in urls:
        getWolfCom(url, iurl, bdir )
    print("END")

[python] ToonKor V2 Crawling All

import bs4, codecs
import requests
import base64
import os
import io

# Global stop flag polled by the crawl loops; meant to be set by the SIGINT handler below.
quit_flag = False

import signal
import sys

def signal_handler(sig, frame):
    """SIGINT handler: raise the module-level quit_flag so crawl loops stop.

    Bug fix: the original assigned ``quit_flag = True`` without ``global``,
    creating a throwaway local — the module flag never changed, so Ctrl+C
    never actually stopped the loops that poll it.
    """
    global quit_flag
    quit_flag = True
    print('You pressed Ctrl+C!', quit_flag)

# Install the Ctrl+C handler; the process keeps running so loops can notice the flag.
signal.signal(signal.SIGINT, signal_handler)
#print('Press Ctrl+C')
#signal.pause()


# Root output folder for this crawler (overridden by get_week_webtoons).
target_folder = r"D:/Temp6"

# verify=False is used on every request below; silence the resulting warnings.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
# None means: accept every image URL regardless of extension.
image_ext = None
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}

def safeFileName(filename):
    """Return *filename* with characters Windows forbids in path components
    (``: ? / * < >`` and tabs) replaced by ``_``, outer whitespace stripped.

    str.translate performs all replacements in a single pass instead of
    seven chained str.replace calls; behavior is unchanged.
    """
    return filename.translate(str.maketrans({ch: "_" for ch in ":?/*<>\t"})).strip()
    
def getFile(url):
    """Load a local HTML file (the *url* argument is a file path) into BeautifulSoup."""
    with codecs.open(url,'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')
 
def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and parse the body as HTML.

    Fix: the default for *headers* was a shared mutable dict ``{}`` — the
    classic Python pitfall. ``None`` behaves identically for requests.
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')
 
def getUrlHtml(url, headers=None, params=()):
    """GET *url* and return ``(soup, utf8_decoded_body)``.

    Fix: replaces the mutable-default ``{}`` for *headers* with ``None``
    (identical behavior with requests, no shared-state pitfall).
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser'), resp.content.decode('utf8')
 
def urlToFile(url, file_name):
    """Download *url* (TLS verification disabled) and write the raw bytes to *file_name*."""
    response = requests.get(url, verify=False, headers=request_headers, params=())
    with open(file_name, "wb") as out:
        out.write(response.content)
 
def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs*, mutating it in place.

    Fix: the original used a list comprehension purely for its side
    effects; a plain loop states the intent.
    """
    for element in bs(tag):
        element.extract()
 
def getToonKor( comicsUrl, baseUrl, baseDir):
    """Crawl one ToonKor comic: save each episode as a local .html page of <img> tags.

    comicsUrl: comic detail page; baseUrl: site root for relative links;
    baseDir: output root (one folder per comic). Returns immediately if the
    comic's folder already exists; episodes whose .html already exists are
    skipped, so interrupted runs resume where they stopped.
    """
    # Retry until the comic page parses; the site is flaky.
    # NOTE(review): the bare `except` also swallows KeyboardInterrupt, and
    # quit_flag can never become True because signal_handler assigns a
    # local instead of the global — confirm before relying on Ctrl+C here.
    while True:
        try:
            doc = getUrl(comicsUrl)
            table = doc.select("table.bt_view2")[0]
            elist = table.select("td.bt_title")
            title = elist[0].text
            break
        except:
            print( comicsUrl, "-> retry")
            if quit_flag: return
            continue
 
    # Episode list: each cell carries the episode URL in its data-role attribute.
    table = doc.select("table.web_list")[0]
    elist = table.select("td.content__title")
 
    new_dir = os.path.join(baseDir, safeFileName(title))

    # Existing folder is treated as "already crawled" — skip the whole comic.
    if not os.path.isdir(new_dir): os.mkdir(new_dir)
    else: return
    
    count = 0
    for e in elist:
        count += 1
        url = baseUrl + e['data-role']
        title = e['alt']
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                # The episode page embeds its image list base64-encoded in a
                # JS assignment: var tnimg = '<base64>';
                begin = html_img.index("var tnimg = '")
                break
            except:
                print( url, "-> retry")
                if quit_flag: return
                continue
        end = html_img.index("';",begin)
        data = html_img[begin + 13: end]  # 13 == len("var tnimg = '")
        img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
        doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
        imgs = doc_imgs.select("img")
        
        #sub_dir = os.path.join(new_dir, title.replace(":","_"))
        #if not os.path.isdir(sub_dir): os.mkdir(sub_dir)
 
        html_file = os.path.join(new_dir, safeFileName(title) + ".html")
        
        if os.path.isfile(html_file): print(html_file, "-> exists"); continue
        print( len(elist), count, html_file)
        
        # The meta tag stops browsers from sending a Referer when the saved
        # page is opened — presumably so the image host serves the files;
        # TODO confirm.
        # NOTE(review): `f` leaks if a write raises; consider `with open(...)`.
        f = open( html_file, "w" )
        f.write('<meta name="referrer" content="no-referrer" /><br>\n')
        k = 1;
        for img in imgs:
            img_url = img.get('src')
            if not img_url: continue
            # image_ext is None in this script, so every image URL passes.
            if image_ext == None or img_url.endswith(image_ext):
                if( not img_url.startswith("http") ):
                    img_url = baseUrl + img_url
                #file_name = "img_%04d.jpg" % k
                #urlToFile( img_url, os.path.join( sub_dir, file_name) )
                #print( img_url + " -> " + file_name )
                #print( img_url  )
                f.write('<img src="' + img_url + '" /><br>\n')
                k = k + 1
        f.close()

def saveToonKorComics():
    """Crawl a fixed list of ToonKor comics into D:/Temp2/, honoring quit_flag."""
    targets = [
        "https://tkr035.com/webtoon/1061",
        ]
    site_root = "https://tkr035.com"
    out_dir = "D:/Temp2/"
    for target in targets:
        getToonKor(target, site_root, out_dir)
        if quit_flag:
            break
    print("END")

def getToonKorList(list_url, start=0):
    """Walk a ToonKor index page and crawl every comic linked from it.

    Entries before position *start* (1-based) are skipped, which lets a
    partially finished run be resumed. Stops early when quit_flag is set.
    """
    doc = getUrl(list_url)
    entries = doc.select("div.section-item-inner")
    total = len(entries)
    for position, entry in enumerate(entries, start=1):
        if position < start:
            continue
        comics = entry.select("a")[0]
        print(position, total, comics['alt'], comics['href'])
        getToonKor(comics['href'], "https://tkr035.com", target_folder)
        if quit_flag:
            break

def get_finished_webtoons():
    """Crawl every page of the '완결' (completed) listing.

    The non-zero second arguments are manual resume offsets from earlier,
    partially finished runs of the corresponding page.
    """
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0", 36)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=2", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=3", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=4", 130)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=5", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=6", 195)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=7", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=8", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=9", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=10", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=11", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=12", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=13", 0)

def get_continue_webtoons():
    """Crawl the '최신' (latest) listing — all weekdays, sorted by popularity ('인기')."""
    getToonKorList("https://tkr035.com/wt/%EC%B5%9C%EC%8B%A0/0/all/%EC%9D%B8%EA%B8%B0//%EC%A0%84%EC%B2%B4", 0)

def get_week_webtoons(week, page, start=0):
    """Crawl one page of the weekday ('연재중') listing into D:/Temp7/<week>.

    week: 1-8 weekday index used by the site; page: listing page number;
    start: resume offset forwarded to getToonKorList.
    """
    global target_folder
    target_folder = r"D:/Temp7/" + str(week)
    listing_url = (
        "https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/"
        + str(week)
        + "/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page="
        + str(page)
    )
    getToonKorList(listing_url, start)
    
def get_week_webtoons_all():
    """Crawl pages 1-3 of each weekday listing (weekday 1 already done, see
    the commented lines; weekday 8 only has one page)."""
    #for i in range(3): get_week_webtoons(1,i+1)
    #get_week_webtoons(1,2,150)
    #get_week_webtoons(1,3)
    for i in range(3): get_week_webtoons(2,i+1)
    for i in range(3): get_week_webtoons(3,i+1)
    for i in range(3): get_week_webtoons(4,i+1)
    for i in range(3): get_week_webtoons(5,i+1)
    for i in range(3): get_week_webtoons(6,i+1)
    for i in range(3): get_week_webtoons(7,i+1)
    for i in range(1): get_week_webtoons(8,i+1)
        
    '''
    global target_folder
    target_folder = r"D:/Temp7/1"
    getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4")
    getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page=2")
    getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page=3")
    '''

if __name__ == "__main__":
    # Entry point: crawl every weekday listing (latest-listing crawl kept as alternative).
    #get_continue_webtoons()
    get_week_webtoons_all()
    
    

[python] ToonKor V2 Crawling

import bs4, codecs
import requests
import base64
import os
import io
import sys
 
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
image_ext = None
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}

def safeFileName(filename):
    """Replace the characters ':', '?', '<', '>' with '_' and trim surrounding whitespace."""
    return filename.translate(str.maketrans(":?<>", "____")).strip()

def getFile(url):
    """Load a local HTML file (the *url* argument is a file path) into BeautifulSoup."""
    with codecs.open(url,'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')
 
def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and parse the body as HTML.

    Fix: the default for *headers* was a shared mutable dict ``{}`` — the
    classic Python pitfall. ``None`` behaves identically for requests.
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')
 
def getUrlHtml(url, headers=None, params=()):
    """GET *url* and return ``(soup, utf8_decoded_body)``.

    Fix: replaces the mutable-default ``{}`` for *headers* with ``None``
    (identical behavior with requests, no shared-state pitfall).
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser'), resp.content.decode('utf8')
 
def urlToFile(url, file_name):
    """Download *url* (TLS verification disabled) and write the body bytes to *file_name*."""
    resp = requests.get(url, verify=False, headers=request_headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)
 
def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs*, mutating it in place.

    Fix: the original used a list comprehension purely for its side
    effects; a plain loop states the intent.
    """
    for element in bs(tag):
        element.extract()
 
def getToonKor( comicsUrl, baseUrl, baseDir):
    """Crawl one ToonKor comic: save each episode as a local .html page of <img> tags.

    comicsUrl: comic detail page; baseUrl: site root for relative links;
    baseDir: output root (one folder per comic, one .html per episode).
    """
    # Retry until the comic page parses; the site is flaky.
    # NOTE(review): bare `except` also swallows KeyboardInterrupt, so a
    # permanently broken URL makes this loop unstoppable with Ctrl+C.
    while True:
        try:
            doc = getUrl(comicsUrl)
            table = doc.select("table.bt_view2")[0]
            elist = table.select("td.bt_title")
            title = elist[0].text
            break
        except:
            print(comicsUrl, "-> retry")
            continue
 
    # Episode list: each cell carries the episode URL in its data-role attribute.
    table = doc.select("table.web_list")[0]
    elist = table.select("td.content__title")
 
    new_dir = os.path.join(baseDir, safeFileName(title))
    if not os.path.isdir(new_dir): os.mkdir(new_dir)
 
    for e in elist:
        url = baseUrl + e['data-role']
        title = e['alt']
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                # The episode page embeds its image list base64-encoded in a
                # JS assignment: var tnimg = '<base64>';
                begin = html_img.index("var tnimg = '")
                break
            except:
                print( url, "-> retry")
                continue
        end = html_img.index("';",begin)
        data = html_img[begin + 13: end]  # 13 == len("var tnimg = '")
        img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
        doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
        imgs = doc_imgs.select("img")
        
        #sub_dir = os.path.join(new_dir, title.replace(":","_"))
        #if not os.path.isdir(sub_dir): os.mkdir(sub_dir)
 
        html_file = os.path.join(new_dir, safeFileName(title) + ".html")
        print(html_file)
        # The meta tag stops browsers from sending a Referer when the saved
        # page is opened later.
        # NOTE(review): `f` leaks if a write raises; consider `with open(...)`.
        f = open( html_file, "w" )
        f.write('<meta name="referrer" content="no-referrer" /><br>\n')
        k = 1;
        for img in imgs:
            img_url = img.get('src')
            if not img_url: continue
            # image_ext is None in this script, so every image URL passes.
            if image_ext == None or img_url.endswith(image_ext):
                if( not img_url.startswith("http") ):
                    img_url = baseUrl + img_url
                #file_name = "img_%04d.jpg" % k
                #urlToFile( img_url, os.path.join( sub_dir, file_name) )
                #print( img_url + " -> " + file_name )
                print( img_url  )
                f.write('<img src="' + img_url + '" /><br>\n')
                k = k + 1
        f.close()
        
if __name__ == "__main__":
    # Comic URLs come from the command line; otherwise the hard-coded default
    # below is used (previous targets kept commented out).
    urls = []
    if len(sys.argv) > 1:
        for i in range(1,len(sys.argv)):
            urls.append(sys.argv[i])
    else:
        urls.append(
            #"https://tkr035.com/webtoon/2939" # A Business Proposal (사내맞선)
            #"https://tkr035.com/webtoon/826" # Daughter of the Emperor (황제의 외동딸)
            #"https://tkr035.com/webtoon/2794" # Gmanba's Rented Room (그만바의 자취방)
            #"https://tkr035.com/webtoon/2647" # Concubine (첩)
            "https://tkr035.com/webtoon/6117" # Lookism (외모지상주의)
        )
    iurl = "https://tkr035.com"
    bdir = "D:/Temp2/"
    for url in urls:
        getToonKor(url, iurl, bdir)
    print("END")

[python] ToonKor V1 Crawling

import bs4, codecs
import requests
import base64
import os
import io
import sys
import re

requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
image_ext = None
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}

def safeFileName(filename):
    """Collapse characters unsafe in file names (plus '.' and tabs) into '_', then trim."""
    unsafe = r"[:?/*<>\t.]"
    return re.sub(unsafe, "_", filename).strip()
    
def getFile(url):
    """Load a local HTML file (the *url* argument is a file path) into BeautifulSoup."""
    with codecs.open(url,'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')

def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and parse the body as HTML.

    Fix: the default for *headers* was a shared mutable dict ``{}`` — the
    classic Python pitfall. ``None`` behaves identically for requests.
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')

def getUrlHtml(url, headers=None, params=()):
    """GET *url* and return ``(soup, utf8_decoded_body)``.

    Fix: replaces the mutable-default ``{}`` for *headers* with ``None``
    (identical behavior with requests, no shared-state pitfall).
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser'), resp.content.decode('utf8')

def urlToFile(url, file_name):
    """Download *url* to *file_name*, retrying indefinitely on any error.

    Fix: the bare ``except:`` also caught KeyboardInterrupt/SystemExit,
    making the retry loop impossible to break with Ctrl+C. ``except
    Exception`` keeps the retry-forever behavior for real errors only.
    """
    while True:
        try:
            resp = requests.get(url, verify=False, headers=request_headers, params=())
            with open(file_name, "wb") as f:
                f.write(resp.content)
            return
        except Exception:
            print('retry -->', file_name)
            
def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs*, mutating it in place.

    Fix: the original used a list comprehension purely for its side
    effects; a plain loop states the intent.
    """
    for element in bs(tag):
        element.extract()

def getToonKor( comicsUrl, baseUrl, baseDir):
    """Crawl one (V1-site) ToonKor comic: download every episode's images
    into per-episode folders under baseDir."""
    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    table = doc.select("table.web_list")[0]
    elist = table.select("td.episode__index")

    #print( doc, title, table, elist )
    
    new_dir = os.path.join(baseDir, safeFileName(title))
    if not os.path.isdir(new_dir): os.mkdir(new_dir)

    for e in elist:
        # Episode URL is stored in the cell's data-role attribute.
        url = baseUrl + e['data-role']
        # NOTE(review): `except: pass` retries forever and silently — a dead
        # link makes this spin with no output, and Ctrl+C is swallowed too.
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                title = bs_img.find("title").text
                # Image list is base64-encoded in: var toon_img = '<base64>';
                begin = html_img.index("var toon_img = '")
                end = html_img.index("';",begin)
                data = html_img[begin + 16: end]  # 16 == len("var toon_img = '")
                img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
                doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
                imgs = doc_imgs.select("img")
                break
            except:
                pass
        sub_dir = os.path.join(new_dir, safeFileName(title))
        # Existing episode folder == already downloaded; skip it.
        if not os.path.isdir(sub_dir): os.mkdir(sub_dir)
        else: print( 'skip -->', sub_dir ); continue
        print(sub_dir)

        k = 1;
        for img in imgs:
            img_url = img.get('src')
            #print(img_url)
            if not img_url: continue
            # image_ext is None in this script, so every image URL passes.
            if image_ext == None or img_url.endswith(image_ext):
                if( not img_url.startswith("http") ):
                    img_url = baseUrl + img_url
                # Keep the URL's own extension when it has one.
                ext = img_url.rfind(".")
                if ext >= 0: file_name = ("img_%04d" % k) + img_url[ext:]
                else: file_name = "img_%04d.jpg" % k
                urlToFile( img_url, os.path.join( sub_dir, file_name) )
                print( img_url + " -> " + file_name )
                k = k + 1

if __name__ == "__main__":
    #https://tkor.fish/%EC%9B%B9%ED%88%B0
    # Defaults below; argv[1] overrides the comic URL, argv[2] the output folder.
    url = "https://toonkor103.com/%EC%96%B4%EA%B2%8C%EC%9D%B8-%EB%A7%88%EC%9D%B4-%EB%9D%BC%EC%9D%B4%ED%94%84"
    baseUrl = "https://toonkor103.com"
    outDir = "D:/Temp2/"
    if len(sys.argv) > 1:
        url = sys.argv[1]    
        # Site root = scheme + host: everything before the first '/' after "https://".
        baseUrl = url[:url.find('/',8)]
    if len(sys.argv) > 2:
        outDir = sys.argv[2]
    getToonKor(url, baseUrl, outDir)
    
        

[python] Directory / Zip to Resize

from fpdf import FPDF
from PIL import Image
import zipfile
import sys
import os
import io

a4_w = 595.28
a4_h = 841.89
max_img_h = a4_h * 4
max_page_h = a4_h * 12

def key_str_num(path):
    """Natural-sort key for *path*: every digit run is zero-padded to four
    digits, so 'img_2' sorts before 'img_10'.

    Fix: the original did ``list += str``, which extended the key list one
    CHARACTER at a time — it only worked by accident. Returning the padded
    string directly gives the identical ordering with explicit intent.
    Digit runs longer than four digits keep their full length.
    """
    import re
    key_parts = []
    for piece in re.split(r'(\d+)', path):
        if not piece:
            continue
        try:
            key_parts.append("%04d" % int(piece))
        except ValueError:
            key_parts.append(piece)
    return "".join(key_parts)

def dir2dir(src_dir, dest_dir, width=None):
    """Re-encode every image found (recursively) under *src_dir* into *dest_dir*.

    Files are processed in natural-sort order, optionally downscaled to
    *width* pixels wide (aspect preserved) and saved at JPEG quality 70.
    Per-file errors are collected and printed at the end instead of aborting.

    Fixes vs. the original: the unused inner ``write_file`` helper (whose
    open mode ``"rw"`` is invalid and would always raise) is removed, and
    ``os.makedirs`` replaces ``os.mkdir`` so destination sub-folders nested
    more than one level deep are created instead of raising.
    """
    def get_listdir(folder):
        # Depth-first recursive listing; returns file paths only.
        found = []
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            if os.path.isdir(path):
                found += get_listdir(path)
            else:
                found.append(path)
        return found

    src_dir = os.path.normpath(src_dir)
    dest_dir = os.path.normpath(dest_dir)
    if not os.path.exists(dest_dir):
        os.mkdir(dest_dir)
    errors = []
    for name in sorted(get_listdir(src_dir), key=key_str_num):
        try:
            image = Image.open(name)
            if width and image.width > width:
                # NOTE: Image.ANTIALIAS is deprecated in recent Pillow (alias of LANCZOS).
                image = image.resize((width, int(width * image.height / image.width)), Image.ANTIALIAS)
            # Mirror the source's relative path under dest_dir.
            target = os.path.join(dest_dir, name[len(src_dir) + 1:])
            os.makedirs(os.path.dirname(target), exist_ok=True)
            image.save(target, quality=70)
            print(target)
        except Exception as e:
            errors.append((name, e))
    for e in errors:
        print(e)
    
def zip2zip(src, dest, width=None):
    """Copy images from zip *src* into new zip *dest*, re-encoded as JPEG
    quality 70 and optionally downscaled to *width* pixels wide.

    Entry names are processed in natural-sort order; names are re-decoded
    from cp437 to EUC-KR (zip files written by Korean Windows tools).
    Per-entry errors are collected and printed at the end.

    Fix: both ZipFile handles are now closed via ``with`` — the original
    never closed *src* at all and leaked *dest* when an exception escaped.
    """
    def is_dir(filename):
        return filename.endswith('/') or filename.endswith('\\')

    errors = []
    with zipfile.ZipFile(src) as in_zip, zipfile.ZipFile(dest, mode='w') as out_zip:
        for name in sorted(in_zip.namelist(), key=key_str_num):
            if is_dir(name):
                continue
            raw = in_zip.read(name)
            try:
                image = Image.open(io.BytesIO(raw))
                if width and image.width > width:
                    image = image.resize((width, int(width * image.height / image.width)), Image.ANTIALIAS)
                buf = io.BytesIO()
                image.save(buf, format='jpeg', quality=70)
                name = name.encode('cp437').decode('euc-kr', 'ignore')
                out_zip.writestr(name, buf.getvalue(), compress_type=zipfile.ZIP_DEFLATED, compresslevel=9)
                print(name)
            except Exception as e:
                errors.append((name, e))
    for e in errors:
        print(e)

def img2img(src, width=None):
    """Dispatch: directory → dir2dir, zip file → zip2zip; anything else is reported.

    Fix: the "neither … nor" message was misspelled ("neigher direcotry").
    """
    if os.path.isdir(src):
        # NOTE(review): dest is src + ".zip" but dir2dir creates a DIRECTORY,
        # so the output folder ends up literally named '<src>.zip' — confirm
        # this is intended.
        dir2dir(src, src + ".zip", width)
    elif zipfile.is_zipfile(src):
        zip2zip(src, src + ".zip", width)
    else:
        print(src, 'is neither a directory nor a zipfile')
            
def show_gui():
    """Drag-and-drop GUI (wxez): drop files/folders, press Make to resize each.

    NOTE(review): unlike the PDF variant's GUI, dropped rows are never
    cleared (no table.DeleteAllItems), conversions run synchronously in the
    button handler (blocking the UI), and the window title still says
    "Image to PDF" although this tool resizes — confirm which is intended.
    """
    def file_drop(files):
        # Called by the table's drop target; remembers paths for make_handler.
        global file_list
        file_list = []
        for file in files:
            table.Add( (os.path.basename(file), "Ready") )
            file_list.append(file)  
    def make_handler(ev):
        # Convert every dropped file in order (on the GUI thread).
        for file in file_list:
            img2img(file)
        print(file_list)
    def get_panel(parent):
        global table
        panel = w.VBox(parent)
        table = w.Table(panel, ( ('File', 500, -1), ('Status', 70, 0) ), drop=file_drop)
        button = w.Button(panel, "Make", make_handler )
        panel.Add( table, expand=True, fill=True )
        panel.Add( button, expand=False, fill=False, right=True )
        return panel
    import wxez as w
    win = w.WxWin("Image to PDF", 600, 400)
    panel = get_panel(win)
    win.Add(panel, expand=True, fill=True)
    win.Run()
    
if __name__ == "__main__":
    # No args: GUI. One arg: convert it. Two args: convert with a max width.
    # BUG FIX: the old condition was `len(sys.argv) < 3`, which opened the
    # GUI even when exactly one path was given and made the `== 2` branch
    # unreachable; the sibling PDF script correctly uses `< 2`.
    if len(sys.argv) < 2:
        show_gui()
    elif len(sys.argv) == 2:
        img2img(sys.argv[1])
    elif len(sys.argv) == 3:
        img2img(sys.argv[1], int(sys.argv[2]))

[python] Directory / Zip to PDF

from fpdf import FPDF
from PIL import Image
import zipfile
import sys
import os
import io

a4_w = 595.28
a4_h = 841.89
max_img_h = a4_h * 4
max_page_h = a4_h * 12

class Pdf(FPDF):
    """Convenience wrapper around fpdf.FPDF: point units, zero margins, and
    small helpers for metadata, fonts, outlines and text output."""
    def __init__(self,orientation='P', unit='pt', pagesize='A4'): #A4 (595.28,841.89)
        super(Pdf,self).__init__(orientation=orientation, unit=unit, format=pagesize)
        self.set_margins(0,0,0)
        self.set_default_font()
    def set_info(self,title=None,subject=None,author=None,creator=None):
        # Only the metadata fields actually provided are set.
        if title: self.set_title(title)
        if subject: self.set_subject(subject)
        if author: self.set_author(author)
        if creator: self.set_creator(creator)
    def set_default_font(self): self.set_font("Courier", "", 12)
    def set_hangul_font(self):
        # Unicode TTF font required to render Korean text (malgun.ttf must be present).
        self.add_font('malgun', '', 'malgun.ttf', uni=True);
        self.set_font('malgun', '', 12);
    # Header/footer are intentionally blank: pages are filled edge-to-edge with images.
    def header(self,title=None,logo=None): pass
    def footer(self): pass
    def page_wdith(self): return self.w
    def page_height(self): return self.h
    def save(self,filename): self.output(name=filename,dest='F') 
    def save_tostring(self): return self.output(dest='S') 
    # start_section creates a PDF bookmark/outline entry.
    def add_outline(self,text): self.start_section(text)
    def print_text(self,text,align='L',style="",fontsize=12):
        self.cell(self.get_string_width(text), 
              fontsize, txt=text, ln=0,
              align=align, border=0, fill=False)
    def print_textln(self,text="",style="",fontsize=12):
        self.print_text(text,style,fontsize); self.ln(20)
    def print_multiline_text(self,text,border=0,align="J",fill=False):
        #border(0,1,L,T,R,B) align(L,C,R,J)
        width = self.w - self.l_margin - self.r_margin
        height = 20
        self.multi_cell(width, height, text, border, align, fill)

def key_str_num_2(path):
    """Natural-sort key for *path*: every digit run is zero-padded to four
    digits, so 'img_2' sorts before 'img_10'.

    Fix: the original did ``list += str``, extending the key one CHARACTER
    at a time — it only worked by accident. Returning the padded string
    directly yields the identical ordering with explicit intent.
    """
    import re
    key_parts = []
    for piece in re.split(r'(\d+)', path):
        if not piece:
            continue
        try:
            key_parts.append("%04d" % int(piece))
        except ValueError:
            key_parts.append(piece)
    return "".join(key_parts)
    
def key_str_num(strItem):
    """Natural-sort key: digit runs compare numerically, text lexicographically.

    Fix: the original returned a mixed list of ints and strs, which raises
    ``TypeError`` whenever two keys align an int against a str (e.g.
    sorting ['1a', 'a1']). Each chunk is now a uniform (kind, number, text)
    tuple, so comparisons are always well-defined; numeric chunks order
    before text chunks at a mismatch.
    """
    import re
    key = []
    for chunk in re.split(r'(\d+)', strItem):
        if not chunk:
            continue
        try:
            key.append((0, int(chunk), ""))
        except ValueError:
            key.append((1, 0, chunk))
    return key
    
def image_split_h(img, max_h):
    """Split *img* horizontally into strips at most *max_h* pixels tall.

    Strips are cropped top-to-bottom; the last strip may be shorter.
    Works with any object exposing width, height and crop((l, t, r, b)).
    """
    print('image_split_h', img.width, img.height)
    strips = []
    top = 0
    while top < img.height:
        step = min(max_h, img.height - top)
        print(top, top + step)
        strips.append(img.crop((0, top, img.width, top + step)))
        top += step
    return strips
   
def images_info(path,files):
    """Open each image in *files* (relative to *path*) and compute its
    A4-width-scaled render size.

    Returns a list of dicts {'width','height','image','filename'}. Images
    whose scaled height exceeds max_img_h are pre-split into strips (see
    image_split_h), producing one dict per strip.
    """
    info = []
    for file in files:
        file_path = os.path.join( path, file )
        img = Image.open(file_path)
        # Scale to full A4 width, preserving aspect ratio.
        w = a4_w
        h = (w * img.height / img.width) 
        if h > max_img_h:
            imgs = image_split_h(img,max_img_h)
            for im in imgs:
                w = a4_w
                h = (w * im.height / im.width) 
                info.append({'width':w, 'height':h, 'image':im, 'filename':file})
        else:
            info.append({'width':w, 'height':h, 'image':img, 'filename':file})
    return info
    
def page_info(infos):
    """Greedily pack image-info dicts into pages no taller than max_page_h.

    Returns a list of {'height': total, 'images': [info, ...]} dicts in the
    original order. A single image taller than max_page_h still gets its
    own page (the height cap is a flush threshold, not a hard limit).
    """
    pages = []
    current = []
    current_height = 0
    for entry in infos:
        if current_height + entry['height'] > max_page_h:
            # Flush the page in progress before placing this entry.
            pages.append({'height': current_height, 'images': current})
            current = []
            current_height = 0
        current.append(entry)
        current_height += entry['height']
    if current_height > 0:
        pages.append({'height': current_height, 'images': current})
    return pages
    
def dir2pdf_dynamic(folder):
    """Build <folder>.pdf with custom-height pages sized to their content.

    Each immediate sub-folder of *folder* becomes one PDF outline entry;
    its images are scaled to A4 width, split when taller than max_img_h,
    and packed into pages up to max_page_h tall (see page_info). Errors
    are collected per image and printed at the end.
    """
    error = []
    pdf = Pdf()
    #pdf.add_page(format=(a4_w,a4_h))
    subdirs = os.listdir(folder)
    subdirs.sort()
    for subdir in subdirs:
        path = os.path.join( folder, subdir )
        if not os.path.isdir(path):
            continue
        print(path)
        files = os.listdir( path )
        files.sort()
        infos = images_info(path, files)
        pages = page_info(infos)

        for index, page in enumerate(pages):
            #print(page)
            print(len(page['images']),page['height'])
            # +100pt of slack — presumably so rounding never overflows the page; confirm.
            pdf.add_page(format=(a4_w,page['height']+100))
            if index == 0: pdf.add_outline(subdir)
            for info in page['images']:
                try: 
                    # Re-encode in memory as JPEG; fpdf accepts file-like objects.
                    img_byte_arr = io.BytesIO()
                    info['image'].save(img_byte_arr, format='jpeg')
                    pdf.image( img_byte_arr, w = a4_w)
                except Exception as e: 
                    error.append( (subdir, info['filename'], e) )
                    print(e)
    pdf.save(folder + ".pdf")
    for e in error: print(e)

def dir2pdf(folder, width=None):
    """Render every image found (recursively) under *folder* into
    <folder>.pdf, written next to the folder.

    Files go in natural-sort order onto tall fixed-size pages (A4 width,
    10×A4 height); each new sub-directory adds a PDF outline entry. Images
    may be downscaled to *width* pixels and are split when taller than
    max_img_h. Per-image errors are collected and printed at the end.
    """
    def get_listdir(folder):
        # Depth-first recursive listing; returns file paths only.
        file_list = []
        files = os.listdir(folder)
        for file in files:
            path = os.path.join( folder, file )
            if os.path.isdir(path):
                file_list += get_listdir(path)
            else:
                file_list.append(path)
        return file_list
    def read_file(path):
        with open(path,"rb") as f:
            return f.read()
            
    error = []
    pdf = Pdf()
    pdf.add_page(format=(a4_w,a4_h*10))
    namelist = get_listdir(folder)
    namelist = sorted(namelist, key=key_str_num_2)
    prev_dir = None
    for name in namelist:
        if os.path.isdir(name): continue
        print(name)
        curr_dir = os.path.dirname(name)
        # Unreadable / non-image files are silently skipped.
        try: 
            image = Image.open(name)
            if width and image.width > width:
                # NOTE: Image.ANTIALIAS is deprecated in recent Pillow (alias of LANCZOS).
                image = image.resize((width,int(width*image.height/image.width)), Image.ANTIALIAS)
        except: continue
        if image.height > max_img_h:
            images =  image_split_h(image,max_img_h)
        else:
            images = [ image ]
        try:
            # New directory == new chapter: add an outline (bookmark) entry.
            if prev_dir != curr_dir:
                pdf.add_outline( os.path.basename(curr_dir) )
                prev_dir = curr_dir
            for image in images:
                image_bytes = io.BytesIO()
                image.save(image_bytes, format='jpeg', quality=70)                
                pdf.image(image_bytes,w = a4_w)
        except Exception as e:
            error.append((name,e))
    pdf.save( os.path.join( os.path.dirname(folder), os.path.basename(folder) + '.pdf') )
    for e in error: print(e)
    
    
def zip2pdf(filename, width=None):
    """Render the images inside zip *filename* into <filename>.pdf.

    Entries go in natural-sort order onto tall fixed pages (A4 width,
    10×A4 height); each new directory inside the zip adds an outline
    entry. Entry names are decoded cp437→EUC-KR (zips written by Korean
    Windows tools). Per-entry errors are collected and printed at the end.
    """
    def is_dir(filename):
        return filename.endswith('/') or filename.endswith('\\')
    # NOTE(review): is_image is unused and tests the outer `filename`
    # instead of its own parameter — dead and broken; the extension check
    # in the loop below does the real filtering.
    def is_image(filedata):
        return filename.endswith('.png') or filename.endswith('.jpg') or filename.endswith('.jpeg')
    error = []
    pdf = Pdf()
    pdf.add_page(format=(a4_w,a4_h*10))
    zf = zipfile.ZipFile(filename,mode="r")
    namelist = zf.namelist() #filename
    namelist = sorted(namelist, key=key_str_num_2)
    #infolist = zf.infolist() #filename,size(org,cmp),attr,method
    prev_dir = None
    for name in namelist:
        if is_dir(name): continue
        if not name.endswith(".jpeg") and not name.endswith('.jpg') and not name.endswith('.png'):
            continue
        curr_dir = os.path.dirname(name)
        curr_dir = curr_dir.encode('cp437').decode('euc-kr','ignore')
        image_bytes = zf.read(name)
        try: 
            image = Image.open(io.BytesIO(image_bytes))
            if width and image.width > width:
                # NOTE: Image.ANTIALIAS is deprecated in recent Pillow (alias of LANCZOS).
                image = image.resize((width,int(width*image.height/image.width)), Image.ANTIALIAS)
        except Exception as e: print(len(image_bytes),e); continue
        print(name.encode('cp437').decode('euc-kr','ignore'))
        if image.height > max_img_h:
            try: images =  image_split_h(image,max_img_h)
            except Exception as e: print(e); continue
        else:
            images = [ image ]
        try:
            # New directory == new chapter: add an outline (bookmark) entry.
            if prev_dir != curr_dir:
                pdf.add_outline( os.path.basename(curr_dir) )
                prev_dir = curr_dir
            for image in images:
                image_bytes = io.BytesIO()
                image.save(image_bytes, format='jpeg', quality=70)
                pdf.image(image_bytes,w = a4_w)
        except Exception as e:
            error.append((name,e))
    pdf.save(filename + '.pdf')
    for e in error: print(e)

def file2pdf(filename, width=None):
    """Dispatch: directory → dir2pdf, zip file → zip2pdf; anything else is reported.

    Fix: the "neither … nor" message was misspelled ("neigher direcotry").
    """
    if os.path.isdir(filename):
        dir2pdf(filename, width)
    elif zipfile.is_zipfile(filename):
        zip2pdf(filename, width)
    else:
        print(filename, 'is neither a directory nor a zipfile')
            
def show_gui():
    """Drag-and-drop GUI (wxez): drop files/folders onto the table, press
    Make to convert each to PDF on a background thread while row statuses
    update via the GUI event loop."""
    def file_drop(files):
        # Called by the table's drop target; remembers paths for make_handler.
        global file_list
        file_list = []
        table.DeleteAllItems()
        for file in files:
            table.Add((os.path.basename(file), "Ready"))
            file_list.append(file)
    def make_handler(ev):
        import threading
        def thread_handler():
            for index, file in enumerate(file_list):
                # Bug fix: bind the loop variable as a default argument.
                # The original bare lambdas captured `index` late, so every
                # deferred status update landed on the LAST row.
                w.RunLater(lambda i=index: table.SetItem(i, 1, "Proc"))
                file2pdf(file)
                w.RunLater(lambda i=index: table.SetItem(i, 1, "OK"))
        thread = threading.Thread(target=thread_handler, args=())
        thread.daemon = False
        thread.start()
    def get_panel(parent):
        global table
        panel = w.VBox(parent)
        table = w.Table(panel, (('File', 500, -1), ('Status', 70, 0)), drop=file_drop)
        button = w.Button(panel, "Make", make_handler)
        panel.Add(table, expand=True, fill=True)
        panel.Add(button, expand=False, fill=False, right=True)
        return panel
    import wxez as w
    win = w.WxWin("Image to PDF", 600, 400)
    panel = get_panel(win)
    win.Add(panel, expand=True, fill=True)
    win.Run()
    
if __name__ == "__main__":
    # No args: GUI. One arg: convert it. Two args: convert with a max image width.
    if len(sys.argv) < 2:
        show_gui()
    elif len(sys.argv) == 2:
        file2pdf(sys.argv[1])
    elif len(sys.argv) == 3:
        file2pdf(sys.argv[1],int(sys.argv[2]))