import requests
import bs4, codecs
import os
import io

# The target site serves an invalid/self-signed certificate (verify=False below),
# so silence the per-request InsecureRequestWarning spam.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

# Only download images with this extension; set to None to accept any extension.
image_ext = 'jpg'

# Browser-like User-Agent so the site does not reject the crawler.
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}


def getFile(url):
    """Parse a local HTML file (path given in *url*) into a BeautifulSoup tree."""
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and return the parsed document.

    Fix: the default was a shared mutable dict (``headers={}``); a ``None``
    sentinel avoids accidental cross-call state.
    """
    resp = requests.get(url, verify=False, headers=headers or {}, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def urlToFile(url, file_name, referer=None):
    """Download *url* into *file_name*, sending *referer* (hotlink protection).

    Fix: the original wrote ``Referer`` into the module-global header dict,
    leaking it into every subsequent request; use a per-call copy instead.
    """
    headers = dict(request_headers)
    if referer is not None:
        headers['Referer'] = referer
    resp = requests.get(url, verify=False, headers=headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)


def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs* in place."""
    for s in bs(tag):
        s.extract()


def getWolfCom(comicsUrl, baseUrl, baseDir):
    """Crawl one comic's episode list and save every episode's images.

    comicsUrl -- URL of the comic's episode-list page
    baseUrl   -- site root, prepended to relative links
    baseDir   -- local directory; one sub-directory is created per comic
    """
    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    elist = doc.select("div.box > div.group.left-box > div.webtoon-bbs-list.bbs-list > ul > li")
    print(title)
    new_dir = os.path.join(baseDir, title.replace(":", "_"))
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    d = 999  # descending counter, kept for the alternate naming scheme (commented below)
    for e in elist:
        a = e.find('a', "view_open", href=True)
        if not a:  # list rows without an episode link (ads/notices)
            continue
        url = baseUrl + a['href']
        doc = getUrl(url, request_headers)
        title = doc.find("title").text
        imgs = doc.select("section.webtoon-body div.group.image-view img")
        print(title)
        # sub_dir = os.path.join(new_dir, str(d) + '_' + title.replace(":", "_"))
        sub_dir = os.path.join(new_dir, title.replace(":", "_"))
        if not os.path.isdir(sub_dir):
            os.mkdir(sub_dir)
        k = 1
        for img in imgs:
            img_url = img.get('src')
            if not img_url:
                continue
            if image_ext is None or img_url.endswith(image_ext):
                if not img_url.startswith("http"):
                    img_url = baseUrl + img_url
                file_name = "img_%04d.jpg" % k
                # Send the listing page as Referer; the image host checks it.
                urlToFile(img_url, os.path.join(sub_dir, file_name), comicsUrl)
                print(img_url + " -> " + file_name)
                k = k + 1
        d = d - 1


def getMultipleWolfCom(url):
    """Convenience wrapper: crawl *url* with the default site root and output dir."""
    iurl = "https://wfwf164.com"
    bdir = "D:/Temp2/"
    getWolfCom(url, iurl, bdir)


if __name__ == "__main__":
    urls = [
        "https://wfwf164.com/list?toon=585&title=%B9%DD%C1%DF%B7%C2%BC%D2%B3%E0",
        "https://wfwf164.com/list?toon=1114&title=%BF%C1%C5%BE%C0%C7%C0%FC%BC%B3",
        "https://wfwf164.com/list?toon=1387&title=%B3%CA%C5%AC%B0%C9KNUCKLEGIRL",
    ]
    iurl = "https://wfwf164.com"
    bdir = "D:/Temp2/"
    for url in urls:
        getWolfCom(url, iurl, bdir)
    print("END")
2022년 10월 20일 목요일
[python] WolfCom Crawling
[python] ToonKor V2 Crawling All
import bs4, codecs
import requests
import base64
import os
import io
import signal
import sys

# Set by the SIGINT handler and polled by the crawl loops for graceful shutdown.
quit_flag = False


def signal_handler(sig, frame):
    """SIGINT handler: request a graceful stop of the crawl loops.

    Fix: without ``global`` this assigned a *local* quit_flag, so Ctrl+C
    never actually stopped the crawl.
    """
    global quit_flag
    quit_flag = True
    print('You pressed Ctrl+C!', quit_flag)
    # sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)
# print('Press Ctrl+C')
# signal.pause()

# Default output root; get_week_webtoons() rebinds this per weekday.
target_folder = r"D:/Temp6"

# The site serves an invalid certificate (verify=False below); silence warnings.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

# None -> accept every image extension.
image_ext = None

request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}


def safeFileName(filename):
    """Replace characters Windows forbids in file names with '_', then strip."""
    for ch in ':?/*<>\t':
        filename = filename.replace(ch, "_")
    return filename.strip()


def getFile(url):
    """Parse a local HTML file (path given in *url*) into a BeautifulSoup tree."""
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and return the parsed document."""
    resp = requests.get(url, verify=False, headers=headers or {}, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def getUrlHtml(url, headers=None, params=()):
    """GET *url* and return (parsed soup, raw decoded HTML text)."""
    resp = requests.get(url, verify=False, headers=headers or {}, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser'), resp.content.decode('utf8')


def urlToFile(url, file_name):
    """Download *url* to *file_name* using the crawler's default headers."""
    resp = requests.get(url, verify=False, headers=request_headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)


def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs* in place."""
    for s in bs(tag):
        s.extract()


def getToonKor(comicsUrl, baseUrl, baseDir):
    """Crawl one ToonKor comic: write one HTML file of <img> links per episode.

    The episode page hides the image list base64-encoded in a JS variable
    (``var tnimg = '...'``); decode it and emit an HTML file that a browser
    can render (with referrer suppressed).  Skips comics/episodes already
    saved, so the crawl is resumable.
    """
    while True:
        try:
            doc = getUrl(comicsUrl)
            table = doc.select("table.bt_view2")[0]
            elist = table.select("td.bt_title")
            title = elist[0].text
            break
        except Exception:  # narrowed from bare except so KeyboardInterrupt escapes
            print(comicsUrl, "-> retry")
            if quit_flag:
                return
            continue
    table = doc.select("table.web_list")[0]
    elist = table.select("td.content__title")
    new_dir = os.path.join(baseDir, safeFileName(title))
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    else:
        return  # comic directory already exists -> assume already crawled
    count = 0
    for e in elist:
        count += 1
        url = baseUrl + e['data-role']
        title = e['alt']
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                begin = html_img.index("var tnimg = '")
                break
            except Exception:
                print(url, "-> retry")
                if quit_flag:
                    return
                continue
        end = html_img.index("';", begin)
        data = html_img[begin + 13: end]  # 13 == len("var tnimg = '")
        img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
        doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
        imgs = doc_imgs.select("img")
        # sub_dir = os.path.join(new_dir, title.replace(":","_"))
        # if not os.path.isdir(sub_dir): os.mkdir(sub_dir)
        html_file = os.path.join(new_dir, safeFileName(title) + ".html")
        if os.path.isfile(html_file):
            print(html_file, "-> exists")
            continue
        print(len(elist), count, html_file)
        # 'with' guarantees the file is closed even if a write fails.
        with open(html_file, "w") as f:
            f.write('<meta name="referrer" content="no-referrer" /><br>\n')
            for img in imgs:
                img_url = img.get('src')
                if not img_url:
                    continue
                if image_ext is None or img_url.endswith(image_ext):
                    if not img_url.startswith("http"):
                        img_url = baseUrl + img_url
                    f.write('<img src="' + img_url + '" /><br>\n')


def saveToonKorComics():
    """Crawl a fixed list of comic URLs into D:/Temp2/."""
    urls = [
        "https://tkr035.com/webtoon/1061",
    ]
    iurl = "https://tkr035.com"
    bdir = "D:/Temp2/"
    for url in urls:
        getToonKor(url, iurl, bdir)
        if quit_flag:
            break
    print("END")


def getToonKorList(list_url, start=0):
    """Crawl every comic on a listing page.

    start -- skip the first *start*-1 entries (used to resume an aborted run).
    """
    doc = getUrl(list_url)
    lists = doc.select("div.section-item-inner")
    # print(lists)
    i = 0
    for l in lists:
        i += 1
        if i < start:
            continue
        comics = l.select("a")[0]
        print(i, len(lists), comics['alt'], comics['href'])
        getToonKor(comics['href'], "https://tkr035.com", target_folder)
        if quit_flag:
            break


def get_finished_webtoons():
    """Crawl all pages of the 'completed' listing (start offsets resume old runs)."""
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0", 36)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=2", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=3", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=4", 130)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=5", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=6", 195)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=7", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=8", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=9", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=10", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=11", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=12", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=13", 0)


def get_continue_webtoons():
    """Crawl the 'latest / ongoing' listing."""
    getToonKorList("https://tkr035.com/wt/%EC%B5%9C%EC%8B%A0/0/all/%EC%9D%B8%EA%B8%B0//%EC%A0%84%EC%B2%B4", 0)


def get_week_webtoons(week, page, start=0):
    """Crawl one page of one weekday's 'ongoing' listing into D:/Temp7/<week>."""
    global target_folder
    target_folder = r"D:/Temp7/" + str(week)
    url = "https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/"
    url += str(week)
    url += "/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page="
    url += str(page)
    getToonKorList(url, start)


def get_week_webtoons_all():
    """Crawl the first pages of every weekday (week 1 already done, commented)."""
    # for i in range(3): get_week_webtoons(1, i+1)
    # get_week_webtoons(1, 2, 150)
    # get_week_webtoons(1, 3)
    for i in range(3):
        get_week_webtoons(2, i + 1)
    for i in range(3):
        get_week_webtoons(3, i + 1)
    for i in range(3):
        get_week_webtoons(4, i + 1)
    for i in range(3):
        get_week_webtoons(5, i + 1)
    for i in range(3):
        get_week_webtoons(6, i + 1)
    for i in range(3):
        get_week_webtoons(7, i + 1)
    for i in range(1):
        get_week_webtoons(8, i + 1)
    # Dead code kept for reference (manual week-1 crawl):
    # global target_folder
    # target_folder = r"D:/Temp7/1"
    # getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4")
    # getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page=2")
    # getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page=3")


if __name__ == "__main__":
    # get_continue_webtoons()
    get_week_webtoons_all()
[python] ToonKor V2 Crawling
import bs4, codecs
import requests
import base64
import os
import io
import sys

# The site serves an invalid certificate (verify=False below); silence warnings.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

# None -> accept every image extension.
image_ext = None

request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}


def safeFileName(filename):
    """Replace characters Windows forbids in file names with '_', then strip."""
    return filename.replace(":", "_").replace("?", "_").replace("<", "_").replace(">", "_").strip()


def getFile(url):
    """Parse a local HTML file (path given in *url*) into a BeautifulSoup tree."""
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and return the parsed document."""
    resp = requests.get(url, verify=False, headers=headers or {}, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def getUrlHtml(url, headers=None, params=()):
    """GET *url* and return (parsed soup, raw decoded HTML text)."""
    resp = requests.get(url, verify=False, headers=headers or {}, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser'), resp.content.decode('utf8')


def urlToFile(url, file_name):
    """Download *url* to *file_name* using the crawler's default headers."""
    resp = requests.get(url, verify=False, headers=request_headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)


def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs* in place."""
    for s in bs(tag):
        s.extract()


def getToonKor(comicsUrl, baseUrl, baseDir):
    """Crawl one ToonKor comic and write one HTML file of <img> links per episode.

    The episode page hides its image list base64-encoded in a JS variable
    (``var tnimg = '...'``); decode it and emit a browser-viewable HTML file
    with the referrer suppressed.
    """
    while True:
        try:
            doc = getUrl(comicsUrl)
            table = doc.select("table.bt_view2")[0]
            elist = table.select("td.bt_title")
            title = elist[0].text
            break
        except Exception:  # narrowed from bare except so Ctrl+C can abort the retry loop
            print(comicsUrl, "-> retry")
            continue
    table = doc.select("table.web_list")[0]
    elist = table.select("td.content__title")
    new_dir = os.path.join(baseDir, safeFileName(title))
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    for e in elist:
        url = baseUrl + e['data-role']
        title = e['alt']
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                begin = html_img.index("var tnimg = '")
                break
            except Exception:
                print(url, "-> retry")
                continue
        end = html_img.index("';", begin)
        data = html_img[begin + 13: end]  # 13 == len("var tnimg = '")
        img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
        doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
        imgs = doc_imgs.select("img")
        # sub_dir = os.path.join(new_dir, title.replace(":","_"))
        # if not os.path.isdir(sub_dir): os.mkdir(sub_dir)
        html_file = os.path.join(new_dir, safeFileName(title) + ".html")
        print(html_file)
        # 'with' guarantees the file is closed even if a write fails.
        with open(html_file, "w") as f:
            f.write('<meta name="referrer" content="no-referrer" /><br>\n')
            for img in imgs:
                img_url = img.get('src')
                if not img_url:
                    continue
                if image_ext is None or img_url.endswith(image_ext):
                    if not img_url.startswith("http"):
                        img_url = baseUrl + img_url
                    # file_name = "img_%04d.jpg" % k
                    # urlToFile(img_url, os.path.join(sub_dir, file_name))
                    print(img_url)
                    f.write('<img src="' + img_url + '" /><br>\n')


if __name__ == "__main__":
    if len(sys.argv) > 1:
        urls = sys.argv[1:]
    else:
        urls = [
            # "https://tkr035.com/webtoon/2939"  # comic: 사내맞선 (A Business Proposal)
            # "https://tkr035.com/webtoon/826"   # comic: 황제의 외동딸 (Daughter of the Emperor)
            # "https://tkr035.com/webtoon/2794"  # comic: 그만바의 자취방
            # "https://tkr035.com/webtoon/2647"  # comic: 첩 (Concubine)
            "https://tkr035.com/webtoon/6117"    # comic: 외모지상주의 (Lookism)
        ]
    iurl = "https://tkr035.com"
    bdir = "D:/Temp2/"
    for url in urls:
        getToonKor(url, iurl, bdir)
    print("END")
[python] ToonKor V1 Crawling
import bs4, codecs
import requests
import base64
import os
import io
import sys
import re

# The site serves an invalid certificate (verify=False below); silence warnings.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

# None -> accept every image extension.
image_ext = None

request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}


def safeFileName(filename):
    """Replace characters unsafe in Windows file names (including '.') with '_'."""
    return re.sub("[:?/*<>\t.]", "_", filename).strip()


def getFile(url):
    """Parse a local HTML file (path given in *url*) into a BeautifulSoup tree."""
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and return the parsed document."""
    resp = requests.get(url, verify=False, headers=headers or {}, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def getUrlHtml(url, headers=None, params=()):
    """GET *url* and return (parsed soup, raw decoded HTML text)."""
    resp = requests.get(url, verify=False, headers=headers or {}, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser'), resp.content.decode('utf8')


def urlToFile(url, file_name):
    """Download *url* to *file_name*, retrying until the transfer succeeds."""
    while True:
        try:
            resp = requests.get(url, verify=False, headers=request_headers, params=())
            with open(file_name, "wb") as f:
                f.write(resp.content)
            break
        except Exception:  # narrowed from bare except so Ctrl+C can abort the retry loop
            print('retry -->', file_name)
            continue


def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs* in place."""
    for s in bs(tag):
        s.extract()


def getToonKor(comicsUrl, baseUrl, baseDir):
    """Download every episode's images for one ToonKor comic.

    The episode page hides its image list base64-encoded in a JS variable
    (``var toon_img = '...'``); decode it and download each image into a
    per-episode sub-directory.  Episodes whose directory already exists are
    skipped, so the crawl is resumable.
    """
    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    table = doc.select("table.web_list")[0]
    elist = table.select("td.episode__index")
    # print(doc, title, table, elist)
    new_dir = os.path.join(baseDir, safeFileName(title))
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    for e in elist:
        url = baseUrl + e['data-role']
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                title = bs_img.find("title").text
                begin = html_img.index("var toon_img = '")
                end = html_img.index("';", begin)
                data = html_img[begin + 16: end]  # 16 == len("var toon_img = '")
                img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
                doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
                imgs = doc_imgs.select("img")
                break
            except Exception:
                # Fix: was a silent bare 'except: pass' -- failures spun forever
                # with no output and Ctrl+C could not break out.
                print(url, "-> retry")
                continue
        sub_dir = os.path.join(new_dir, safeFileName(title))
        if not os.path.isdir(sub_dir):
            os.mkdir(sub_dir)
        else:
            print('skip -->', sub_dir)
            continue
        print(sub_dir)
        k = 1
        for img in imgs:
            img_url = img.get('src')
            # print(img_url)
            if not img_url:
                continue
            if image_ext is None or img_url.endswith(image_ext):
                if not img_url.startswith("http"):
                    img_url = baseUrl + img_url
                # Keep the server's extension when the URL has one.
                ext = img_url.rfind(".")
                if ext >= 0:
                    file_name = ("img_%04d" % k) + img_url[ext:]
                else:
                    file_name = "img_%04d.jpg" % k
                urlToFile(img_url, os.path.join(sub_dir, file_name))
                print(img_url + " -> " + file_name)
                k = k + 1


if __name__ == "__main__":
    # https://tkor.fish/%EC%9B%B9%ED%88%B0
    url = "https://toonkor103.com/%EC%96%B4%EA%B2%8C%EC%9D%B8-%EB%A7%88%EC%9D%B4-%EB%9D%BC%EC%9D%B4%ED%94%84"
    baseUrl = "https://toonkor103.com"
    outDir = "D:/Temp2/"
    if len(sys.argv) > 1:
        url = sys.argv[1]
        baseUrl = url[:url.find('/', 8)]  # scheme + host: first '/' after "https://"
    if len(sys.argv) > 2:
        outDir = sys.argv[2]
    getToonKor(url, baseUrl, outDir)
[python] Directory / Zip to Resize
from fpdf import FPDF
from PIL import Image
import zipfile
import sys
import os
import io

# A4 page size in points; the *_h limits bound image/page heights
# (constants shared with the companion directory/zip-to-PDF tool).
a4_w = 595.28
a4_h = 841.89
max_img_h = a4_h * 4
max_page_h = a4_h * 12


def key_str_num(path):
    """Natural-sort key: zero-pad digit runs so 'img_2' sorts before 'img_10'."""
    import re
    parts = [x for x in re.split(r'(\d+)', path) if len(x) > 0]
    key = []
    for s in parts:
        try:
            # Fix: '+=' with a string extended the list character-by-character;
            # append the whole padded chunk instead.
            key.append("%04d" % int(s))
        except ValueError:
            key.append(s)
    return key


def dir2dir(src_dir, dest_dir, width=None):
    """Recursively re-save images from *src_dir* into *dest_dir*.

    width -- if set, images wider than this are downscaled (aspect preserved).
    Non-image files are collected in *error* and reported at the end.
    """
    def get_listdir(folder):
        # Recursive listing of all file paths below *folder*.
        file_list = []
        for file in os.listdir(folder):
            path = os.path.join(folder, file)
            if os.path.isdir(path):
                file_list += get_listdir(path)
            else:
                file_list.append(path)
        return file_list

    src_dir = os.path.normpath(src_dir)
    dest_dir = os.path.normpath(dest_dir)
    if not os.path.exists(dest_dir):
        os.mkdir(dest_dir)
    error = []
    namelist = sorted(get_listdir(src_dir), key=key_str_num)
    for name in namelist:
        if os.path.isdir(name):
            continue
        try:
            image = Image.open(name)
            if width and image.width > width:
                # LANCZOS is the modern name for ANTIALIAS (removed in Pillow 10).
                image = image.resize((width, int(width * image.height / image.width)),
                                     Image.LANCZOS)
            filename = os.path.join(dest_dir, name[len(src_dir) + 1:])
            dirname = os.path.dirname(filename)
            if not os.path.exists(dirname):
                # Fix: os.mkdir failed for sub-folders nested more than one level deep.
                os.makedirs(dirname)
            image.save(filename, quality=70)
            print(filename)
        except Exception as e:
            error.append((name, e))
    for e in error:
        print(e)


def zip2zip(src, dest, width=None):
    """Re-compress images from zip *src* into zip *dest*, optionally downscaled."""
    def is_dir(filename):
        return filename.endswith('/') or filename.endswith('\\')

    error = []
    in_zip = zipfile.ZipFile(src)
    out_zip = zipfile.ZipFile(dest, mode='w')
    try:
        namelist = sorted(in_zip.namelist(), key=key_str_num)
        for name in namelist:
            if is_dir(name):
                continue
            image_bytes = in_zip.read(name)
            try:
                image = Image.open(io.BytesIO(image_bytes))
                if width and image.width > width:
                    image = image.resize((width, int(width * image.height / image.width)),
                                         Image.LANCZOS)
                image_bytes = io.BytesIO()
                image.save(image_bytes, format='jpeg', quality=70)
                # image_bytes.seek(0) + image_bytes.read() -> getvalue() or getbuffer().tobytes()
                # Zip names are CP437 when built by old Windows tools; re-decode
                # as EUC-KR to recover Korean file names.
                name = name.encode('cp437').decode('euc-kr', 'ignore')
                out_zip.writestr(name, image_bytes.getvalue(),
                                 compress_type=zipfile.ZIP_DEFLATED, compresslevel=9)
                print(name)
            except Exception as e:
                error.append((name, e))
    finally:
        # Fix: both archives were previously left open on error (in_zip always).
        out_zip.close()
        in_zip.close()
    for e in error:
        print(e)


def img2img(src, width=None):
    """Dispatch *src* (directory or zip file) to the matching resize routine."""
    if os.path.isdir(src):
        dir2dir(src, src + ".zip", width)
    else:
        if zipfile.is_zipfile(src):
            zip2zip(src, src + ".zip", width)
        else:
            print(src, 'is neither directory nor zipfile')


def show_gui():
    """Drag-and-drop GUI front end (wxez): drop files, press Make to convert."""
    def file_drop(files):
        global file_list
        file_list = []
        for file in files:
            table.Add((os.path.basename(file), "Ready"))
            file_list.append(file)

    def make_handler(ev):
        for file in file_list:
            img2img(file)
        print(file_list)

    def get_panel(parent):
        global table
        panel = w.VBox(parent)
        table = w.Table(panel, (('File', 500, -1), ('Status', 70, 0)), drop=file_drop)
        button = w.Button(panel, "Make", make_handler)
        panel.Add(table, expand=True, fill=True)
        panel.Add(button, expand=False, fill=False, right=True)
        return panel

    import wxez as w
    win = w.WxWin("Image to PDF", 600, 400)
    panel = get_panel(win)
    win.Add(panel, expand=True, fill=True)
    win.Run()


if __name__ == "__main__":
    # Fix: the original tested 'len(sys.argv) < 3' first, which also covered
    # the two-argument case, so 'img2img(sys.argv[1])' was unreachable.
    if len(sys.argv) < 2:
        show_gui()
    elif len(sys.argv) == 2:
        img2img(sys.argv[1])
    elif len(sys.argv) == 3:
        img2img(sys.argv[1], int(sys.argv[2]))
[python] Directory / Zip to PDF
from fpdf import FPDF
from PIL import Image
import zipfile
import sys
import os
import io

# A4 page size in points; long strip images are split at max_img_h and pages
# are capped at max_page_h so PDF viewers cope with webtoon-length strips.
a4_w = 595.28
a4_h = 841.89
max_img_h = a4_h * 4
max_page_h = a4_h * 12


class Pdf(FPDF):
    """Thin FPDF wrapper: pt units, zero margins, outline/text helpers."""

    def __init__(self, orientation='P', unit='pt', pagesize='A4'):
        # A4 is (595.28, 841.89) pt.
        super(Pdf, self).__init__(orientation=orientation, unit=unit, format=pagesize)
        self.set_margins(0, 0, 0)
        self.set_default_font()

    def set_info(self, title=None, subject=None, author=None, creator=None):
        """Set whichever PDF metadata fields are provided."""
        if title:
            self.set_title(title)
        if subject:
            self.set_subject(subject)
        if author:
            self.set_author(author)
        if creator:
            self.set_creator(creator)

    def set_default_font(self):
        self.set_font("Courier", "", 12)

    def set_hangul_font(self):
        """Switch to an embedded Korean-capable font (malgun.ttf must exist)."""
        self.add_font('malgun', '', 'malgun.ttf', uni=True)
        self.set_font('malgun', '', 12)

    def header(self, title=None, logo=None):
        pass  # no page header

    def footer(self):
        pass  # no page footer

    def page_wdith(self):  # (sic) name kept for existing callers
        return self.w

    def page_height(self):
        return self.h

    def save(self, filename):
        """Write the document to *filename*."""
        self.output(name=filename, dest='F')

    def save_tostring(self):
        """Return the document as a string/bytes instead of writing a file."""
        return self.output(dest='S')

    def add_outline(self, text):
        """Add a bookmark (outline entry) at the current position."""
        self.start_section(text)

    def print_text(self, text, align='L', style="", fontsize=12):
        self.cell(self.get_string_width(text), fontsize, txt=text, ln=0,
                  align=align, border=0, fill=False)

    def print_textln(self, text="", style="", fontsize=12):
        # Fix: arguments were passed positionally, binding style -> align and
        # fontsize -> style in print_text; bind by keyword instead.
        self.print_text(text, style=style, fontsize=fontsize)
        self.ln(20)

    def print_multiline_text(self, text, border=0, align="J", fill=False):
        """Wrapped text. border(0,1,L,T,R,B) align(L,C,R,J)."""
        width = self.w - self.l_margin - self.r_margin
        height = 20
        self.multi_cell(width, height, text, border, align, fill)


def key_str_num_2(path):
    """Natural-sort key: zero-pad digit runs (string chunks only, type-safe)."""
    import re
    parts = [x for x in re.split(r'(\d+)', path) if len(x) > 0]
    key = []
    for s in parts:
        try:
            # Fix: '+=' with a string extended the list character-by-character.
            key.append("%04d" % int(s))
        except ValueError:
            key.append(s)
    return key


def key_str_num(strItem):
    """Natural-sort key with int chunks.

    NOTE(review): comparing mixed int/str chunks can raise TypeError when
    chunk boundaries differ between two items; key_str_num_2 is the safe
    variant and is the one actually used below.
    """
    import re
    strList = [x for x in re.split(r'(\d+)', strItem) if len(x) > 0]
    newList = []
    for s in strList:
        try:
            newList.append(int(s))
        except ValueError:
            newList.append(s)
    return newList


def image_split_h(img, max_h):
    """Split a tall image into horizontal slices of at most *max_h* pixels."""
    h = 0
    images = []
    print('image_split_h', img.width, img.height)
    while h < img.height:
        if img.height - h < max_h:
            max_h = img.height - h  # last slice takes the remainder
        print(h, h + max_h)
        images.append(img.crop((0, h, img.width, h + max_h)))
        h += max_h
    return images


def images_info(path, files):
    """Load each image and compute its A4-width-scaled page size.

    Over-tall images are split first; returns a list of dicts with keys
    'width', 'height', 'image', 'filename'.
    """
    info = []
    for file in files:
        file_path = os.path.join(path, file)
        img = Image.open(file_path)
        w = a4_w
        h = (w * img.height / img.width)
        if h > max_img_h:
            imgs = image_split_h(img, max_img_h)
            for im in imgs:
                w = a4_w
                h = (w * im.height / im.width)
                info.append({'width': w, 'height': h, 'image': im, 'filename': file})
        else:
            info.append({'width': w, 'height': h, 'image': img, 'filename': file})
    return info


def page_info(infos):
    """Group image infos into pages whose total height stays under max_page_h."""
    pages = []
    page = []
    h = 0
    for info in infos:
        if h + info['height'] > max_page_h:
            pages.append({'height': h, 'images': page})
            page = []
            h = 0
        page.append(info)
        h += info['height']
    if h > 0:  # flush the final partial page
        pages.append({'height': h, 'images': page})
    return pages


def dir2pdf_dynamic(folder):
    """Build <folder>.pdf with dynamically-sized pages, one outline per sub-dir."""
    error = []
    pdf = Pdf()
    # pdf.add_page(format=(a4_w, a4_h))
    subdirs = os.listdir(folder)
    subdirs.sort()
    for subdir in subdirs:
        path = os.path.join(folder, subdir)
        if not os.path.isdir(path):
            continue
        print(path)
        files = os.listdir(path)
        files.sort()
        infos = images_info(path, files)
        pages = page_info(infos)
        for index, page in enumerate(pages):
            # print(page)
            print(len(page['images']), page['height'])
            pdf.add_page(format=(a4_w, page['height'] + 100))
            if index == 0:
                pdf.add_outline(subdir)  # bookmark at the chapter's first page
            for info in page['images']:
                try:
                    img_byte_arr = io.BytesIO()
                    info['image'].save(img_byte_arr, format='jpeg')
                    pdf.image(img_byte_arr, w=a4_w)
                except Exception as e:
                    error.append((subdir, info['filename'], e))
                    print(e)
    pdf.save(folder + ".pdf")
    for e in error:
        print(e)


def dir2pdf(folder, width=None):
    """Build <folder>.pdf from all images under *folder* (recursive), in
    natural-sort order, with one outline entry per sub-directory."""
    def get_listdir(folder):
        # Recursive listing of all file paths below *folder*.
        file_list = []
        for file in os.listdir(folder):
            path = os.path.join(folder, file)
            if os.path.isdir(path):
                file_list += get_listdir(path)
            else:
                file_list.append(path)
        return file_list

    error = []
    pdf = Pdf()
    pdf.add_page(format=(a4_w, a4_h * 10))
    namelist = sorted(get_listdir(folder), key=key_str_num_2)
    prev_dir = None
    for name in namelist:
        if os.path.isdir(name):
            continue
        print(name)
        curr_dir = os.path.dirname(name)
        try:
            image = Image.open(name)
            if width and image.width > width:
                # LANCZOS is the modern name for ANTIALIAS (removed in Pillow 10).
                image = image.resize((width, int(width * image.height / image.width)),
                                     Image.LANCZOS)
        except Exception:
            continue  # not an image; skip silently
        if image.height > max_img_h:
            images = image_split_h(image, max_img_h)
        else:
            images = [image]
        try:
            if prev_dir != curr_dir:
                pdf.add_outline(os.path.basename(curr_dir))
                prev_dir = curr_dir
            for image in images:
                image_bytes = io.BytesIO()
                image.save(image_bytes, format='jpeg', quality=70)
                pdf.image(image_bytes, w=a4_w)
        except Exception as e:
            error.append((name, e))
    pdf.save(os.path.join(os.path.dirname(folder), os.path.basename(folder) + '.pdf'))
    for e in error:
        print(e)


def zip2pdf(filename, width=None):
    """Build <filename>.pdf from the images inside zip *filename*."""
    def is_dir(name):
        return name.endswith('/') or name.endswith('\\')

    def is_image(name):
        # Fix: the original took 'filedata' but tested the enclosing filename.
        return name.endswith(('.png', '.jpg', '.jpeg'))

    error = []
    pdf = Pdf()
    pdf.add_page(format=(a4_w, a4_h * 10))
    # 'with' ensures the archive handle is closed (it was leaked before).
    with zipfile.ZipFile(filename, mode="r") as zf:
        namelist = sorted(zf.namelist(), key=key_str_num_2)
        # infolist = zf.infolist()  # filename, size(org,cmp), attr, method
        prev_dir = None
        for name in namelist:
            if is_dir(name):
                continue
            if not is_image(name):
                continue
            curr_dir = os.path.dirname(name)
            # Zip names are CP437 when built by old Windows tools; re-decode
            # as EUC-KR to recover Korean directory names.
            curr_dir = curr_dir.encode('cp437').decode('euc-kr', 'ignore')
            image_bytes = zf.read(name)
            try:
                image = Image.open(io.BytesIO(image_bytes))
                if width and image.width > width:
                    image = image.resize((width, int(width * image.height / image.width)),
                                         Image.LANCZOS)
            except Exception as e:
                print(len(image_bytes), e)
                continue
            print(name.encode('cp437').decode('euc-kr', 'ignore'))
            if image.height > max_img_h:
                try:
                    images = image_split_h(image, max_img_h)
                except Exception as e:
                    print(e)
                    continue
            else:
                images = [image]
            try:
                if prev_dir != curr_dir:
                    pdf.add_outline(os.path.basename(curr_dir))
                    prev_dir = curr_dir
                for image in images:
                    image_bytes = io.BytesIO()
                    image.save(image_bytes, format='jpeg', quality=70)
                    pdf.image(image_bytes, w=a4_w)
            except Exception as e:
                error.append((name, e))
    pdf.save(filename + '.pdf')
    for e in error:
        print(e)


def file2pdf(filename, width=None):
    """Dispatch *filename* (directory or zip) to the matching PDF builder."""
    if os.path.isdir(filename):
        dir2pdf(filename, width)
    else:
        if zipfile.is_zipfile(filename):
            zip2pdf(filename, width)
        else:
            print(filename, 'is neither directory nor zipfile')


def show_gui():
    """Drag-and-drop GUI front end (wxez); conversion runs on a worker thread."""
    def file_drop(files):
        global file_list
        file_list = []
        table.DeleteAllItems()
        for file in files:
            table.Add((os.path.basename(file), "Ready"))
            file_list.append(file)

    def make_handler(ev):
        import threading

        def thread_handler():
            for index, file in enumerate(file_list):
                # Fix: bind loop variables as defaults -- the original lambdas
                # captured 'index' late, so every deferred update used the
                # final loop value.
                w.RunLater(lambda index=index: table.SetItem(index, 1, "Proc"))
                file2pdf(file)
                w.RunLater(lambda index=index: table.SetItem(index, 1, "OK"))

        thread = threading.Thread(target=thread_handler, args=())
        thread.daemon = False
        thread.start()

    def get_panel(parent):
        global table
        panel = w.VBox(parent)
        table = w.Table(panel, (('File', 500, -1), ('Status', 70, 0)), drop=file_drop)
        button = w.Button(panel, "Make", make_handler)
        panel.Add(table, expand=True, fill=True)
        panel.Add(button, expand=False, fill=False, right=True)
        return panel

    import wxez as w
    win = w.WxWin("Image to PDF", 600, 400)
    panel = get_panel(win)
    win.Add(panel, expand=True, fill=True)
    win.Run()


if __name__ == "__main__":
    if len(sys.argv) < 2:
        show_gui()
    elif len(sys.argv) == 2:
        file2pdf(sys.argv[1])
    elif len(sys.argv) == 3:
        file2pdf(sys.argv[1], int(sys.argv[2]))
피드 구독하기:
글 (Atom)