ToonKor Jython Example
# -*- coding: utf-8 -*-
"""Jython example: download webtoon episode images with jsoup.

The script fetches a comic's table-of-contents page, visits every episode
page listed there and saves the episode images as ``img_NNNN.jpg`` files
under ``<baseDir>/<comic title>/<episode title>/``.

It must be run with Jython (it imports ``java.*`` packages) and expects the
jsoup jar at the path appended to ``sys.path`` below.
"""
import base64
import os
import sys

sys.path.append(r"d:\Lib\jar\jsoup-1.12.1.jar")

# NOTE(review): the star imports below are kept from the original script.
# They presumably rebind some Python builtins to Java classes of the same
# name (java.lang.Exception, java.io.IOError), so the code in this file
# deliberately avoids referring to `Exception` and `IOError`.
from java.io import *
from java.net import *
from java.util import *
from java.lang import *
from java.nio.file import *
from org.jsoup import *
from org.jsoup.nodes import *
from org.jsoup.select import *

# When true, print the URLs / HTML being processed.
debug = True
# Optional URL suffix (e.g. "jpg"); when set, only image URLs ending with it
# are downloaded.  None downloads every image.
filter = None  # "jpg"

# Browser-like User-Agent sent with every image request.
USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
              "AppleWebKit/537.36 (KHTML, like Gecko) "
              "Chrome/72.0.3626.121 Safari/537.36")

# HTTP statuses that urlToFile follows manually via the Location header.
_REDIRECT_CODES = (
    HttpURLConnection.HTTP_MOVED_PERM,  # 301
    HttpURLConnection.HTTP_MOVED_TEMP,  # 302
    HttpURLConnection.HTTP_SEE_OTHER,   # 303
    307,  # Temporary Redirect (no HttpURLConnection constant)
    308,  # Permanent Redirect (no HttpURLConnection constant)
)
# Upper bound on manually followed redirects, to break redirect cycles.
MAX_REDIRECTS = 10

# JavaScript prefix that precedes the base64 image list on a ToonKor
# episode page; the value ends at the next "';".
TOON_IMG_MARKER = "var toon_img = '"


def decodeBase64(data):
    """Decode the base64 string *data*.

    Returns the decoded bytes, or None (after printing the error) when
    *data* is not valid base64.
    """
    try:
        return base64.b64decode(data)
    except (TypeError, ValueError) as e:
        # Python 2 reports bad padding as TypeError, Python 3 as a
        # ValueError subclass.
        print('except', str(e))
        return None


def _openConnection(url, referer):
    """Open a connection to the java.net.URL *url* with the request headers set."""
    hc = url.openConnection()
    hc.setRequestProperty("User-Agent", USER_AGENT)
    if referer is not None:
        hc.setRequestProperty("Referer", referer)
    return hc


def urlToFile(urlStr, fileName, referer):
    """Download *urlStr* into the file *fileName*, replacing an existing file.

    *referer* is sent as the ``Referer`` header on the initial request and
    on every redirect hop.  Redirect responses are followed through their
    ``Location`` header, at most MAX_REDIRECTS times.

    Raises RuntimeError when the server answers with a status that is
    neither 200 nor a redirect, when a redirect has no ``Location`` header,
    or when there are too many redirects.  (The original loop spun forever
    in the first case because the status never changed.)
    """
    hc = _openConnection(URL(urlStr), referer)
    status = hc.getResponseCode()
    redirects = 0
    while status != HttpURLConnection.HTTP_OK:
        if status not in _REDIRECT_CODES:
            raise RuntimeError("HTTP %d for %s" % (status, hc.getURL().toString()))
        redirects += 1
        if redirects > MAX_REDIRECTS:
            raise RuntimeError("too many redirects for %s" % urlStr)
        location = hc.getHeaderField("Location")
        if location is None:
            raise RuntimeError("redirect without Location for %s" % urlStr)
        # Resolve against the current URL so relative Location values work.
        hc = _openConnection(URL(hc.getURL(), location), referer)
        status = hc.getResponseCode()

    stream = hc.getInputStream()
    try:
        Files.copy(stream, Paths.get(fileName), StandardCopyOption.REPLACE_EXISTING)
    finally:
        stream.close()


def getJsoupDocument(url, max_retries=None, delay_ms=1000):
    """Fetch *url* with jsoup and return the parsed document.

    A failed fetch is retried after sleeping *delay_ms* milliseconds.  With
    the default ``max_retries=None`` the fetch is retried indefinitely (the
    original behaviour); otherwise the last error is re-raised once
    *max_retries* retries have failed.
    """
    attempt = 0
    while True:
        try:
            return Jsoup.connect(url).get()
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user abort.
            raise
        except:
            # Bare except kept from the original; presumably needed so that
            # Java exceptions thrown by jsoup are caught as well.
            attempt += 1
            if max_retries is not None and attempt > max_retries:
                raise
            if debug:
                print("retry %d: %s (%s)" % (attempt, url, sys.exc_info()[1]))
            Thread.sleep(delay_ms)


def _ensureDir(parent, title):
    """Return ``parent/<title with ':' replaced by '_'>``, creating it if missing."""
    path = os.path.join(parent, title.replace(":", "_"))
    if not os.path.isdir(path):
        os.makedirs(path)
    return path


def _reportError(tag):
    """Print the exception currently being handled, prefixed by *tag*."""
    print(tag, str(sys.exc_info()[1]))


def _saveImages(imgs, baseUrl, subdir, referer, echoSrc=False):
    """Download every ``<img>`` element in *imgs* into *subdir*.

    Images whose ``src`` does not end with the module-level ``filter`` (when
    it is set) are skipped.  A ``src`` that does not start with "http" is
    prefixed with *baseUrl*.  Files are numbered ``img_0001.jpg``,
    ``img_0002.jpg``, ... counting downloaded images only.  When *echoSrc*
    is true the raw ``src`` of every image is printed first.
    """
    k = 1
    for img in imgs:
        img_url = img.attr("src")
        if echoSrc:
            print(img_url)
        if filter is not None and not img_url.endswith(filter):
            continue
        if not img_url.startswith("http"):
            img_url = baseUrl + img_url
        file_name = "img_%04d.jpg" % k
        k = k + 1
        print(img_url + " -> " + file_name)
        urlToFile(img_url, os.path.join(subdir, file_name), referer)


def getToonKor(comicsUrl, baseUrl, baseDir):
    """Download all episodes of the ToonKor comic at *comicsUrl*.

    The episode list is read from ``td.episode__index`` cells inside
    ``table.web_list``; each cell's ``data-role`` attribute appended to
    *baseUrl* gives the episode URL.  The episode page carries its image
    list as base64-encoded HTML in the ``toon_img`` JavaScript variable.
    Images are stored under ``baseDir/<comic title>/<episode title>/``.

    An episode that fails is reported and skipped; the remaining episodes
    are still processed.
    """
    doc_toc = getJsoupDocument(comicsUrl)
    table = doc_toc.select("table[class=web_list]").first()
    if table is None:
        print("no episode table found at " + comicsUrl)
        return
    episodes = table.select("td[class=episode__index]")

    comic_dir = _ensureDir(baseDir, doc_toc.title())

    for e in episodes:
        try:
            url = baseUrl + e.attr("data-role")
            if debug:
                print(url)
            doc_img = getJsoupDocument(url)
            html_img = doc_img.html()

            begin = html_img.find(TOON_IMG_MARKER)
            end = -1
            if begin >= 0:
                end = html_img.find("';", begin)
            if end < 0:
                # Without this guard a missing marker would slice garbage.
                print("toon_img data not found in " + url)
                continue
            data = html_img[begin + len(TOON_IMG_MARKER):end]

            img_list = decodeBase64(data)
            if img_list is None:
                continue  # decodeBase64 already printed the reason
            imgs = Jsoup.parse(img_list).select("img")

            subdir = _ensureDir(comic_dir, doc_img.title())
            _saveImages(imgs, baseUrl, subdir, comicsUrl, echoSrc=True)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            _reportError('except2')


def getWolfCom(comicsUrl, baseUrl, baseDir):
    """Download all episodes of the WolfCom comic at *comicsUrl*.

    Episode links are the first ``<a href>`` of each list item matched by
    the table-of-contents selector below; the images of an episode are the
    ``<img>`` elements under ``section.webtoon-body > div.group.image-view``.
    Images are stored under ``baseDir/<comic title>/<episode title>/``.

    An episode that fails is reported and skipped (the original silently
    ignored the error).
    """
    doc_toc = getJsoupDocument(comicsUrl)
    if debug:
        print(doc_toc.html())
    episodes = doc_toc.select(
        "div.box > div.group.left-box > div.webtoon-bbs-list.bbs-list > ul > li")
    if debug:
        print(episodes.html())

    comic_dir = _ensureDir(baseDir, doc_toc.title())

    for e in episodes:
        try:
            url = baseUrl + e.select("a").first().attr("href")
            if debug:
                print(url)
            doc_img = getJsoupDocument(url)
            imgs = doc_img.select("section.webtoon-body > div.group.image-view > img")
            print(doc_img.title())

            subdir = _ensureDir(comic_dir, doc_img.title())
            _saveImages(imgs, baseUrl, subdir, comicsUrl)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            _reportError('except2')


if __name__ == "__main__":
    url = "https://tkor.work/%EB%AA%A8%EA%B8%B0%EB%96%BC"
    iurl = "https://tkor.work"
    out_dir = "D:/Temp2/"

    getToonKor(url, iurl, out_dir)
    print("END")
댓글 없음:
댓글 쓰기