Thursday, October 20, 2022
[python] ToonKor V1 Crawling

import bs4, codecs
import requests
import base64
import os
import io
import sys
import re

# Suppress the warning caused by verify=False on every request.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

# Only download images whose URL ends with this extension; None means download everything.
image_ext = None
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}


def safeFileName(filename):
    # Replace characters that are not valid in file names.
    return re.sub("[:?/*<>\t.]", "_", filename).strip()


def getFile(path):
    # Parse a local HTML file.
    with codecs.open(path, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrl(url, headers={}, params=()):
    resp = requests.get(url, verify=False, headers=headers, params=params)
    html = resp.text
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrlHtml(url, headers={}, params=()):
    # Return both the parsed document and the raw HTML text.
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser'), resp.content.decode('utf8')


def urlToFile(url, file_name):
    # Download a URL to a file, retrying until it succeeds.
    while True:
        try:
            resp = requests.get(url, verify=False, headers=request_headers, params=())
            with open(file_name, "wb") as f:
                f.write(resp.content)
            break
        except Exception:
            print('retry -->', file_name)
            continue


def extractTag(bs, tag):
    # Remove all occurrences of a tag from the parsed document.
    [s.extract() for s in bs(tag)]


def getToonKor(comicsUrl, baseUrl, baseDir):
    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    table = doc.select("table.web_list")[0]
    elist = table.select("td.episode__index")

    new_dir = os.path.join(baseDir, safeFileName(title))
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)

    for e in elist:
        url = baseUrl + e['data-role']

        # The episode page embeds its image list as a base64 string inside
        # "var toon_img = '...';"; decode it and parse the <img> tags.
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                title = bs_img.find("title").text
                begin = html_img.index("var toon_img = '")
                end = html_img.index("';", begin)
                data = html_img[begin + len("var toon_img = '"):end]
                img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
                doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
                imgs = doc_imgs.select("img")
                break
            except Exception:
                pass

        sub_dir = os.path.join(new_dir, safeFileName(title))
        if not os.path.isdir(sub_dir):
            os.mkdir(sub_dir)
        else:
            # Episode directory already exists; assume it was downloaded and skip it.
            print('skip -->', sub_dir)
            continue
        print(sub_dir)

        k = 1
        for img in imgs:
            img_url = img.get('src')
            if not img_url:
                continue
            if image_ext is None or img_url.endswith(image_ext):
                if not img_url.startswith("http"):
                    img_url = baseUrl + img_url
                ext = img_url.rfind(".")
                if ext >= 0:
                    file_name = ("img_%04d" % k) + img_url[ext:]
                else:
                    file_name = "img_%04d.jpg" % k
                urlToFile(img_url, os.path.join(sub_dir, file_name))
                print(img_url + " -> " + file_name)
                k = k + 1


if __name__ == "__main__":
    #https://tkor.fish/%EC%9B%B9%ED%88%B0
    url = "https://toonkor103.com/%EC%96%B4%EA%B2%8C%EC%9D%B8-%EB%A7%88%EC%9D%B4-%EB%9D%BC%EC%9D%B4%ED%94%84"
    baseUrl = "https://toonkor103.com"
    outDir = "D:/Temp2/"
    if len(sys.argv) > 1:
        url = sys.argv[1]
        baseUrl = url[:url.find('/', 8)]
    if len(sys.argv) > 2:
        outDir = sys.argv[2]
    getToonKor(url, baseUrl, outDir)
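
To run it, pass the comic's list page URL and an output directory on the command line; without arguments the hard-coded defaults in the __main__ block are used, and the base URL is derived from the page URL. A minimal invocation sketch (the file name toonkor_v1.py is only an assumption, not from the original post):

python toonkor_v1.py "https://toonkor103.com/%EC%96%B4%EA%B2%8C%EC%9D%B8-%EB%A7%88%EC%9D%B4-%EB%9D%BC%EC%9D%B4%ED%94%84" "D:/Temp2/"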