import base64
import codecs
import io
import os
import sys

import bs4
import requests

requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning)

# Only keep image URLs ending with this extension; None means keep all.
image_ext = None

# Browser-like headers so the site does not reject the crawler.
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}

# Every character Windows forbids in file names, mapped to '_'.
# (Generalized from the original ':?<>' set so titles containing
# e.g. '*', '"' or '|' no longer produce invalid paths.)
_ILLEGAL_FILENAME_CHARS = str.maketrans({c: "_" for c in '\\/:*?"<>|'})


def safeFileName(filename):
    """Return *filename* with Windows-illegal characters replaced by '_'
    and surrounding whitespace stripped."""
    return filename.translate(_ILLEGAL_FILENAME_CHARS).strip()


def getFile(url):
    """Read a local HTML file (path passed as *url*) and parse it.

    Returns a BeautifulSoup document.
    """
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrl(url, headers=None, params=()):
    """GET *url* (TLS verification disabled) and return parsed soup.

    NOTE: the original used a mutable default ``headers={}``; ``None``
    avoids the shared-default pitfall while keeping the same behavior.
    """
    resp = requests.get(url, verify=False,
                        headers=headers or {}, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def getUrlHtml(url, headers=None, params=()):
    """GET *url*; return a ``(soup, raw_html_text)`` pair.

    The raw text is needed by callers that scrape embedded JavaScript
    variables that BeautifulSoup cannot see.
    """
    resp = requests.get(url, verify=False,
                        headers=headers or {}, params=params)
    html = resp.content.decode('utf8')
    return bs4.BeautifulSoup(html, 'html.parser'), html


def urlToFile(url, file_name):
    """Download *url* (binary, TLS verification disabled) into *file_name*."""
    resp = requests.get(url, verify=False,
                        headers=request_headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)


def extractTag(bs, tag):
    """Remove every *tag* element from the soup *bs* in place."""
    # Plain loop: the original used a list comprehension purely for its
    # side effect, which builds and discards a throwaway list.
    for s in bs(tag):
        s.extract()


def getToonKor(comicsUrl, baseUrl, baseDir):
    """Crawl one ToonKor comic listing.

    For every episode, write ``<title>.html`` (a page of ``<img>`` tags
    with referrer suppressed) into ``baseDir/<comic title>/``.

    comicsUrl -- the comic's listing-page URL
    baseUrl   -- site root, prepended to relative links
    baseDir   -- directory under which the comic's folder is created
    """
    # Retry until the listing page parses; the site is flaky.
    # (Kept the original retry-forever behavior, but no longer a bare
    # except, so Ctrl-C can still interrupt the loop.)
    while True:
        try:
            doc = getUrl(comicsUrl)
            table = doc.select("table.bt_view2")[0]
            elist = table.select("td.bt_title")
            title = elist[0].text
            break
        except Exception:
            print(comicsUrl, "-> retry")
            continue

    table = doc.select("table.web_list")[0]
    elist = table.select("td.content__title")

    new_dir = os.path.join(baseDir, safeFileName(title))
    # makedirs(exist_ok=True): os.mkdir failed when parents were missing
    # and raced with re-runs of the crawler.
    os.makedirs(new_dir, exist_ok=True)

    marker = "var tnimg = '"
    for e in elist:
        url = baseUrl + e['data-role']
        title = e['alt']
        # Retry until the episode page contains the base64 image blob.
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                begin = html_img.index(marker)
                break
            except Exception:
                print(url, "-> retry")
                continue
        end = html_img.index("';", begin)
        # len(marker) replaces the magic "+ 13" that silently depended on
        # the marker string's length.
        data = html_img[begin + len(marker):end]
        # The image list is a base64-encoded HTML fragment.
        img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
        doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
        imgs = doc_imgs.select("img")

        html_file = os.path.join(new_dir, safeFileName(title) + ".html")
        print(html_file)
        # Explicit utf-8: the original relied on the platform encoding and
        # crashed with UnicodeEncodeError on Korean titles under cp949.
        # Context manager also guarantees the file is closed on error.
        with open(html_file, "w", encoding="utf-8") as f:
            f.write('<meta name="referrer" content="no-referrer" /><br>\n')
            for img in imgs:
                img_url = img.get('src')
                if not img_url:
                    continue
                if image_ext is None or img_url.endswith(image_ext):
                    if not img_url.startswith("http"):
                        img_url = baseUrl + img_url
                    print(img_url)
                    f.write('<img src="' + img_url + '" /><br>\n')


if __name__ == "__main__":
    urls = []
    if len(sys.argv) > 1:
        urls.extend(sys.argv[1:])
    else:
        # Default comic when no URLs are given on the command line.
        urls.append(
            "https://tkr035.com/webtoon/6117"  # Lookism (외모지상주의)
        )
    iurl = "https://tkr035.com"
    bdir = "D:/Temp2/"
    for url in urls:
        getToonKor(url, iurl, bdir)
    print("END")
2022년 10월 20일 목요일
[python] ToonKor V2 Crawling
피드 구독하기:
댓글 (Atom)
댓글 없음:
댓글 쓰기