import bs4, codecs
import requests
import base64
import os
import io
import signal
import sys

# Set by the SIGINT handler so the long-running loops can stop cleanly.
quit_flag = False

def signal_handler(sig, frame):
    global quit_flag            # without this, only a local variable would be set
    quit_flag = True
    print('You pressed Ctrl+C!', quit_flag)
    #sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
#print('Press Ctrl+C')
#signal.pause()

target_folder = r"D:/Temp6"

# Pages are fetched with verify=False, so silence the InsecureRequestWarning spam.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

# Optional filter: only keep image URLs ending with this extension (None = keep all).
image_ext = None

request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}

# Replace characters that are not allowed in Windows file names.
def safeFileName(filename):
    for ch in (":", "?", "/", "*", "<", ">", "\t"):
        filename = filename.replace(ch, "_")
    return filename.strip()

# Parse a local HTML file (the argument is a file path despite the name).
def getFile(url):
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')

# GET a page and return it parsed with BeautifulSoup.
def getUrl(url, headers={}, params=()):
    resp = requests.get(url, verify=False, headers=headers, params=params)
    #resp.headers
    #html = resp.content.decode('utf8')
    html = resp.text
    return bs4.BeautifulSoup(html, 'html.parser')

# GET a page and return both the parsed soup and the raw HTML text.
def getUrlHtml(url, headers={}, params=()):
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser'), resp.content.decode('utf8')

# Download a URL to a local file (unused; image downloading is commented out below).
def urlToFile(url, file_name):
    resp = requests.get(url, verify=False, headers=request_headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)

# Remove every <tag> element from a BeautifulSoup document in place.
def extractTag(bs, tag):
    [s.extract() for s in bs(tag)]

# Crawl one series: create a folder named after the series and save one HTML
# file per episode that hot-links the episode's images.
def getToonKor(comicsUrl, baseUrl, baseDir):
    # Fetch the series page, retrying until it loads (or Ctrl+C is pressed).
    while True:
        try:
            doc = getUrl(comicsUrl)
            table = doc.select("table.bt_view2")[0]
            elist = table.select("td.bt_title")
            title = elist[0].text
            break
        except Exception:
            print(comicsUrl, "-> retry")
            if quit_flag: return
            continue

    # Episode list of the series.
    table = doc.select("table.web_list")[0]
    elist = table.select("td.content__title")

    # One folder per series; if it already exists the series is assumed done.
    new_dir = os.path.join(baseDir, safeFileName(title))
    if not os.path.isdir(new_dir):
        os.mkdir(new_dir)
    else:
        return

    count = 0
    for e in elist:
        count += 1
        url = baseUrl + e['data-role']
        title = e['alt']

        # Fetch the episode page, retrying until the embedded image list is found.
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                begin = html_img.index("var tnimg = '")
                break
            except Exception:
                print(url, "-> retry")
                if quit_flag: return
                continue

        # The image list is a base64-encoded HTML fragment assigned to the
        # JavaScript variable tnimg; 13 == len("var tnimg = '").
        end = html_img.index("';", begin)
        data = html_img[begin + 13: end]
        img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
        doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
        imgs = doc_imgs.select("img")

        #sub_dir = os.path.join(new_dir, title.replace(":","_"))
        #if not os.path.isdir(sub_dir): os.mkdir(sub_dir)

        # One HTML file per episode; skip episodes that were already saved.
        html_file = os.path.join(new_dir, safeFileName(title) + ".html")
        if os.path.isfile(html_file):
            print(html_file, "-> exists")
            continue
        print(len(elist), count, html_file)

        # Write an HTML page that hot-links the images with no referrer instead
        # of downloading them; the urlToFile() download path stays commented out.
        with open(html_file, "w") as f:
            f.write('<meta name="referrer" content="no-referrer" /><br>\n')
            k = 1
            for img in imgs:
                img_url = img.get('src')
                if not img_url: continue
                if image_ext is None or img_url.endswith(image_ext):
                    if not img_url.startswith("http"):
                        img_url = baseUrl + img_url
                    #file_name = "img_%04d.jpg" % k
                    #urlToFile( img_url, os.path.join( sub_dir, file_name) )
                    #print( img_url + " -> " + file_name )
                    #print( img_url )
                    f.write('<img src="' + img_url + '" /><br>\n')
                    k = k + 1
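
# --- Illustration only (not called anywhere in this script) -------------------
# Each episode page embeds its image list as a base64-encoded HTML fragment in
# a JavaScript variable:  var tnimg = '<base64>';
# getToonKor() above extracts and decodes that fragment. The sketch below shows
# the same step in isolation, using a sample payload that is built locally here
# (a stand-in, not data fetched from the site).
def demo_decode_tnimg():
    sample_fragment = '<img src="/data/001.jpg"><img src="/data/002.jpg">'
    sample_html = ("var tnimg = '"
                   + base64.b64encode(sample_fragment.encode("UTF-8")).decode("UTF-8")
                   + "';")

    marker = "var tnimg = '"
    begin = sample_html.index(marker) + len(marker)
    end = sample_html.index("';", begin)
    fragment = base64.b64decode(sample_html[begin:end].encode("UTF-8")).decode("UTF-8")
    imgs = bs4.BeautifulSoup(fragment, 'html.parser').select("img")
    print([img.get('src') for img in imgs])   # -> ['/data/001.jpg', '/data/002.jpg']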

# Download a fixed list of series into D:/Temp2/.
def saveToonKorComics():
    urls = [
        "https://tkr035.com/webtoon/1061",
    ]
    iurl = "https://tkr035.com"
    bdir = "D:/Temp2/"
    for url in urls:
        getToonKor(url, iurl, bdir)
        if quit_flag: break
    print("END")

# Crawl every series on a listing page; `start` skips the first entries so an
# interrupted run can be resumed.
def getToonKorList(list_url, start=0):
    doc = getUrl(list_url)
    lists = doc.select("div.section-item-inner")
    #print(lists)
    i = 0
    for l in lists:
        i += 1
        if i < start: continue
        comics = l.select("a")[0]
        print(i, len(lists), comics['alt'], comics['href'])
        getToonKor(comics['href'], "https://tkr035.com", target_folder)
        if quit_flag: break

# Listing pages of completed webtoons (완결); the non-zero start values resume
# partially crawled pages.
def get_finished_webtoons():
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0", 36)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=2", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=3", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=4", 130)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=5", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=6", 195)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=7", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=8", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=9", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=10", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=11", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=12", 0)
    getToonKorList("https://tkr035.com/wt/%EC%99%84%EA%B2%B0?gbun=&wpage=&page=13", 0)

# Listing page of the latest / ongoing webtoons.
def get_continue_webtoons():
    getToonKorList("https://tkr035.com/wt/%EC%B5%9C%EC%8B%A0/0/all/%EC%9D%B8%EA%B8%B0//%EC%A0%84%EC%B2%B4", 0)

# Crawl one page of the ongoing (연재중) listing for the given day tab into
# D:/Temp7/<week>.
def get_week_webtoons(week, page, start=0):
    global target_folder
    target_folder = r"D:/Temp7/" + str(week)
    url = "https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/"
    url += str(week)
    url += "/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page="
    url += str(page)
    getToonKorList(url, start)

def get_week_webtoons_all():
    #for i in range(3): get_week_webtoons(1,i+1)
    #get_week_webtoons(1,2,150)
    #get_week_webtoons(1,3)
    for i in range(3): get_week_webtoons(2, i+1)
    for i in range(3): get_week_webtoons(3, i+1)
    for i in range(3): get_week_webtoons(4, i+1)
    for i in range(3): get_week_webtoons(5, i+1)
    for i in range(3): get_week_webtoons(6, i+1)
    for i in range(3): get_week_webtoons(7, i+1)
    for i in range(1): get_week_webtoons(8, i+1)
    '''
    global target_folder
    target_folder = r"D:/Temp7/1"
    getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4")
    getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page=2")
    getToonKorList("https://tkr035.com/wt/%EC%97%B0%EC%9E%AC%EC%A4%91/1/all/%EC%9D%B8%EA%B8%B0/%EC%A0%84%EC%B2%B4?gbun=&wpage=&page=3")
    '''

if __name__ == "__main__":
    #get_continue_webtoons()
    get_week_webtoons_all()
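
To grab a single series instead of the full weekly listings, calling getToonKor() directly is enough; saveToonKorComics() above does exactly that. Below is a minimal sketch of such a run, meant to be used in place of the __main__ block at the end of the script. The series URL and output folder are the sample values already used in the script; substitute your own.

if __name__ == "__main__":
    base_url = "https://tkr035.com"
    series_url = base_url + "/webtoon/1061"   # sample series from saveToonKorComics()
    out_dir = r"D:/Temp2"                     # sample output folder; change as needed

    os.makedirs(out_dir, exist_ok=True)       # getToonKor() only creates the per-series subfolder
    getToonKor(series_url, base_url, out_dir)
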
Thursday, October 20, 2022
[python] ToonKor V2 Crawling All