import codecs
import io
import os

import bs4
import requests

# Every request in this script is made with verify=False, so silence the
# InsecureRequestWarning that urllib3 would otherwise emit on each call.
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)

# Only <img> sources ending with this suffix are downloaded by getWolfCom;
# setting it to None disables the filter.
image_ext = 'jpg'

# Base headers sent with page and image requests. Treated as read-only:
# per-request additions are made on a copy (see urlToFile).
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}


def getFile(url):
    """Parse a local HTML file into a BeautifulSoup document.

    Args:
        url: Path of the file to read (opened locally, decoded as UTF-8).

    Returns:
        The parsed ``bs4.BeautifulSoup`` document.
    """
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrl(url, headers=None, params=()):
    """Fetch ``url`` with a GET request and parse the body as HTML.

    Args:
        url: Address to fetch.
        headers: Optional mapping of request headers. Defaults to no extra
            headers.
        params: Optional query parameters forwarded to ``requests.get``.

    Returns:
        The parsed ``bs4.BeautifulSoup`` document.

    Note:
        TLS verification is disabled and the HTTP status is not checked;
        whatever body the server returns is parsed.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if headers is None:
        headers = {}
    resp = requests.get(url, verify=False, headers=headers, params=params)
    # resp.text lets requests pick the text encoding for the response body.
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def urlToFile(url, file_name, referer=None):
    """Download ``url`` and write the raw response body to ``file_name``.

    Args:
        url: Address of the resource to download.
        file_name: Destination path; overwritten if it already exists.
        referer: Optional value for the ``Referer`` request header.

    Note:
        TLS verification is disabled and the HTTP status is not checked, so
        an error response body is written to disk as-is.
    """
    # Work on a copy so the Referer does not leak into the module-level
    # request_headers that other requests reuse.
    headers = dict(request_headers)
    if referer is not None:
        headers['Referer'] = referer
    resp = requests.get(url, verify=False, headers=headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)


def extractTag(bs, tag):
    """Remove every ``tag`` element from the parsed document ``bs`` in place."""
    for s in bs(tag):
        s.extract()


def getWolfCom(comicsUrl, baseUrl, baseDir):
    """Download the images of every entry linked from a list page.

    The list page at ``comicsUrl`` is fetched, a directory named after its
    <title> is created under ``baseDir``, and for each list item that has an
    ``a.view_open`` link the linked page is fetched and its images are saved
    into a sub-directory named after that page's <title>.

    Args:
        comicsUrl: URL of the list page; also sent as the Referer for every
            image download.
        baseUrl: Prefix prepended to entry links and to image sources that do
            not start with "http".
        baseDir: Directory under which the output tree is created.
    """
    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    elist = doc.select("div.box > div.group.left-box > div.webtoon-bbs-list.bbs-list > ul > li")
    print(title)

    # ':' is replaced because it is not allowed in Windows path components.
    new_dir = os.path.join(baseDir, title.replace(":", "_"))
    os.makedirs(new_dir, exist_ok=True)

    for e in elist:
        a = e.find('a', "view_open", href=True)
        if not a:
            continue

        url = baseUrl + a['href']
        doc = getUrl(url, request_headers)
        title = doc.find("title").text
        imgs = doc.select("section.webtoon-body div.group.image-view img")
        print(title)

        sub_dir = os.path.join(new_dir, title.replace(":", "_"))
        os.makedirs(sub_dir, exist_ok=True)

        # k numbers only the images actually saved, so file names stay
        # contiguous even when some <img> tags are skipped.
        k = 1
        for img in imgs:
            img_url = img.get('src')
            if not img_url:
                continue
            if image_ext is not None and not img_url.endswith(image_ext):
                continue
            if not img_url.startswith("http"):
                img_url = baseUrl + img_url
            file_name = "img_%04d.jpg" % k
            urlToFile(img_url, os.path.join(sub_dir, file_name), comicsUrl)
            print(img_url + " -> " + file_name)
            k = k + 1


def getMultipleWolfCom(url):
    """Run getWolfCom on ``url`` with the hard-coded site root and output directory."""
    iurl = "https://wfwf164.com"
    bdir = "D:/Temp2/"
    getWolfCom(url, iurl, bdir)


if __name__ == "__main__":
    urls = [
        "https://wfwf164.com/list?toon=585&title=%B9%DD%C1%DF%B7%C2%BC%D2%B3%E0",
        "https://wfwf164.com/list?toon=1114&title=%BF%C1%C5%BE%C0%C7%C0%FC%BC%B3",
        "https://wfwf164.com/list?toon=1387&title=%B3%CA%C5%AC%B0%C9KNUCKLEGIRL",
    ]
    iurl = "https://wfwf164.com"
    bdir = "D:/Temp2/"
    for url in urls:
        getWolfCom(url, iurl, bdir)
    print("END")
2022년 10월 20일 목요일
[python] WolfCom Crawling
피드 구독하기:
댓글 (Atom)
댓글 없음:
댓글 쓰기