Thursday, October 20, 2022

[python] ToonKor V1 Crawling

import bs4, codecs
import requests
import base64
import os
import io
import sys
import re

requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
image_ext = None  # e.g. '.jpg' to download only that extension; None downloads every image
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}

def safeFileName(filename):
    # Replace characters that are not safe in file/directory names with '_'.
    return re.sub(r"[:?/*<>\t.]", "_", filename).strip()
    
def getFile(url):
    # Parse a local HTML file (useful for testing without hitting the site).
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')

def getUrl(url, headers={}, params=()):
    # Fetch a page and return it parsed with BeautifulSoup (TLS verification is
    # disabled; the warning is suppressed above).
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')

def getUrlHtml(url, headers={}, params=()):
    # Like getUrl, but also return the raw HTML text so callers can search it.
    resp = requests.get(url, verify=False, headers=headers, params=params)
    html = resp.text
    return bs4.BeautifulSoup(html, 'html.parser'), html

def urlToFile(url, file_name):
    # Download url into file_name, retrying on network errors.
    while True:
        try:
            resp = requests.get(url, verify=False, headers=request_headers)
            with open(file_name, "wb") as f:
                f.write(resp.content)
            break
        except requests.RequestException:
            print('retry -->', file_name)
            continue
            
def extractTag(bs, tag):
    # Remove every <tag> element from the parsed document in place.
    for s in bs(tag):
        s.extract()
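
# getToonKor: crawl one comic.
#  1. Fetch the index page and read the episode links from table.web_list.
#  2. For each episode, fetch its page, decode the base64 blob assigned to the
#     JavaScript variable toon_img, and parse the <img> tags it contains.
#  3. Download every image into <baseDir>/<comic title>/<episode title>/.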

def getToonKor( comicsUrl, baseUrl, baseDir):
    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    table = doc.select("table.web_list")[0]
    elist = table.select("td.episode__index")

    #print( doc, title, table, elist )
    
    new_dir = os.path.join(baseDir, safeFileName(title))
    if not os.path.isdir(new_dir): os.mkdir(new_dir)

    for e in elist:
        # 'data-role' holds the path of the episode page.
        url = baseUrl + e['data-role']
        while True:
            try:
                bs_img, html_img = getUrlHtml(url, request_headers)
                title = bs_img.find("title").text
                # The image list is stored base64-encoded in the JS variable toon_img.
                marker = "var toon_img = '"
                begin = html_img.index(marker) + len(marker)
                end = html_img.index("';", begin)
                data = html_img[begin:end]
                img_list = base64.b64decode(data.encode("UTF-8")).decode("UTF-8")
                doc_imgs = bs4.BeautifulSoup(img_list, 'html.parser')
                imgs = doc_imgs.select("img")
                break
            except Exception:
                print('retry -->', url)
                continue
        sub_dir = os.path.join(new_dir, safeFileName(title))
        if not os.path.isdir(sub_dir): os.mkdir(sub_dir)
        else: print( 'skip -->', sub_dir ); continue
        print(sub_dir)

        k = 1
        for img in imgs:
            img_url = img.get('src')
            #print(img_url)
            if not img_url: continue
            if image_ext is None or img_url.endswith(image_ext):
                # Resolve relative image URLs against the site root.
                if not img_url.startswith("http"):
                    img_url = baseUrl + img_url
                # Number files sequentially, keeping the original extension when the
                # URL has one (and it is not part of the host name), else use .jpg.
                ext = img_url.rfind(".")
                if ext >= 0 and "/" not in img_url[ext:]:
                    file_name = ("img_%04d" % k) + img_url[ext:]
                else:
                    file_name = "img_%04d.jpg" % k
                urlToFile(img_url, os.path.join(sub_dir, file_name))
                print(img_url + " -> " + file_name)
                k += 1

if __name__ == "__main__":
    #https://tkor.fish/%EC%9B%B9%ED%88%B0
    url = "https://toonkor103.com/%EC%96%B4%EA%B2%8C%EC%9D%B8-%EB%A7%88%EC%9D%B4-%EB%9D%BC%EC%9D%B4%ED%94%84"
    baseUrl = "https://toonkor103.com"
    outDir = "D:/Temp2/"
    if len(sys.argv) > 1:
        url = sys.argv[1]
        # Site root = everything before the first '/' that follows the scheme.
        baseUrl = url[:url.find('/', 8)]
    if len(sys.argv) > 2:
        outDir = sys.argv[2]
    getToonKor(url, baseUrl, outDir)
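
# Usage: the first argument overrides the comic's list-page URL, the second the
# output directory; both are optional (the script file name below is only an example):
#   python toonkor_v1.py "https://toonkor103.com/<comic-path>" D:/Temp2/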
    
        
