Zdiv's Software Snippet: [python] BS4 WolfCom Example

1. WolfCom Example

import requests
import bs4, codecs
import os
import io
 
# Silence urllib3's InsecureRequestWarning: the HTTP helpers in this file call
# requests.get(..., verify=False), which would otherwise warn on every request.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
# Suffix an <img> src must end with to be downloaded by getWolfCom;
# set to None to disable the filter and download every image.
image_ext = 'jpg'
# HTTP headers shared by the chapter-page and image requests.
# The User-Agent is a desktop Chrome string (adjacent literals are concatenated).
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}
 
def getFile(url):
    """Parse a local HTML file into a BeautifulSoup document.

    Args:
        url: Despite its name, a filesystem path to a UTF-8 encoded HTML file.

    Returns:
        The file's content parsed with the ``html.parser`` backend.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Built-in open() supersedes codecs.open() (deprecated in Python 3.14).
    # newline='' leaves line endings untranslated, matching what codecs.open()
    # returned because it reads the underlying file in binary mode.
    with open(url, 'r', encoding='utf8', newline='') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')
 
def getUrl(url, headers=None, params=()):
    """Fetch *url* with an HTTP GET and parse the response body as HTML.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of extra request headers. Defaults to
            ``None`` (no extra headers) instead of a shared mutable ``{}``.
        params: Optional query parameters forwarded to ``requests.get``.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Note:
        ``verify=False`` disables TLS certificate verification for the request.
        The HTTP status code is not checked, so an error page is parsed like
        any other response.
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')
 
def urlToFile(url, file_name, referer=None):
    """Download *url* and write the raw response body to *file_name*.

    Args:
        url: Address of the resource to download.
        file_name: Path of the file to create or overwrite (binary mode).
        referer: Optional value for the ``Referer`` request header; when
            ``None`` no ``Referer`` header is sent.

    Note:
        ``verify=False`` disables TLS certificate verification. The HTTP
        status code is not checked, so an error response body is written to
        the file as-is.
    """
    # Work on a per-call copy: writing Referer into the shared module-level
    # request_headers would leak it into every later request made with it.
    headers = dict(request_headers)
    if referer is None:
        headers.pop('Referer', None)
    else:
        headers['Referer'] = referer

    resp = requests.get(url, verify=False, headers=headers, params=())
    with open(file_name, "wb") as f:
        f.write(resp.content)
    
def extractTag(bs, tag):
    """Remove every element matching *tag* from the parsed document *bs*.

    Args:
        bs: A parsed document (or element); calling it with a tag name must
            return the matching elements, as BeautifulSoup objects do.
        tag: Name of the tag whose elements should be removed.

    Returns:
        None. The document is modified in place.
    """
    # A plain loop instead of a list comprehension: the call is made purely
    # for its side effect, so there is no reason to build a throwaway list.
    for element in bs(tag):
        element.extract()
 
def _safeDirName(title):
    """Turn a page title into a string that is safe to use as a directory name.

    Surrounding whitespace is stripped, and every character Windows rejects in
    file names (``\\ / : * ? " < > |`` and control characters) becomes ``_``.
    """
    import re  # function-scope import keeps this block self-contained

    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title.strip())


def getWolfCom(comicsUrl, baseUrl, baseDir):
    """Download every chapter image of one comic into a directory tree.

    The list page at *comicsUrl* is fetched and a directory named after its
    ``<title>`` is created under *baseDir*. For each list entry that links to a
    chapter, the chapter page is fetched, a sub-directory named after the
    chapter's ``<title>`` is created, and its images are saved there as
    ``img_0001.jpg``, ``img_0002.jpg``, ... in page order. Progress is printed.

    Args:
        comicsUrl: URL of the comic's chapter-list page. Also sent as the
            ``Referer`` of every image request.
        baseUrl: Site root against which relative links are resolved.
        baseDir: Directory under which the comic's folder is created; it is
            created too when missing.

    Note:
        When the module-level ``image_ext`` is not ``None``, only images whose
        ``src`` ends with it are downloaded. Files always get a ``.jpg`` name.
    """
    from urllib.parse import urljoin  # function-scope import, see _safeDirName

    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    elist = doc.select("div.box > div.group.left-box > div.webtoon-bbs-list.bbs-list > ul > li")
    print(title)

    # makedirs(exist_ok=True) also creates a missing baseDir and has no
    # check-then-create race, unlike isdir() followed by mkdir().
    new_dir = os.path.join(baseDir, _safeDirName(title))
    os.makedirs(new_dir, exist_ok=True)

    for e in elist:
        a = e.find('a', "view_open", href=True)
        if not a:
            continue
        # urljoin copes with hrefs that are absolute, protocol-relative or
        # lack a leading slash, where plain concatenation builds a broken URL.
        url = urljoin(baseUrl, a['href'])

        chapter = getUrl(url, request_headers)
        chapter_title = chapter.find("title").text
        imgs = chapter.select("section.webtoon-body div.group.image-view img")
        print(chapter_title)

        sub_dir = os.path.join(new_dir, _safeDirName(chapter_title))
        os.makedirs(sub_dir, exist_ok=True)

        # k counts only the images actually saved, so file numbers stay
        # contiguous even when some <img> tags are skipped.
        k = 1
        for img in imgs:
            img_url = img.get('src')
            if not img_url:
                continue
            if image_ext is not None and not img_url.endswith(image_ext):
                continue
            img_url = urljoin(baseUrl, img_url)
            file_name = "img_%04d.jpg" % k
            urlToFile(img_url, os.path.join(sub_dir, file_name), comicsUrl)
            print(img_url + " -> " + file_name)
            k = k + 1
        
if __name__ == "__main__":
    # Sample run with hard-coded settings: site root, output directory and
    # the chapter-list page of one comic.
    iurl = "https://wfwf164.com"
    bdir = "D:/Temp2/"
    url = "https://wfwf164.com/list?toon=2063&title=%B0%ED%B0%ED%BD%BA%C0%AD%B8%BE"
    getWolfCom(comicsUrl=url, baseUrl=iurl, baseDir=bdir)
    print("END")

import bs4, codecs
import requests
import os
import io

# Silence urllib3's InsecureRequestWarning: the HTTP helpers in this file call
# requests.get(..., verify=False), which would otherwise warn on every request.
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
# Suffix an <img> src must end with to be downloaded by getWolfCom;
# set to None to disable the filter and download every image.
image_ext = 'jpg'
# HTTP headers shared by the chapter-page and image requests.
# The User-Agent is a desktop Chrome string (adjacent literals are concatenated).
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
}

def getFile(url):
    """Parse a local HTML file into a BeautifulSoup document.

    Args:
        url: Despite its name, a filesystem path to a UTF-8 encoded HTML file.

    Returns:
        The file's content parsed with the ``html.parser`` backend.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Built-in open() supersedes codecs.open() (deprecated in Python 3.14).
    # newline='' leaves line endings untranslated, matching what codecs.open()
    # returned because it reads the underlying file in binary mode.
    with open(url, 'r', encoding='utf8', newline='') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')

def getUrl(url, headers=None, params=()):
    """Fetch *url* with an HTTP GET and parse the response body as HTML.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of extra request headers. Defaults to
            ``None`` (no extra headers) instead of a shared mutable ``{}``.
        params: Optional query parameters forwarded to ``requests.get``.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Note:
        ``verify=False`` disables TLS certificate verification for the request.
        The HTTP status code is not checked, so an error page is parsed like
        any other response.
    """
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')

def urlToFile(url, file_name):
    """Download *url* and store the raw response body in *file_name*.

    The request is sent with the module-level ``request_headers`` and with TLS
    certificate verification turned off. The target file is opened in binary
    mode and overwritten if it already exists.
    """
    response = requests.get(
        url,
        verify=False,
        headers=request_headers,
        params=(),
    )
    with open(file_name, "wb") as out:
        out.write(response.content)
    
def extractTag(bs, tag):
    """Remove every element matching *tag* from the parsed document *bs*.

    Args:
        bs: A parsed document (or element); calling it with a tag name must
            return the matching elements, as BeautifulSoup objects do.
        tag: Name of the tag whose elements should be removed.

    Returns:
        None. The document is modified in place.
    """
    # A plain loop instead of a list comprehension: the call is made purely
    # for its side effect, so there is no reason to build a throwaway list.
    for element in bs(tag):
        element.extract()

def _safeDirName(title):
    """Turn a page title into a string that is safe to use as a directory name.

    Surrounding whitespace is stripped, and every character Windows rejects in
    file names (``\\ / : * ? " < > |`` and control characters) becomes ``_``.
    """
    import re  # function-scope import keeps this block self-contained

    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title.strip())


def getWolfCom(comicsUrl, baseUrl, baseDir):
    """Download every chapter image of one comic into a directory tree.

    The list page at *comicsUrl* is fetched and a directory named after its
    ``<title>`` is created under *baseDir*. For each list entry that links to a
    chapter, the chapter page is fetched, a sub-directory named after the
    chapter's ``<title>`` is created, and its images are saved there as
    ``img_0001.jpg``, ``img_0002.jpg``, ... in page order. Each saved image is
    reported on stdout.

    Args:
        comicsUrl: URL of the comic's chapter-list page.
        baseUrl: Site root against which relative links are resolved.
        baseDir: Directory under which the comic's folder is created; it is
            created too when missing.

    Note:
        When the module-level ``image_ext`` is not ``None``, only images whose
        ``src`` ends with it are downloaded. Files always get a ``.jpg`` name.
    """
    from urllib.parse import urljoin  # function-scope import, see _safeDirName

    doc = getUrl(comicsUrl)
    title = doc.find("title").text
    elist = doc.select("div.box > div.group.left-box > div.webtoon-bbs-list.bbs-list > ul > li")

    # makedirs(exist_ok=True) also creates a missing baseDir and has no
    # check-then-create race, unlike isdir() followed by mkdir().
    new_dir = os.path.join(baseDir, _safeDirName(title))
    os.makedirs(new_dir, exist_ok=True)

    for e in elist:
        a = e.find('a', "view_open", href=True)
        if not a:
            continue
        # urljoin copes with hrefs that are absolute, protocol-relative or
        # lack a leading slash, where plain concatenation builds a broken URL.
        url = urljoin(baseUrl, a['href'])

        chapter = getUrl(url, request_headers)
        chapter_title = chapter.find("title").text
        imgs = chapter.select("section.webtoon-body div.group.image-view img")

        sub_dir = os.path.join(new_dir, _safeDirName(chapter_title))
        os.makedirs(sub_dir, exist_ok=True)

        # k counts only the images actually saved, so file numbers stay
        # contiguous even when some <img> tags are skipped.
        k = 1
        for img in imgs:
            img_url = img.get('src')
            if not img_url:
                continue
            if image_ext is not None and not img_url.endswith(image_ext):
                continue
            img_url = urljoin(baseUrl, img_url)
            file_name = "img_%04d.jpg" % k
            urlToFile(img_url, os.path.join(sub_dir, file_name))
            print(img_url + " -> " + file_name)
            k = k + 1

if __name__ == "__main__":
    # Sample run with hard-coded settings: site root, output directory and
    # the chapter-list page of one comic.
    iurl = "https://wfwf98.com"
    bdir = "D:/Temp2/"
    url = "https://wfwf98.com/list?toon=1235&title=%B8%F0%B5%CE%C0%C7%BE%CB%B9%D9"
    getWolfCom(comicsUrl=url, baseUrl=iurl, baseDir=bdir)

Zdiv's Software Snippet

Link

2020년 6월 28일 일요일

[python] BS4 WolfCom Example

댓글 없음:

댓글 쓰기

[python] WolfCom Crawling