2020년 8월 12일 수요일

[IronPython] Web Crawling with NSoup

 1. WolfCom example

import os
import clr
clr.AddReferenceToFileAndPath("NSoup")
import NSoup

import System
from System.IO import *
from System.Net import *

# When True, print the raw markup of the matched episode list entries.
debug = False
# Optional suffix filter for image URLs (e.g. ".jpg"); None downloads every <img>.
image_ext = None

# Characters that Windows does not allow inside a file or directory name.
INVALID_NAME_CHARS = '\\/:*?"<>|'


def safe_dir_name(title):
    """Return *title* made safe for use as a single directory name.

    Every character Windows rejects in a path component is replaced with "_",
    and surrounding whitespace / trailing dots are removed because Windows
    silently drops them.  An empty result falls back to "_".
    """
    for ch in INVALID_NAME_CHARS:
        title = title.replace(ch, "_")
    title = title.strip().rstrip(" .")
    return title or "_"


def absolute_url(base, href):
    """Resolve *href* against the site root *base*.

    Absolute URLs are returned unchanged, protocol-relative ones ("//host/..")
    inherit the scheme of *base*, and anything else is appended to *base*.
    """
    if href.startswith("http"):
        return href
    if href.startswith("//"):
        return base.split(":", 1)[0] + ":" + href
    return base + href


def ensure_dir(path):
    """Create *path* (including missing parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


if __name__ == "__main__":
    list_url = "https://wfwf104.com/list?toon=1229"
    baseUrl = "https://wfwf104.com"
    baseDir = r"D:\Temp3"

    # Fetch the listing page; each matched <li> is one episode entry.
    listing = NSoup.NSoupClient.Connect(list_url).Get()
    episodes = listing.Select("div.box > div.group.left-box > div.webtoon-bbs-list.bbs-list > ul > li")
    if debug:
        print(episodes.Html())

    print(listing.Title)
    new_dir = os.path.join(baseDir, safe_dir_name(listing.Title))
    ensure_dir(new_dir)
    print(new_dir)

    # One WebClient is shared by all downloads and released when the crawl
    # ends, even if a request fails part-way through.
    client = WebClient()
    try:
        for episode in episodes:
            link = episode.Select("a").First
            if link is None:
                # Entry without an anchor: nothing to follow.
                continue
            href = link.Attr("href")
            if not href:
                continue

            page = NSoup.NSoupClient.Connect(absolute_url(baseUrl, href)).Get()
            imgs = page.Select("section.webtoon-body > div.group.image-view > img")
            print(page.Title)

            sub_dir = os.path.join(new_dir, safe_dir_name(page.Title))
            ensure_dir(sub_dir)

            # k numbers only the images actually saved, so files stay contiguous.
            k = 1
            for img in imgs:
                src = img.Attr('src')
                if not src:
                    continue
                if image_ext is not None and not src.endswith(image_ext):
                    continue
                img_url = absolute_url(baseUrl, src)
                file_name = "img_%04d.jpg" % k
                client.DownloadFile(img_url, os.path.join(sub_dir, file_name))
                print(img_url + " -> " + file_name)
                k += 1
    finally:
        client.Dispose()


    
    
    
    

댓글 없음:

댓글 쓰기