Zdiv's Software Snippet: [jython] JSoup Wolf.com example

1. Wolf.com example
import os
import sys
sys.path.append(r"d:\Lib\jar\jsoup-1.12.1.jar")

from java.io import *
from java.net import *
from java.util import *
from java.lang import *
from java.nio.file import *

from org.jsoup import *
from org.jsoup.nodes import *
from org.jsoup.select import *

# When True, getWolfCom prints the fetched HTML and each resolved chapter URL.
debug = False
# Suffix an image URL must end with to be downloaded (e.g. "jpg");
# None disables the check so every image is downloaded.
# NOTE(review): this name shadows the built-in filter() for the whole module.
filter = None #"jpg"

def decodeBase64(data):
    """Decode Base64-encoded *data* using the JDK's basic decoder.

    Args:
        data: Base64 input accepted by ``java.util.Base64.Decoder.decode``.

    Returns:
        Whatever the Java decoder returns for *data* (the decoded bytes).
    """
    # The file only star-imports from java.util, which does not bind the
    # top-level package name `java`; spelling the call as
    # java.util.Base64 therefore raised NameError. Import the class itself.
    from java.util import Base64
    return Base64.getDecoder().decode(data)
     
def urlToFile(urlStr, fileName, referer):
    """Download *urlStr* and store the response body in *fileName*.

    Redirects answered with 301, 302 or 303 are followed by hand, re-sending
    the same User-Agent and Referer headers on every hop. An existing file at
    *fileName* is overwritten.

    Args:
        urlStr: Absolute URL of the resource to download.
        fileName: Destination path on the local file system.
        referer: Value sent in the ``Referer`` request header.

    Raises:
        IOException: If the server answers with a status that is neither 200
            nor a handled redirect, if a redirect carries no ``Location``
            header, or if more than ``maxRedirects`` redirects are chained.
    """
    userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    redirectCodes = (
        HttpURLConnection.HTTP_MOVED_TEMP,
        HttpURLConnection.HTTP_MOVED_PERM,
        HttpURLConnection.HTTP_SEE_OTHER,
    )
    maxRedirects = 10

    def openWithHeaders(target):
        # Every hop gets the same headers; the redirected request previously
        # lost the Referer.
        conn = target.openConnection()
        conn.setRequestProperty("User-Agent", userAgent)
        conn.setRequestProperty("Referer", referer)
        return conn

    hc = openWithHeaders(URL(urlStr))
    status = hc.getResponseCode()

    redirects = 0
    while status != HttpURLConnection.HTTP_OK:
        # Any other status (404, 500, ...) never changes on its own, so
        # looping on it would spin forever; fail instead.
        if status not in redirectCodes:
            raise IOException("HTTP %d while fetching %s" % (status, hc.getURL()))

        redirects += 1
        if redirects > maxRedirects:
            raise IOException("Too many redirects while fetching %s" % urlStr)

        newUrl = hc.getHeaderField("Location")
        if newUrl is None:
            raise IOException("Redirect without Location header from %s" % hc.getURL())

        # Resolve against the current URL so a relative Location also works.
        hc = openWithHeaders(URL(hc.getURL(), newUrl))
        status = hc.getResponseCode()

    stream = hc.getInputStream()
    try:
        Files.copy(stream, Paths.get(fileName), StandardCopyOption.REPLACE_EXISTING)
    finally:
        # Files.copy does not close the source stream.
        stream.close()
    
def getJsoupDocument(url, maxRetries=None, delayMs=1000):
    """Fetch *url* with jsoup and return the parsed document, retrying on I/O errors.

    Args:
        url: Address of the page to fetch.
        maxRetries: Number of retries allowed after the first failed attempt.
            ``None`` (the default) retries without limit.
        delayMs: Pause between attempts, in milliseconds.

    Returns:
        The document returned by ``Jsoup.connect(url).get()``.

    Raises:
        IOException: The last I/O failure, once *maxRetries* is exhausted.
        Any non-I/O error (for example an invalid URL) propagates immediately,
        because retrying cannot make it succeed.
    """
    failures = 0
    while True:
        try:
            return Jsoup.connect(url).get()
        except IOException:
            # Only I/O failures are treated as transient and retried.
            failures += 1
            if maxRetries is not None and failures > maxRetries:
                raise
            Thread.sleep(delayMs)

def _safeName(title):
    """Return *title* with characters Windows forbids in file names replaced by "_"."""
    for ch in '\\/:*?"<>|':
        title = title.replace(ch, "_")
    return title


def getWolfCom(comicsUrl, baseUrl, baseDir):
    """Download every chapter image linked from a comic's table-of-contents page.

    A directory named after the table-of-contents page title is created under
    *baseDir*, with one sub-directory per chapter named after the chapter page
    title. Images are saved as ``img_0001.jpg``, ``img_0002.jpg``, ... in page
    order. A chapter that fails is reported on stdout and skipped; the
    remaining chapters are still processed.

    Args:
        comicsUrl: URL of the table-of-contents page; also sent as the
            Referer for every image download.
        baseUrl: Prefix prepended to chapter links and to image sources that
            do not start with "http".
        baseDir: Directory under which the comic directory is created.
    """
    doc_toc = getJsoupDocument(comicsUrl)
    if debug: print(doc_toc.html())

    chapters = doc_toc.select("div.box > div.group.left-box > div.webtoon-bbs-list.bbs-list > ul > li")
    if debug: print(chapters.html())

    comicDir = os.path.join(baseDir, _safeName(doc_toc.title()))
    # makedirs also creates baseDir when it does not exist yet.
    if not os.path.isdir(comicDir):
        os.makedirs(comicDir)

    for e in chapters:
        url = None
        try:
            link = e.select("a").first()
            if link is None:
                # List entry without a link: nothing to download.
                continue
            url = baseUrl + link.attr("href")
            if debug: print(url)

            doc_img = getJsoupDocument(url)
            imgs = doc_img.select("section.webtoon-body > div.group.image-view > img")
            print(doc_img.title())

            subdir = os.path.join(comicDir, _safeName(doc_img.title()))
            if not os.path.isdir(subdir):
                os.makedirs(subdir)

            k = 1
            for img in imgs:
                img_url = img.attr("src")
                if not img_url:
                    # An empty src would resolve to baseUrl itself.
                    continue
                if filter is None or img_url.endswith(filter):
                    if not img_url.startswith("http"):
                        img_url = baseUrl + img_url
                    file_name = "img_%04d.jpg" % k
                    k = k + 1
                    print( img_url + " -> " + file_name )
                    urlToFile(img_url, os.path.join(subdir, file_name), comicsUrl)
        except (KeyboardInterrupt, SystemExit):
            # Let the user stop the crawl.
            raise
        except:
            # Deliberately broad: both Python and Java errors must be caught,
            # and the star import from java.lang at the top of this file may
            # rebind the name Exception. Report the failure instead of hiding
            # it, then move on to the next chapter.
            print("skipped %s: %s" % (url, sys.exc_info()[1]))

if __name__ == "__main__":
    # Script entry point: crawl one hard-coded comic into a local folder.
    toc_url = "https://wfwf133.com/list?toon=1052&title=%BA%AE%BF%A1%B3%A2%C0%CE%BF%A9%C0%DA"  # table-of-contents page
    site_root = "https://wfwf133.com"  # prefix for site-relative links
    out_root = "D:/Temp2/"  # download root directory
    getWolfCom(toc_url, site_root, out_root)
Zdiv's Software Snippet

Link

2020년 6월 30일 화요일

[jython] JSoup Wolf.com example

댓글 없음:

댓글 쓰기

[python] WolfCom Crawling