2020년 6월 14일 일요일

[python] Beautiful Soup Example


1. Beautiful Soup Example
import requests
import bs4, codecs

# Example request headers carrying a desktop Chrome (macOS) User-Agent string.
request_headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
    # Naver movie ranking page.
    'Referer': 'http://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cnt&date=20170714',
}

# Example query parameters as (key, value) pairs; the key 'k1' appears twice.
get_params = (
    ('k1', 'v1'),
    ('k1', 'v3'),
    ('k2', 'v2'),
)

def getFile(url):
    """Parse a UTF-8 encoded HTML file into a BeautifulSoup tree.

    Despite its name, ``url`` is a local file path handed to ``codecs.open``.
    The file is closed before parsing starts.
    """
    with codecs.open(url, mode='r', encoding='utf8') as source:
        markup = source.read()
    soup = bs4.BeautifulSoup(markup, 'html.parser')
    return soup

def getUrl(url, headers=None, params=()):
    """Fetch ``url`` with HTTP GET and parse the response body as HTML.

    Args:
        url: Address to request.
        headers: Optional mapping of extra request headers. Defaults to
            ``None`` (no extra headers) instead of a shared mutable ``{}``.
        params: Query-string parameters forwarded to ``requests.get``.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from ``resp.text`` with the
        ``html.parser`` backend.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    # It is kept so existing callers behave the same; confirm it is needed.
    resp = requests.get(url, verify=False, headers=headers, params=params)
    return bs4.BeautifulSoup(resp.text, 'html.parser')

def postUrl(url, headers=None, params=(), data=None, fields=(), files=None):
    """Send an HTTP POST to ``url`` and parse the response body as HTML.

    The previous implementation issued a GET and dropped ``data`` and
    ``files``; this version really posts and forwards both.

    Args:
        url: Address to request.
        headers: Optional mapping of extra request headers.
        params: Query-string parameters forwarded to ``requests.post``.
        data: Optional request body. When given as a dict, requests sends it
            form-encoded (application/x-www-form-urlencoded).
        fields: Accepted for backward compatibility; currently unused.
        files: Optional mapping of files for a multipart upload.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from ``resp.text`` with the
        ``html.parser`` backend.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    # It is kept so existing callers behave the same; confirm it is needed.
    resp = requests.post(
        url,
        verify=False,
        headers=headers,
        params=params,
        data=data,
        files=files,
    )
    return bs4.BeautifulSoup(resp.text, 'html.parser')

def extractTag(bs, tag):
    """Call ``extract()`` on every element produced by ``bs(tag)``.

    With a Beautiful Soup object, ``bs(tag)`` is shorthand for
    ``bs.find_all(tag)`` and ``extract()`` detaches the element from the
    tree. Returns ``None``.
    """
    for node in bs(tag):
        node.extract()

if __name__ == "__main__":
    # Demo: load the Naver news front page, follow one headline link and
    # print that article's body with <script> and <a> elements removed.
    front_page = getUrl('https://news.naver.com')
    headline_links = front_page.select('div#pan_today_main_news a')
    # Index 0 links to today's event list, so it is skipped; the [1:2] slice
    # processes at most the single link that follows it.
    for link in headline_links[1:2]:
        print("TEXT: " + link.text.strip())
        target = link['href']
        print("LINK: " + target)
        article_page = getUrl(target)
        # Attribute selector form; 'div#articleBodyContents' is the id-selector
        # spelling of the same query.
        body = article_page.select('div[id=articleBodyContents]')[0]
        for unwanted in ('script', 'a'):
            extractTag(body, unwanted)
        print(body)
        print(body.text)


2. Html Strip Example

import bs4, codecs
import os

def getFile(url):
    """Read the UTF-8 file at path ``url`` and return it parsed as HTML.

    ``url`` is passed to ``codecs.open``, so it is a file path. The text is
    parsed with ``bs4.BeautifulSoup`` using the ``html.parser`` backend.
    """
    with codecs.open(url, mode='r', encoding='utf8') as reader:
        contents = reader.read()
    parsed = bs4.BeautifulSoup(contents, 'html.parser')
    return parsed

def extractTag(bs, tag):
    """Call ``extract()`` on each element that ``bs(tag)`` returns.

    With a Beautiful Soup object, ``bs(tag)`` is shorthand for
    ``bs.find_all(tag)`` and ``extract()`` detaches the element from the
    tree. Returns ``None``.
    """
    for element in bs(tag):
        element.extract()

def extractDiv(bs, tag):
    """Call ``extract()`` on every ``div`` found by ``bs.findAll("div", tag)``.

    ``tag`` is forwarded unchanged as the second positional argument of
    ``findAll`` (the attribute filter in Beautiful Soup, for example
    ``{"class": "navbar"}``). Returns ``None``.
    """
    matches = bs.findAll("div", tag)
    for div in matches:
        div.extract()
    
def fileToString(file):
    """Return the entire contents of the UTF-8 encoded file at ``file``.

    The file is opened with ``codecs.open`` inside a ``with`` block so the
    handle is closed even when reading or decoding raises; the previous
    version leaked it in that case.

    Args:
        file: Path of the file to read.

    Returns:
        The decoded text as a ``str``.
    """
    with codecs.open(file, "r", "utf-8") as f:
        return f.read()

def stringToFile(file, data):
    """Write ``data`` to the file at ``file`` as UTF-8, replacing its contents.

    The file is opened with ``codecs.open`` inside a ``with`` block so the
    handle is flushed and closed even when writing or encoding raises; the
    previous version leaked it in that case.

    Args:
        file: Path of the file to create or overwrite.
        data: Text to write.
    """
    with codecs.open(file, "w", "utf-8") as f:
        f.write(data)

if __name__ == "__main__":
    # For every file in ``dirname``: parse it, remove <nav> and <span>
    # elements plus <div class="navbar"> blocks, and write the result under
    # the same file name into ``newdir``.
    dirname = r"D:\Temp2\GoEx\golang.site\go\article"
    newdir = r"D:\Temp2\GoEx\golang.site\go\new"
    # Create the output directory up front; without it the first write fails.
    os.makedirs(newdir, exist_ok=True)
    for name in os.listdir(dirname):
        oldfile = os.path.join(dirname, name)
        # os.listdir also returns sub-directories, which getFile cannot open.
        if not os.path.isfile(oldfile):
            continue
        print(name)
        newfile = os.path.join(newdir, name)
        h = getFile(oldfile)
        extractTag(h, "nav")
        extractTag(h, "span")
        extractDiv(h, {"class": "navbar"})
        stringToFile(newfile, str(h))
        


댓글 없음:

댓글 쓰기