1. Beautiful Soup Example
"""Fetch a news index page with requests and print the first linked article.

Helpers wrap ``requests`` + ``BeautifulSoup`` so that a local file, a GET
response, or a POST response can each be turned into a parsed document.
"""
import codecs

import bs4
import requests

# Example request headers (browser-like User-Agent plus a Referer).
request_headers = {
    'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
    # Movie ranking page.
    'Referer': 'http://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cnt&date=20170714',
}

# Example query parameters; a sequence of pairs allows the repeated key 'k1'.
get_params = (('k1', 'v1'), ('k1', 'v3'), ('k2', 'v2'))


def getFile(url):
    """Read the UTF-8 file at path *url* and return it parsed as BeautifulSoup."""
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def getUrl(url, headers=None, params=(), verify=False):
    """GET *url* and return the response body parsed as BeautifulSoup.

    Args:
        url: Address to request.
        headers: Optional mapping of extra request headers.
        params: Query-string parameters (mapping or sequence of pairs).
        verify: Passed through to ``requests``. The default ``False`` keeps
            the original behaviour.
            NOTE: ``False`` disables TLS certificate verification; pass
            ``True`` when talking to hosts you do not control.
    """
    resp = requests.get(url, verify=verify, headers=headers, params=params)
    # resp.text decodes the body using the encoding requests detected.
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def postUrl(url, headers=None, params=(), data=None, fields=(), files=None,
            verify=False):
    """POST to *url* and return the response body parsed as BeautifulSoup.

    The previous implementation issued a GET and dropped ``data``/``files``;
    this one actually sends them in a POST request.

    Args:
        url: Address to request.
        headers: Optional mapping of extra request headers.
        params: Query-string parameters (mapping or sequence of pairs).
        data: Form body; requests sends a mapping as
            application/x-www-form-urlencoded.
        fields: Accepted for interface compatibility; currently unused.
        files: Optional mapping of files for a multipart upload.
        verify: Passed through to ``requests``; see ``getUrl``.
    """
    resp = requests.post(url, verify=verify, headers=headers, params=params,
                         data=data, files=files)
    return bs4.BeautifulSoup(resp.text, 'html.parser')


def extractTag(bs, tag):
    """Remove every *tag* element from *bs* in place.

    Calling a soup/tag object (``bs(tag)``) is shorthand for
    ``bs.find_all(tag)``.
    """
    for node in bs(tag):
        node.extract()


def main():
    """Print the text and link of the first headline, then its article body."""
    bs = getUrl('https://news.naver.com')
    a_list = bs.select('div#pan_today_main_news a')
    # Index 0 links to today's event list, so skip it.
    for a in a_list[1:]:
        href = a.get('href')
        if not href:
            # Anchor without a target: nothing to fetch, try the next one.
            continue
        print("TEXT: " + a.text.strip())
        print("LINK: " + href)
        bs1 = getUrl(href)
        article = bs1.select('div[id=articleBodyContents]')
        if not article:
            # The selector matched nothing (layout changed or not an article).
            print("No article body found")
            break
        body = article[0]
        extractTag(body, 'script')
        extractTag(body, 'a')
        print(body)
        print(body.text)
        # Only the first article is shown.
        break


if __name__ == "__main__":
    main()
2. HTML Strip Example
"""Strip navigation markup from every HTML file in a directory.

Each file in the source directory is parsed, has its ``<nav>`` and ``<span>``
elements and its ``<div class="navbar">`` elements removed, and is written
under the same name to the destination directory.
"""
import codecs
import os

import bs4

# Default input and output directories used when run as a script.
SOURCE_DIR = r"D:\Temp2\GoEx\golang.site\go\article"
TARGET_DIR = r"D:\Temp2\GoEx\golang.site\go\new"


def getFile(url):
    """Read the UTF-8 file at path *url* and return it parsed as BeautifulSoup."""
    with codecs.open(url, 'r', encoding='utf8') as f:
        html = f.read()
    return bs4.BeautifulSoup(html, 'html.parser')


def extractTag(bs, tag):
    """Remove every *tag* element from *bs* in place.

    Calling a soup/tag object (``bs(tag)``) is shorthand for
    ``bs.find_all(tag)``.
    """
    for node in bs(tag):
        node.extract()


def extractDiv(bs, tag):
    """Remove every ``<div>`` matching the attribute filter *tag* from *bs*.

    *tag* is passed as the attribute filter to ``find_all``, e.g.
    ``{"class": "navbar"}``.
    """
    # find_all is the current name of the deprecated findAll alias.
    for node in bs.find_all("div", tag):
        node.extract()


def fileToString(file):
    """Return the full contents of the UTF-8 text file at path *file*."""
    # The with-block closes the handle even if read() raises.
    with codecs.open(file, "r", "utf-8") as f:
        return f.read()


def stringToFile(file, data):
    """Write the string *data* to path *file* as UTF-8, replacing any content."""
    # The with-block closes the handle even if write() raises.
    with codecs.open(file, "w", "utf-8") as f:
        f.write(data)


def main(dirname=SOURCE_DIR, newdir=TARGET_DIR):
    """Clean every file in *dirname* and write the result into *newdir*."""
    # Create the output directory up front so the first write cannot fail on it.
    os.makedirs(newdir, exist_ok=True)
    for name in os.listdir(dirname):
        oldfile = os.path.join(dirname, name)
        if not os.path.isfile(oldfile):
            # Sub-directories cannot be opened as HTML files; skip them.
            continue
        print(name)
        newfile = os.path.join(newdir, name)
        h = getFile(oldfile)
        extractTag(h, "nav")
        extractTag(h, "span")
        extractDiv(h, {"class": "navbar"})
        stringToFile(newfile, str(h))


if __name__ == "__main__":
    main()
댓글 없음:
댓글 쓰기