ebook maker
import os
import sys
import codecs
import bs4


class html():
    """Thin wrapper around a BeautifulSoup document loaded from a UTF-8 file.

    Attributes:
        html: Raw markup read from disk, or None until ``open`` is called.
        bs:   Parsed ``bs4.BeautifulSoup`` tree, or None until ``open`` is
              called.
    """

    def __init__(self, filename=None):
        """Create the wrapper and, if *filename* is given, load it at once."""
        self.html = None
        self.bs = None
        if filename:
            self.open(filename)

    def open(self, filename):
        """Read *filename* as UTF-8 and parse it with the 'html.parser' backend."""
        with codecs.open(filename, 'r', encoding='utf8') as f:
            self.html = f.read()
        self.bs = bs4.BeautifulSoup(self.html, 'html.parser')

    def tags(self, tag, attrs=None):
        """Return every element named *tag*, optionally filtered by *attrs*.

        Example: ``tags('div', {'id': 'main'})``.
        """
        if attrs is None:
            return self.bs.find_all(tag)
        return self.bs.find_all(tag, attrs)

    def tag(self, tag):
        """Return the elements matching the CSS selector *tag*.

        Accepts forms such as ``a.class``, ``a#id`` or a bare tag name.
        """
        return self.bs.select(tag)

    def remove(self, css):
        """Detach every element matching the CSS selector *css* from the tree."""
        for node in self.bs.select(css):
            node.extract()

    def removeTags(self, tag):
        """Detach every element named *tag* and return the detached elements."""
        return [node.extract() for node in self.bs(tag)]

    def save(self, filename):
        """Write the current document markup to *filename* as UTF-8."""
        with codecs.open(filename, 'w', encoding='utf8') as f:
            f.write(self.bs.decode_contents())

    def appendHead(self, tag, text, attrs):
        """Append a new ``<tag>`` element to the document's ``<head>``.

        Args:
            tag:   Name of the element to create.
            text:  Text content for the element; skipped when falsy.
            attrs: Dict of attributes, expanded as keyword arguments so that
                   names such as ``data-role`` can be passed.
        """
        new_tag = self.bs.new_tag(tag, **attrs)
        if text:
            new_tag.string = text
        self.bs.head.append(new_tag)

    def wrapBody(self, tag, attrs):
        """Wrap all current children of ``<body>`` in a new ``<tag>`` element.

        Args:
            tag:   Name of the wrapper element.
            attrs: Dict of attributes for the wrapper.
        """
        wrapper = self.bs.new_tag(tag, **attrs)
        # Snapshot the children first: clearing the body detaches them.
        body_children = list(self.bs.body.children)
        self.bs.body.clear()
        self.bs.body.append(wrapper)
        for child in body_children:
            wrapper.append(child)


def files(dirname):
    """Return the paths of the entries in *dirname* whose extension is '.html'.

    Paths are *dirname* joined with the entry name, in ``os.listdir`` order.
    """
    return [
        os.path.join(dirname, entry)
        for entry in os.listdir(dirname)
        if os.path.splitext(entry)[1] == '.html'
    ]


# CSS selectors of the page elements that stripTags removes.
_STRIP_SELECTORS = (
    'iframe',
    'script',
    'div.navbar',
    'div.articleTopAd',
    'li.previous',
    'li.next',
    'div#footerDiv',
    'nav.col-md-3',
)


def stripTags(dirname=r'D:/Temp2/4'):
    """Remove unwanted elements from every '.html' file in *dirname*.

    Each cleaned document is written next to its source with '.htm' appended
    to the file name; the source file itself is left untouched.

    Args:
        dirname: Directory to process. Defaults to the location the script
                 originally had hard-coded.
    """
    for f in files(dirname):
        print(f)
        h = html(f)
        for selector in _STRIP_SELECTORS:
            h.remove(selector)
        h.save(f + '.htm')


def insertEbookTags(f):
    """Add the ebook stylesheets, scripts and layout wrappers to file *f*.

    The result is written to ``f + '.htm'``.
    """
    h = html(f)
    for href in ('../css/bootstrap.min.css',
                 '../css/styles.css',
                 '../css/highlight.pack.css'):
        h.appendHead('link', None, {'href': href, 'rel': 'stylesheet'})
    h.appendHead('script', None, {'src': '../js/highlight.pack.js'})
    # The Python literal must be double-quoted: with single quotes the inner
    # JS quotes end the literal and adjacent-literal concatenation silently
    # drops them, producing the invalid script "tabReplace: }".
    # NOTE(review): the replacement width is kept as it appears in the
    # source (one space) - confirm the intended number of spaces.
    h.appendHead(
        'script',
        "hljs.configure({tabReplace: ' '}); hljs.initHighlightingOnLoad();",
        {})
    # Each call wraps the previous wrapper, so the last class listed ends up
    # as the outermost <div>.
    for css_class in ('main', 'col-xs-12', 'row', 'container-fluid'):
        h.wrapBody('div', {'class': css_class})
    h.save(f + '.htm')


def makeEbook(directory):
    """Run insertEbookTags on every '.html' file in *directory*."""
    for f in files(directory):
        insertEbookTags(f)
        print(f)


def getLinks(filename):
    """Dump the links of *filename* to ``filename + '.txt'``.

    One line is written per ``<a>`` element that has an ``href`` attribute,
    in the form ``{ "link text", "href" },`` with line breaks removed from
    the link text. Anchors without ``href`` are skipped.
    """
    h = html(filename)
    data = []
    for link in h.tags('a'):
        href = link.attrs.get('href') if link.attrs else None
        if href is None:
            continue
        # str.replace takes a literal substring, not a character class, so
        # carriage returns and newlines are stripped one at a time.
        text = link.text.replace('\r', '').replace('\n', '')
        data.append('{ "' + text + '", "' + href + '" },\n')
    with codecs.open(filename + '.txt', 'w', 'utf-8') as f:
        f.writelines(data)


if __name__ == '__main__':
    # Alternative entry points used during development:
    #   stripTags()
    #   makeEbook('d:\\Android\\ebook\\work\\advbash')
    #   getLinks('d:\\Android\\ebook\\work\\advbash\\index.html')
    #   getLinks('d:\\Android\\ebook\\bs4_index.html')
    insertEbookTags(r'd:\Android\ebook\work\expect\autoexpect.html')
댓글 없음:
댓글 쓰기