# ebook maker
import os
import sys
import codecs
import bs4
class html():
    """Small convenience wrapper around a BeautifulSoup tree read from a file.

    Public attributes:
      html -- text of the most recently opened file, or None before open().
      bs   -- bs4.BeautifulSoup object built from that text, or None.
    """

    def __init__(self, filename=None):
        """Start empty; when *filename* is truthy, load it immediately."""
        self.html = None
        self.bs = None
        if filename:
            self.open(filename)

    def open(self, filename):
        """Read *filename* as UTF-8 and parse it with the 'html.parser' builder."""
        with codecs.open(filename, 'r', encoding='utf8') as f:
            self.html = f.read()
        self.bs = bs4.BeautifulSoup(self.html, 'html.parser')

    def tags(self, tag, attrs=None):
        """Return the find_all() result for *tag*, filtered by *attrs* if given.

        Example: tags('div', {'id': 'content'}).
        """
        if attrs is None:
            return self.bs.find_all(tag)
        return self.bs.find_all(tag, attrs)

    def tag(self, tag):
        """Return the elements matching a CSS selector, e.g. 'a.class', 'a#id', 'a'."""
        return self.bs.select(tag)

    def remove(self, css):
        """Call extract() on every element matching the CSS selector *css*."""
        # An empty selection simply makes the loop a no-op.
        for node in self.bs.select(css):
            node.extract()

    def removeTags(self, tag):
        """Call extract() on every element named *tag*; return the results as a list."""
        return [s.extract() for s in self.bs(tag)]

    def save(self, filename):
        """Write the document (self.bs.decode_contents()) to *filename* as UTF-8."""
        with codecs.open(filename, 'w', encoding='utf8') as f:
            f.write(self.bs.decode_contents())

    def appendHead(self, tag, text, attrs):
        """Append a new element to the document's <head>.

        tag   -- element name, e.g. 'link' or 'script'.
        text  -- string content for the element; skipped when falsy.
        attrs -- dict of attributes, expanded into keyword arguments of
                 new_tag(), e.g. {'data-role': 'content'}.
        """
        new_tag = self.bs.new_tag(tag, **attrs)
        if text:
            new_tag.string = text
        self.bs.head.append(new_tag)

    def wrapBody(self, tag, attrs):
        """Wrap everything currently inside <body> in one new *tag* element.

        attrs -- dict of attributes for the wrapper, e.g. {'class': 'main'}.
        """
        wrapper = self.bs.new_tag(tag, **attrs)
        # Snapshot the children first: clear() empties <body>, and the
        # wrapper must be in place before the old children are re-attached.
        body_children = list(self.bs.body.children)
        self.bs.body.clear()
        self.bs.body.append(wrapper)
        for child in body_children:
            wrapper.append(child)
def files(dirname):
    """Return the paths of the '.html' entries directly inside *dirname*.

    The extension comparison is exact and case-sensitive, so 'page.htm' and
    'page.html.htm' are not included. Each returned path is *dirname* joined
    with the entry name. Entries are returned in sorted name order so that
    repeated runs process files in the same sequence (os.listdir itself
    gives no ordering guarantee).

    Raises whatever os.listdir raises (e.g. when *dirname* does not exist).
    """
    return [
        os.path.join(dirname, entry)
        for entry in sorted(os.listdir(dirname))
        if os.path.splitext(entry)[1] == '.html'
    ]
def stripTags(dirname=r'D:/Temp2/4'):
    """Remove unwanted elements from every file that files(dirname) returns.

    For each file the path is printed, the elements matching the selectors
    below are removed, and the result is saved next to the source under the
    same name with '.htm' appended. The source file is not overwritten.

    dirname -- directory to process. Defaults to the location this function
               used before it took a parameter, so stripTags() still works.
    """
    # CSS selectors of the elements dropped from each page.
    unwanted = (
        'iframe',
        'script',
        'div.navbar',
        'div.articleTopAd',
        'li.previous',
        'li.next',
        'div#footerDiv',
        'nav.col-md-3',
    )
    for path in files(dirname):
        print(path)
        doc = html(path)
        for selector in unwanted:
            doc.remove(selector)
        doc.save(path + '.htm')
def insertEbookTags(f):
    """Add the ebook stylesheets, scripts and layout wrappers to one HTML file.

    f -- path of the HTML file to read. The result is written to f + '.htm';
         the input file is not overwritten.
    """
    h = html(f)

    stylesheets = (
        '../css/bootstrap.min.css',
        '../css/styles.css',
        '../css/highlight.pack.css',
    )
    for href in stylesheets:
        h.appendHead('link', None, {'href': href, 'rel': 'stylesheet'})
    h.appendHead('script', None, {'src': '../js/highlight.pack.js'})

    # The Python literal is double-quoted on purpose: the JavaScript needs
    # the single-quoted ' ' inside it. Written with single quotes, Python
    # would see two adjacent literals and emit `{tabReplace: }`, which is
    # not valid JavaScript.
    h.appendHead(
        'script',
        "hljs.configure({tabReplace: ' '}); hljs.initHighlightingOnLoad();",
        {},
    )

    # Every wrapBody() call wraps whatever <body> holds at that moment, so
    # the class listed last ends up as the outermost <div>.
    for css_class in ('main', 'col-xs-12', 'row', 'container-fluid'):
        h.wrapBody('div', {'class': css_class})

    h.save(f + '.htm')
def makeEbook(directory):
    """Run insertEbookTags() on each path that files(directory) returns.

    The path is printed after its file has been processed.
    """
    for page in files(directory):
        insertEbookTags(page)
        print(page)
def getLinks(filename):
    """Write the links found in an HTML file to filename + '.txt' (UTF-8).

    Every <a> element that carries an href attribute produces one line of
    the form:

        { "link text", "href" },

    Carriage returns and line feeds are removed from the link text so each
    entry stays on a single line. <a> elements without an href are skipped.
    """
    h = html(filename)
    entries = []
    for link in h.tags('a'):
        href = link.get('href')
        if href is None:
            # Anchor without a target (e.g. a named anchor): nothing to list.
            continue
        # str.replace matches literally, so each character is removed on its
        # own rather than through a regex-style '[\r\n]' pattern.
        text = link.text.replace('\r', '').replace('\n', '')
        entries.append('{ "%s", "%s" },\n' % (text, href))
    with codecs.open(filename + '.txt', 'w', 'utf-8') as f:
        f.write(''.join(entries))
# Script entry point: runs one hard-coded job when executed directly.
if __name__ == '__main__':
    page = r'd:\Android\ebook\work\expect\autoexpect.html'
    insertEbookTags(page)
    # Other one-off jobs, enabled by hand when needed:
    #stripTags()
    #makeEbook('d:\\Android\\ebook\\work\\advbash')
    #getLinks('d:\\Android\\ebook\\work\\advbash\\index.html')
    #getLinks('d:\\Android\\ebook\\bs4_index.html')