2020년 11월 15일 일요일

[python] ebook maker

 

ebook maker

import os
import sys
import codecs
import bs4

class html():
    """Small convenience wrapper around a BeautifulSoup document.

    The raw file text is kept in ``self.html`` and the parsed tree in
    ``self.bs``.  Both are ``None`` until :meth:`open` has been called
    (either directly or through the ``filename`` constructor argument).
    """

    def __init__(self, filename=None):
        """Create the wrapper and, if *filename* is given, load it."""
        self.html = None
        self.bs = None
        if filename:
            self.open(filename)

    def open(self, filename):
        """Read *filename* as UTF-8 and parse it with ``html.parser``."""
        with codecs.open(filename, 'r', encoding='utf8') as f:
            self.html = f.read()
        self.bs = bs4.BeautifulSoup(self.html, 'html.parser')

    def tags(self, tag, attrs=None):
        """Return every element named *tag*, optionally filtered by *attrs*.

        Example: ``tags('div', {'id': 'main'})``.
        """
        if attrs is None:
            return self.bs.find_all(tag)
        return self.bs.find_all(tag, attrs)

    def tag(self, tag):
        """Return the elements matching the CSS selector *tag*.

        Accepts selectors such as ``a.class``, ``a#id`` or ``a``.
        """
        return self.bs.select(tag)

    def remove(self, css):
        """Detach every element matching the CSS selector *css*."""
        # Iterating an empty result is a no-op, so no emptiness check needed.
        for node in self.bs.select(css):
            node.extract()

    def removeTags(self, tag):
        """Detach every element named *tag* and return the detached nodes."""
        return [node.extract() for node in self.bs.find_all(tag)]

    def save(self, filename):
        """Write the current document markup to *filename* as UTF-8."""
        with codecs.open(filename, 'w', encoding='utf8') as f:
            f.write(self.bs.decode_contents())

    def appendHead(self, tag, text, attrs=None):
        """Append a new *tag* element to the document's ``<head>``.

        *text*, when truthy, becomes the element's string content; *attrs*
        is an optional dict of attributes, e.g. ``{'rel': 'stylesheet'}``.
        Raises AttributeError if the document has no ``<head>``.
        """
        new_tag = self.bs.new_tag(tag, **(attrs or {}))
        if text:
            new_tag.string = text
        self.bs.head.append(new_tag)

    def wrapBody(self, tag, attrs=None):
        """Wrap the whole content of ``<body>`` in a new *tag* element.

        *attrs* is an optional dict of attributes for the wrapper, e.g.
        ``{'class': 'main'}``.  Raises AttributeError if the document has
        no ``<body>``.
        """
        wrapper = self.bs.new_tag(tag, **(attrs or {}))
        # Snapshot the children first: clear() detaches them from the body.
        body_children = list(self.bs.body.children)
        self.bs.body.clear()
        self.bs.body.append(wrapper)
        for child in body_children:
            wrapper.append(child)
            
def files(dirname):
    """Return the paths of the ``.html`` entries directly inside *dirname*.

    Each path is *dirname* joined with the entry name.  Only the exact
    lower-case extension ``.html`` matches.  The result is sorted by entry
    name so that batch runs process pages in a reproducible order
    (``os.listdir`` itself returns entries in arbitrary order).
    """
    return [
        os.path.join(dirname, entry)
        for entry in sorted(os.listdir(dirname))
        if os.path.splitext(entry)[1] == '.html'
    ]
    
def stripTags(dirname=r'D:/Temp2/4'):
    """Strip site chrome from every ``.html`` file in *dirname*.

    For each file, the elements matching the selectors below (embedded
    frames, scripts, navigation, ads, footer) are removed and the result is
    written next to the original as ``<file>.htm``.  *dirname* defaults to
    the directory this script was originally written for.
    """
    # CSS selectors of the elements to drop, removed in this order.
    selectors = (
        'iframe',
        'script',
        'div.navbar',
        'div.articleTopAd',
        'li.previous',
        'li.next',
        'div#footerDiv',
        'nav.col-md-3',
    )
    for f in files(dirname):
        print(f)
        h = html(f)
        for css in selectors:
            h.remove(css)
        h.save(f + '.htm')
        
def insertEbookTags(f):
    """Add the ebook stylesheets/scripts to page *f* and wrap its body.

    Three stylesheet links, the highlight.js script and its init snippet
    are appended to ``<head>``.  The body content is then wrapped in nested
    ``div`` elements and the result is saved as ``<f>.htm``.
    """
    h = html(f)

    for href in ('../css/bootstrap.min.css',
                 '../css/styles.css',
                 '../css/highlight.pack.css'):
        h.appendHead('link', None, { 'href': href, 'rel': 'stylesheet' })
    h.appendHead('script', None, { 'src': '../js/highlight.pack.js' })
    # The outer literal must use double quotes: with single quotes the inner
    # '    ' ends the string, Python concatenates the two adjacent pieces, and
    # the emitted JavaScript becomes the invalid `{tabReplace: }`.
    h.appendHead('script', "hljs.configure({tabReplace: '    '}); hljs.initHighlightingOnLoad();", {})

    # Each call wraps the current body content, so the last class listed
    # ends up outermost: container-fluid > row > col-xs-12 > main.
    for css_class in ('main', 'col-xs-12', 'row', 'container-fluid'):
        h.wrapBody('div', { 'class': css_class })

    h.save(f + '.htm')
            
def makeEbook(directory):
    """Run insertEbookTags on every ``.html`` page found in *directory*.

    The path of each page is printed once that page has been processed.
    """
    pages = files(directory)
    for page in pages:
        insertEbookTags(page)
        # Report the page only after its conversion has finished.
        print(page)

def getLinks(filename):
    """Dump the links of page *filename* to ``<filename>.txt``.

    One line is written per ``<a>`` element that has an ``href``
    attribute, in the form ``{ "link text", "href" },``.  Carriage returns
    and line feeds are removed from the link text so that every entry stays
    on a single line.  Anchors without ``href`` are skipped.
    """
    h = html(filename)
    data = []
    for link in h.tags('a'):
        href = link.get('href')
        if href is None:
            # Named anchors and the like carry no target; nothing to list.
            continue
        # str.replace matches literally, so strip each character explicitly.
        text = link.text.replace('\r', '').replace('\n', '')
        data.append('{ "' + text + '", "' + href + '" },\n')
    with codecs.open(filename + '.txt', 'w', 'utf-8') as f:
        f.write(''.join(data))
        
# Script entry point: runs only when the file is executed directly.
# Exactly one helper call is active; the commented-out lines are alternative
# invocations against other local paths, kept for manual switching.
if __name__ == '__main__':
    #stripTags()
    insertEbookTags(r'd:\Android\ebook\work\expect\autoexpect.html')
    #makeEbook('d:\\Android\\ebook\\work\\advbash')
    #getLinks('d:\\Android\\ebook\\work\\advbash\\index.html')
    #getLinks('d:\\Android\\ebook\\bs4_index.html')

댓글 없음:

댓글 쓰기