2021년 8월 18일 수요일

[Python] Any to UTF-8

 Convert text files from any detected encoding to UTF-8

#-*- coding: utf-8 -*-

import sys    
import os

def file_enc(path):
    """Guess the text encoding of the file at *path* using chardet.

    The whole file is read as bytes and handed to ``chardet.detect``; the
    value stored under the result's ``'encoding'`` key is returned
    (``None`` when that key is absent).
    """
    import chardet

    with open(path, "rb") as stream:
        raw_bytes = stream.read()
        detection = chardet.detect(raw_bytes)
        return detection.get('encoding')
    
def py2_euc2utf(in_file, out_file):
    """Re-encode a CP949 (EUC-KR superset) text file as UTF-8.

    Both files are opened in binary mode and the conversion is done with
    explicit ``decode``/``encode`` calls. This keeps the function working
    on Python 2 and Python 3 alike, and avoids text-mode side effects on
    Windows (newline rewriting, reading stopping at a Ctrl-Z byte).

    Args:
        in_file: Path of the CP949-encoded input file.
        out_file: Path of the UTF-8 file to create or overwrite.

    Raises:
        UnicodeDecodeError: If the input is not valid CP949. The output
            file is not touched in that case because decoding happens
            before it is opened.
    """
    with open(in_file, "rb") as src:
        text = src.read().decode('cp949')
    with open(out_file, "wb") as dst:
        dst.write(text.encode('utf-8'))

def py3_euc2utf(in_file, out_file):
    """Re-encode a text file as UTF-8, detecting its encoding first.

    The source encoding comes from ``file_enc``. Two cases are redirected
    to ``cp949`` (the codec the Python 2 code path uses):

    * detection returned nothing -- passing ``encoding=None`` to ``open``
      would silently use the platform's locale encoding instead;
    * detection returned EUC-KR -- ``cp949`` is a superset of it, so files
      containing CP949 extension characters decode instead of failing.

    Args:
        in_file: Path of the input text file.
        out_file: Path of the UTF-8 file to create or overwrite.

    Raises:
        UnicodeDecodeError: If the file cannot be decoded with the chosen
            encoding. The output file is not touched in that case.
    """
    detected = file_enc(in_file)
    if not detected or detected.lower().replace('_', '-') == 'euc-kr':
        source_encoding = 'cp949'
    else:
        source_encoding = detected

    with open(in_file, "r", encoding=source_encoding) as f:
        utf = f.read()
    with open(out_file, "w", encoding="utf-8") as w:
        w.write(utf)

def euc2utf(in_file, out_file):
    """Convert *in_file* to UTF-8 with the converter for this interpreter.

    The output path gets a ``.py3`` suffix when running on Python 3 and a
    ``.py2`` suffix on any other major version.
    """
    if sys.version_info.major != 3:
        py2_euc2utf(in_file, out_file + '.py2')
        return
    py3_euc2utf(in_file, out_file + '.py3')

    
def filelist(path):
    """Return the names of the regular files directly inside *path*.

    Sub-directories and other non-file entries are skipped. Names are
    returned without the directory prefix, in ``os.listdir`` order.
    """
    import os.path

    names = []
    for entry in os.listdir(path):
        if os.path.isfile(os.path.join(path, entry)):
            names.append(entry)
    return names

if __name__ == "__main__":
    # Batch-convert every regular file in in_dir, writing the UTF-8 copies
    # under the same file names (plus a version suffix) into out_dir.
    in_dir = r"d:/Ebook/가림토txt"
    # Fixed: the slash after the drive letter was missing ("d:Ebook/..."),
    # which Windows resolves relative to the current directory on drive D:
    # instead of the drive root used by in_dir.
    out_dir = r"d:/Ebook/가림토txt_utf"
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    files = filelist(in_dir)
    for f in files:
        src_path = os.path.join(in_dir, f)
        try:
            # The encoding probe is inside the try so one unreadable file
            # is reported and skipped rather than aborting the whole batch.
            print(f, file_enc(src_path))
            euc2utf(src_path, os.path.join(out_dir, f))
        except Exception as e:
            # Exception (not a bare except) lets Ctrl-C / SystemExit stop
            # the run; the cause is printed to make failures diagnosable.
            print(f, "--------------> ERROR", e)
    

댓글 없음:

댓글 쓰기