2021년 8월 22일 일요일

[Java] Change Encoding to UTF-8


Change Encoding to UTF-8

package com.zdiv.jlib.app.CharConv;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

import org.apache.commons.io.IOUtils;

/**
 * Command-line tool that re-encodes every regular file directly inside one
 * directory from CP949 to UTF-8.
 *
 * <p>For each input file {@code X} the result is written next to it as
 * {@code X.out}. Sub-directories are skipped. Because the outputs live in the
 * same directory as the inputs, a second run over the same directory will also
 * pick up the {@code *.out} files produced by the first run.
 */
public class CharConv {

    /** Directory processed when no command-line argument is supplied. */
    private static final String DEFAULT_DIRECTORY = "d:\\Ebook\\가림토txt\\객주\\";

    /** Charset the input files are decoded with. */
    private static final String SOURCE_CHARSET = "CP949";

    /** Charset the output files are encoded with. */
    private static final String TARGET_CHARSET = "UTF-8";

    /**
     * Converts all regular files in the target directory.
     *
     * @param args optional; {@code args[0]} overrides the default directory
     */
    public static void main(String[] args) {
        String directory = (args != null && args.length > 0) ? args[0] : DEFAULT_DIRECTORY;
        File dir = new File(directory);

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] files = dir.listFiles();
        if (files == null) {
            System.err.println("Not a readable directory: " + directory);
            return;
        }

        for (File f : files) {
            if (f.isDirectory()) {
                continue;
            }
            String outFileName = f.getAbsolutePath() + ".out";
            System.out.println(outFileName);
            try {
                convert(f, outFileName);
            } catch (IOException e) {
                // Keep going with the remaining files; report which one failed.
                System.err.println("Failed to convert " + f.getAbsolutePath());
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads {@code source} as CP949 text and writes it to {@code outFileName} as UTF-8.
     *
     * @param source      file to read
     * @param outFileName path of the file to create or replace
     * @throws IOException if the file cannot be read or written, or a charset is unsupported
     */
    private static void convert(File source, String outFileName) throws IOException {
        // readAllBytes opens and closes the file itself, so no stream is left open.
        byte[] raw = Files.readAllBytes(source.toPath());
        String dataStr = new String(raw, SOURCE_CHARSET);
        byte[] dataByte = dataStr.getBytes(TARGET_CHARSET);
        // TRUNCATE_EXISTING: without it, a longer pre-existing output file would
        // keep its old trailing bytes after the new content.
        Files.write(Paths.get(outFileName), dataByte,
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING,
                StandardOpenOption.WRITE);
    }
}

2021년 8월 18일 수요일

[Python] Any to UTF-8

Convert files from any detected encoding to UTF-8

#-*- coding: utf-8 -*-

import sys    
import os

def file_enc(path):
    """Guess the text encoding of the file at ``path``.

    The whole file is read as bytes and passed to ``chardet.detect``; the
    value stored under the ``'encoding'`` key of its result is returned
    (``None`` if that key is absent).
    """
    import chardet  # imported inside the function, as in the rest of this script
    handle = open(path, "rb")
    try:
        raw = handle.read()
    finally:
        handle.close()
    guess = chardet.detect(raw)
    return guess.get('encoding')
    
def py2_euc2utf(in_file, out_file):
    """Re-encode ``in_file`` from CP949 to UTF-8 and write it to ``out_file``.

    Args:
        in_file: path of the CP949-encoded input file.
        out_file: path of the UTF-8 file to create (overwritten if present).

    Raises:
        UnicodeDecodeError: if the input is not valid CP949. Decoding happens
            before the output is opened, so no output file is created then.
        IOError/OSError: if either file cannot be opened.
    """
    # Binary mode on both sides: the data is handled as raw bytes, so no
    # newline translation is applied and the same code runs on Python 2 and 3.
    with open(in_file, "rb") as f:
        euc = f.read().decode('cp949')
    utf = euc.encode('utf-8')
    with open(out_file, "wb") as w:
        w.write(utf)

def py3_euc2utf(in_file, out_file):
    """Rewrite ``in_file`` as UTF-8 into ``out_file`` (Python 3 only).

    The input is decoded with whatever encoding ``file_enc`` reports for it.
    If that is ``None``, ``open`` falls back to its default encoding.

    Args:
        in_file: path of the input text file.
        out_file: path of the UTF-8 file to create (overwritten if present).
    """
    # newline="" turns off newline translation on both read and write, so the
    # line endings of the input are carried over unchanged; only the encoding
    # differs between input and output.
    with open(in_file, "r", encoding=file_enc(in_file), newline="") as f:
        utf = f.read()
    with open(out_file, "w", encoding="utf-8", newline="") as w:
        w.write(utf)

def euc2utf(in_file, out_file):
    """Convert ``in_file`` using the converter that matches the running interpreter.

    On Python 3 the work is done by ``py3_euc2utf`` and the output path gets a
    ``.py3`` suffix; on any other major version ``py2_euc2utf`` is used and the
    suffix is ``.py2``.
    """
    running_py3 = sys.version_info.major == 3
    if running_py3:
        py3_euc2utf(in_file, out_file + '.py3')
    else:
        py2_euc2utf(in_file, out_file + '.py2')

    
def filelist(path):
    """Return the names of the regular files directly inside ``path``.

    Entries are returned in the order ``os.listdir`` yields them; anything
    that is not a file (for example a sub-directory) is left out.
    """
    names = []
    for entry in os.listdir(path):
        if os.path.isfile(os.path.join(path, entry)):
            names.append(entry)
    return names

if __name__ == "__main__":
    # Convert every regular file directly inside in_dir and write the results
    # into out_dir (created on first use).
    in_dir = r"d:/Ebook/가림토txt"
    # Keep the slash after the drive letter: "d:Ebook/..." would be a
    # drive-relative path, resolved against the current directory of drive D.
    out_dir = r"d:/Ebook/가림토txt_utf"
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    files = filelist(in_dir)
    for f in files:
        src = os.path.join(in_dir, f)
        try:
            # Detection is inside the try so one unreadable file does not
            # abort the whole batch.
            print(f, file_enc(src))
            euc2utf(src, os.path.join(out_dir, f))
        except Exception as e:
            # Exception (not a bare except) lets Ctrl-C still stop the run;
            # the error itself is printed so failures can be diagnosed.
            print(f, "--------------> ERROR", e)