Search code examples
javacharacter-encodingnioutf8-decode

Unable to convert a Shift_JIS-encoded file to UTF-8 using java.nio


I am trying to read a file that is encoded in Shift_JIS and convert it to UTF-8. When I call java.nio's CharsetDecoder.decode, it throws the following error, and I am not able to pinpoint the actual cause of the issue.

java.nio.charset.UnmappableCharacterException: Input length = 2
java.nio.charset.UnmappableCharacterException: Input length = 2
    at java.nio.charset.CoderResult.throwException(CoderResult.java:278)
    at java.nio.charset.CharsetDecoder.decode(CharsetDecoder.java:798)
    at CharacterSetConversionUtility.getString(CharacterSetConversionUtility.java:23)
    at CharacterSetConversionUtility.convertBetweenEncodings(CharacterSetConversionUtility.java:39)
    at CharacterSetConversionUtility.main(CharacterSetConversionUtility.java:94)

Below is the code snippet

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.charset.CharsetDecoder;

import org.mozilla.universalchardet.UniversalDetector;  
 public class CharacterSetConversionUtility
 {
    /**
     * Decodes raw bytes into a {@code String} using the named character set.
     *
     * <p>A freshly created decoder is used with its default settings, so malformed
     * or unmappable input is reported as an exception rather than silently replaced.
     *
     * @param charSet name of the charset the bytes are encoded in, e.g. {@code "Shift_JIS"}
     * @param bytes   the encoded input
     * @return the decoded text
     * @throws CharacterCodingException if the bytes cannot be decoded with {@code charSet}
     */
    public static String getString(String charSet, byte[] bytes) throws CharacterCodingException
    {
        final ByteBuffer encoded = ByteBuffer.wrap(bytes);
        final CharsetDecoder strictDecoder = Charset.forName(charSet).newDecoder();
        return strictDecoder.decode(encoded).toString();
    }

/**
 * Encodes text into bytes using the named character set.
 *
 * <p>A freshly created encoder is used with its default settings, so characters
 * that cannot be represented in the target charset are reported as an exception.
 *
 * @param charSet name of the target charset, e.g. {@code "UTF-8"}
 * @param input   the text to encode
 * @return exactly the encoded bytes, with no padding
 * @throws CharacterCodingException if {@code input} cannot be encoded in {@code charSet}
 */
public static byte[] convertToEncoding(String charSet, String input) throws CharacterCodingException
{
    CharBuffer buffer = CharBuffer.wrap(input);
    Charset charset = Charset.forName(charSet);
    CharsetEncoder encoder = charset.newEncoder();
    ByteBuffer output = encoder.encode(buffer);

    // Do not return output.array(): the backing array is sized from an estimate and
    // is usually larger than the encoded content, which would append stray zero
    // bytes to the result. Copy only the bytes between position and limit.
    byte[] encoded = new byte[output.remaining()];
    output.get(encoded);
    return encoded;
}

/**
 * Re-encodes bytes from one character set into another by decoding them to a
 * {@code String} and encoding that string again.
 *
 * @param originalBytes the input, encoded in {@code sourceCharSet}
 * @param sourceCharSet name of the charset of {@code originalBytes}
 * @param destCharSet   name of the charset to produce
 * @return the text of {@code originalBytes} encoded in {@code destCharSet}
 * @throws CharacterCodingException if decoding or encoding fails
 */
public static byte[] convertBetweenEncodings(byte[] originalBytes, String sourceCharSet, String destCharSet)
        throws CharacterCodingException
{
    return convertToEncoding(destCharSet, getString(sourceCharSet, originalBytes));
}

/**
 * Detects the character encoding of a file by feeding its content to a
 * {@link UniversalDetector}.
 *
 * <p>The outcome is also printed to standard output. I/O failures are printed as
 * a stack trace and surface to the caller as a {@code null} result.
 *
 * @param fileName path of the file to inspect
 * @return the charset name reported by the detector, or {@code null} if nothing was
 *         detected or the file could not be read
 */
public static String getCharacterEncoding(String fileName){
    byte[] buf = new byte[4096];
    String encoding = null;
    // try-with-resources so the stream is closed on every path; it used to leak.
    try (java.io.FileInputStream fis = new java.io.FileInputStream(fileName)) {

        // (1) Create the detector.
        UniversalDetector detector = new UniversalDetector(null);

        // (2) Feed it chunks until the file ends or the detector reports it is done.
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
          detector.handleData(buf, 0, nread);
        }
        // (3) Tell the detector that no more data will follow.
        detector.dataEnd();

        // (4) Fetch the result; null means no encoding was detected.
        encoding = detector.getDetectedCharset();
        if (encoding != null) {
          System.out.println("Detected encoding = " + encoding);
        } else {
          System.out.println("No encoding detected.");
        }

        // (5) Reset the detector.
        detector.reset();

    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return encoding;
}


/**
 * Reads the sample file, detects its encoding and writes a UTF-8 copy.
 *
 * <p>I/O and character-coding failures are printed as a stack trace. If no
 * encoding can be detected, a message is printed and nothing is written.
 *
 * <p>NOTE(review): decoding is strict, so a file whose detected charset name does
 * not exactly match its real encoding fails with an
 * {@code UnmappableCharacterException}. Files reported as Shift_JIS may actually
 * be windows-31j (MS932); confirm against the input.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String inputFile = "E:/Encoding Issue/SJISFile";
    Path path = Paths.get(inputFile);
    try {
        byte[] inputdata = Files.readAllBytes(path);
        //Detect the character encoding of the input data
        String inputCharEncoding = getCharacterEncoding(inputFile);
        if (inputCharEncoding == null) {
            // Charset.forName(null) would throw an unchecked exception; stop cleanly instead.
            System.err.println("Could not detect the encoding of " + inputFile + "; nothing converted.");
            return;
        }
        //Perform a character set conversion
        byte[] outputdata = convertBetweenEncodings(inputdata, inputCharEncoding, "UTF-8");
        // try-with-resources so the output file is closed even if the write fails.
        try (FileOutputStream fos = new FileOutputStream("E:/Encoding Issue/convertedutf8.txt")) {
            fos.write(outputdata);
        }

    } catch (IOException e) {
        e.printStackTrace();
    }
}

Solution

  • I don't have a definitive answer as to the cause of the failure you experienced, although I suspect it lies somewhere in the conversion between String/char[]/byte[]. That said, I'd like to offer a simpler and more compact working solution to the problem at hand: it relies on the conversion functionality offered by the String class itself (String.getBytes with the target charset) instead of using encoders/decoders directly. This will work for the Shift_JIS charset or any other one. There is nothing wrong with using UniversalDetector, but I omitted it for simplicity's sake and hard-coded the source character set instead. Finally, this version is Java SE 1.6 compatible.

    Hope it helps :)


    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStream;
    import java.nio.CharBuffer;
    import java.nio.charset.Charset;
    
    /**
     * Converts a Shift_JIS encoded text file to UTF-8.
     *
     * <p>The conversion itself lives in {@link #transcode}, which works on arbitrary
     * streams and charsets; {@link #doIt()} applies it to the sample file paths.
     */
    public class JapaneseCharsetTest {

        /** Maximum number of chars read from the source per chunk. */
        public static final int CHAR_LENGTH_TO_READ = 8192;

        public static void main(String[] args) {
            JapaneseCharsetTest test = new JapaneseCharsetTest();
            test.doIt();
        }

        /**
         * Converts the sample Shift_JIS file to the sample UTF-8 target file.
         * I/O failures are printed as a stack trace.
         */
        public void doIt() {

            FileInputStream in = null;
            FileOutputStream out = null;
            try {
                in = new FileInputStream(new File("C:/Path/To/My/ShiftJISFile.txt"));
                // Resolve the source charset before the target file is created/truncated.
                Charset source = Charset.forName("Shift_JIS");
                out = new FileOutputStream("C:/Path/To/My/UTF8TargetFile.txt");
                transcode(in, source, out, Charset.forName("UTF-8"));
            }
            catch (IOException e) {
                e.printStackTrace();
            }
            finally {
                // Best-effort close: a failure here leaves nothing to recover.
                try {if (in != null) in.close();} catch (IOException ignored) {}
                try {if (out != null) out.close();} catch (IOException ignored) {}
            }
        }

        /**
         * Copies text from {@code in} to {@code out}, converting it from
         * {@code sourceCharset} to {@code targetCharset} chunk by chunk.
         *
         * <p>Neither stream is closed; that remains the caller's responsibility.
         *
         * @param in            stream of bytes encoded in {@code sourceCharset}
         * @param sourceCharset charset of the input
         * @param out           stream receiving the bytes encoded in {@code targetCharset}
         * @param targetCharset charset of the output
         * @throws IOException if reading or writing fails
         */
        public static void transcode(InputStream in, Charset sourceCharset,
                                     OutputStream out, Charset targetCharset) throws IOException {
            InputStreamReader reader = new InputStreamReader(in, sourceCharset);
            char[] chunk = new char[CHAR_LENGTH_TO_READ];
            int len;
            while ((len = reader.read(chunk)) != -1) {
                // A read may fill only part of the buffer, so convert just the first
                // len chars rather than the whole array.
                out.write(new String(chunk, 0, len).getBytes(targetCharset));
            }
        }
    }