Created
March 4, 2012 21:33
-
-
Save remram44/1974886 to your computer and use it in GitHub Desktop.
UTF-8 decoder with fallback
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package common; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.Reader; | |
/**
 * A UTF-8 decoder that falls back to Latin-1.
 *
 * <p>The decoder reads UTF-8 until it finds an invalid sequence of bytes; from
 * that point on, every remaining byte up to the end of the stream (including
 * the bytes of the offending sequence) is decoded as Latin-1, i.e. mapped to
 * the character with the same numeric value.
 *
 * <p>A sequence is considered invalid when its lead byte is not a legal
 * 2-, 3- or 4-byte lead, when a continuation byte is missing or malformed, or
 * when it decodes to a value above U+10FFFF. Code points above U+FFFF are
 * returned as a surrogate pair.
 *
 * <p>This class performs no synchronization of its own.
 */
public class TextLoader extends Reader {
    /** Longest UTF-8 sequence accepted, in bytes. */
    private static final int MAX_SEQUENCE_LENGTH = 4;

    /** Value of {@link #m_PendingChar} when no character is waiting. */
    private static final int NO_PENDING_CHAR = -1;

    /** Value returned by {@link #decodeSequence} for an invalid sequence. */
    private static final int INVALID_SEQUENCE = -1;

    private final InputStream m_Reader;

    /** True while the input is still being interpreted as UTF-8. */
    private boolean m_UTF8 = true;

    /**
     * FIFO queue of bytes that were read while attempting to decode an invalid
     * UTF-8 sequence and must be re-read, in their original order, as Latin-1.
     */
    private final byte[] m_Waiting = new byte[MAX_SEQUENCE_LENGTH];
    private int m_WaitingPos = 0;
    private int m_NbWaiting = 0;

    /**
     * Low surrogate that did not fit in the caller's buffer on the previous
     * call to {@link #read(char[], int, int)}, or {@link #NO_PENDING_CHAR}.
     */
    private int m_PendingChar = NO_PENDING_CHAR;

    /**
     * Creates a decoder reading bytes from the given stream.
     *
     * @param reader the byte stream to decode; closed by {@link #close()}
     */
    public TextLoader(InputStream reader)
    {
        m_Reader = reader;
    }

    /**
     * Returns the next byte to decode: queued bytes first (oldest first), then
     * bytes from the underlying stream.
     *
     * @return the byte as an unsigned value, or -1 at the end of the stream
     */
    private int _read() throws IOException
    {
        if(m_NbWaiting > 0)
        {
            m_NbWaiting--;
            return m_Waiting[m_WaitingPos++] & 0xFF;
        }
        return m_Reader.read();
    }

    /**
     * Returns the total length of the UTF-8 sequence introduced by a lead byte.
     *
     * @param lead a byte with its high bit set, as an unsigned value
     * @return 2, 3 or 4, or 0 if the byte cannot start a sequence
     */
    private static int sequenceLength(int lead)
    {
        if((lead & 0xE0) == 0xC0) // 110x xxxx
            return 2;
        if((lead & 0xF0) == 0xE0) // 1110 xxxx
            return 3;
        if((lead & 0xF8) == 0xF0) // 1111 0xxx
            return 4;
        return 0;
    }

    /**
     * Gives up on UTF-8 and queues the bytes consumed so far so that they are
     * read again, in order, as Latin-1.
     *
     * The queue is necessarily empty here: it is only ever filled by this
     * method, which also switches UTF-8 decoding off for good.
     */
    private void fallBackToLatin1(byte[] bytes, int count)
    {
        m_UTF8 = false;
        System.arraycopy(bytes, 0, m_Waiting, 0, count);
        m_WaitingPos = 0;
        m_NbWaiting = count;
    }

    /**
     * Decodes the multi-byte UTF-8 sequence starting with {@code lead}.
     *
     * @param lead the first byte of the sequence (high bit set), unsigned
     * @param sequence scratch buffer of at least {@link #MAX_SEQUENCE_LENGTH} bytes
     * @return the decoded code point, or {@link #INVALID_SEQUENCE} if the
     *         sequence was invalid, in which case the decoder has switched to
     *         Latin-1 and the consumed bytes have been queued for re-reading
     */
    private int decodeSequence(int lead, byte[] sequence) throws IOException
    {
        sequence[0] = (byte)lead;
        int length = sequenceLength(lead);
        if(length == 0)
        {
            fallBackToLatin1(sequence, 1);
            return INVALID_SEQUENCE;
        }

        // The lead byte carries (7 - length) payload bits
        int codePoint = lead & (0xFF >> (length + 1));
        for(int i = 1; i < length; i++)
        {
            int c = _read();
            if(c == -1)
            {
                // Truncated sequence at the end of the stream
                fallBackToLatin1(sequence, i);
                return INVALID_SEQUENCE;
            }
            sequence[i] = (byte)c;
            if((c & 0xC0) != 0x80) // continuation bytes are 10xx xxxx
            {
                fallBackToLatin1(sequence, i + 1);
                return INVALID_SEQUENCE;
            }
            codePoint = (codePoint << 6) | (c & 0x3F);
        }

        if(codePoint > Character.MAX_CODE_POINT)
        {
            fallBackToLatin1(sequence, length);
            return INVALID_SEQUENCE;
        }
        return codePoint;
    }

    /**
     * Decodes characters into a portion of an array.
     *
     * @param cbuf destination buffer
     * @param off index in {@code cbuf} at which to start storing characters
     * @param len maximum number of characters to store
     * @return the number of characters stored, 0 if {@code len} is 0, or -1 if
     *         the end of the stream has been reached
     * @throws IOException if the underlying stream fails
     * @throws IndexOutOfBoundsException if {@code off} and {@code len} do not
     *         designate a valid range of {@code cbuf}
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException
    {
        if(off < 0 || len < 0 || len > cbuf.length - off)
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length);
        if(len == 0)
            return 0;

        int pos = 0;

        // Second half of a surrogate pair left over from the previous call
        if(m_PendingChar != NO_PENDING_CHAR)
        {
            cbuf[off + pos++] = (char)m_PendingChar;
            m_PendingChar = NO_PENDING_CHAR;
        }

        byte[] sequence = new byte[MAX_SEQUENCE_LENGTH];
        int c;
        while(pos < len && (c = _read()) != -1)
        {
            // ASCII is identical in both encodings; once UTF-8 has been given
            // up, every byte is a Latin-1 character
            if((c & 0x80) == 0 || !m_UTF8)
            {
                cbuf[off + pos++] = (char)c;
                continue;
            }

            int codePoint = decodeSequence(c, sequence);
            if(codePoint == INVALID_SEQUENCE)
                continue; // the bytes were queued and come back as Latin-1

            if(codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT)
                cbuf[off + pos++] = (char)codePoint;
            else
            {
                char[] pair = Character.toChars(codePoint);
                cbuf[off + pos++] = pair[0];
                if(pos < len)
                    cbuf[off + pos++] = pair[1];
                else
                    m_PendingChar = pair[1]; // no room left: deliver on the next call
            }
        }
        return (pos > 0) ? pos : -1;
    }

    /**
     * Closes the underlying stream.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException
    {
        m_Reader.close();
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment