Skip to content

Instantly share code, notes, and snippets.

@remram44
Created March 4, 2012 21:33
Show Gist options
  • Save remram44/1974886 to your computer and use it in GitHub Desktop.
Save remram44/1974886 to your computer and use it in GitHub Desktop.
UTF-8 decoder with fallback
package common;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
/**
 * A UTF-8 decoder that falls back to Latin-1.
 *
 * The decoder reads UTF-8 until it finds an invalid sequence of bytes; from
 * that point on (including the bytes of the offending sequence) every byte is
 * decoded as Latin-1, i.e. mapped to the character with the same value, until
 * the end of the stream. The switch is permanent: once in Latin-1 mode the
 * decoder never goes back to UTF-8.
 *
 * A sequence is considered invalid when it starts with a continuation byte or
 * with a byte in the range 0xF8-0xFF, when a continuation byte is missing
 * (wrong byte or end of stream), or when it encodes an overlong form, a
 * surrogate (U+D800-U+DFFF) or a value above U+10FFFF.
 */
public class TextLoader extends Reader
{
    /** Longest UTF-8 sequence accepted, in bytes. */
    private static final int MAX_SEQUENCE = 4;

    /**
     * Smallest code point that legitimately needs a sequence of the given
     * length (index = length in bytes); anything lower is an overlong form.
     */
    private static final int[] MIN_CODE_POINT = {0, 0, 0x80, 0x800, 0x10000};

    /** The underlying byte stream. */
    private final InputStream m_Reader;

    /** True while the input is still decoded as UTF-8. */
    private boolean m_UTF8 = true;

    /**
     * Scratch space for the sequence being decoded; after a fallback it holds
     * the bytes of the rejected sequence, which are then handed out again in
     * their original order by {@link #_read()}.
     */
    private final byte[] m_Waiting = new byte[MAX_SEQUENCE];

    /** Index in {@code m_Waiting} of the next byte to hand out. */
    private int m_WaitingPos = 0;

    /** Number of bytes of {@code m_Waiting} still to be handed out. */
    private int m_NbWaiting = 0;

    /**
     * Low surrogate that did not fit in the caller's buffer on the previous
     * call to {@code read}, or -1 if there is none.
     */
    private int m_PendingLow = -1;

    /**
     * Creates a decoder reading from the given byte stream.
     *
     * @param reader the stream to decode; closed by {@link #close()}
     * @throws NullPointerException if {@code reader} is null
     */
    public TextLoader(InputStream reader)
    {
        if(reader == null)
            throw new NullPointerException("reader");
        m_Reader = reader;
    }

    /**
     * Returns the next byte to decode: first the bytes queued by a fallback
     * (oldest first), then the bytes of the underlying stream.
     *
     * @return the byte as a value in 0-255, or -1 at the end of the stream
     */
    private int _read() throws IOException
    {
        if(m_NbWaiting > 0)
        {
            m_NbWaiting--;
            return m_Waiting[m_WaitingPos++] & 0xFF;
        }
        return m_Reader.read();
    }

    /**
     * Returns the length announced by a non-ASCII lead byte.
     *
     * @param lead a byte value in 0x80-0xFF
     * @return 2, 3 or 4, or 0 if the byte cannot start a sequence
     */
    private static int sequenceLength(int lead)
    {
        if((lead & 0xE0) == 0xC0) // 110x xxxx
            return 2;
        if((lead & 0xF0) == 0xE0) // 1110 xxxx
            return 3;
        if((lead & 0xF8) == 0xF0) // 1111 0xxx
            return 4;
        return 0; // continuation byte, or 5/6-byte form
    }

    /**
     * Decodes the multi-byte sequence starting with the given lead byte.
     *
     * If the sequence is invalid the decoder switches to Latin-1 for good and
     * every byte consumed so far (lead byte included) is queued so that
     * {@link #_read()} returns it again.
     *
     * @param lead the first byte of the sequence, in 0x80-0xFF
     * @return the decoded code point, or -1 if the decoder fell back
     */
    private int decodeSequence(int lead) throws IOException
    {
        int expected = sequenceLength(lead);
        boolean valid = expected != 0;
        // Payload bits of the lead byte: drop the length marker and its 0 bit
        int codePoint = lead & (0xFF >> (expected + 1));
        // Nothing is queued while in UTF-8 mode, so the queue doubles as
        // scratch space for the bytes of the current sequence
        int nbRead = 0;
        m_Waiting[nbRead++] = (byte)lead;

        while(valid && nbRead < expected)
        {
            int c = _read();
            if(c == -1)
                valid = false;
            else
            {
                m_Waiting[nbRead++] = (byte)c;
                if((c & 0xC0) == 0x80) // 10xx xxxx
                    codePoint = (codePoint << 6) | (c & 0x3F);
                else
                    valid = false;
            }
        }

        if(valid)
        {
            boolean overlong = codePoint < MIN_CODE_POINT[expected];
            boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
            valid = !overlong && !surrogate && codePoint <= 0x10FFFF;
        }
        if(valid)
            return codePoint;

        // It wasn't valid UTF-8: replay each already read byte as Latin-1
        m_UTF8 = false;
        m_WaitingPos = 0;
        m_NbWaiting = nbRead;
        return -1;
    }

    /**
     * Reads characters into a portion of an array, blocking until
     * {@code len} characters have been stored or the stream ends.
     *
     * @param cbuf destination buffer
     * @param off offset in {@code cbuf} at which to start storing characters
     * @param len maximum number of characters to read
     * @return the number of characters stored, 0 if {@code len} is 0, or -1
     *         if the end of the stream was reached before any was stored
     * @throws IndexOutOfBoundsException if {@code off} or {@code len} is
     *         negative, or {@code len} exceeds {@code cbuf.length - off}
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException
    {
        if(off < 0 || len < 0 || len > cbuf.length - off)
            throw new IndexOutOfBoundsException();
        if(len == 0)
            return 0;

        int pos = 0;
        while(pos < len)
        {
            // Second half of a surrogate pair split across two calls
            if(m_PendingLow != -1)
            {
                cbuf[off + pos++] = (char)m_PendingLow;
                m_PendingLow = -1;
                continue;
            }

            int c = _read();
            if(c == -1)
                break;

            if((c & 0x80) == 0 || !m_UTF8) // ASCII, or Latin-1 mode
            {
                cbuf[off + pos++] = (char)c;
                continue;
            }

            int codePoint = decodeSequence(c);
            if(codePoint == -1)
                continue; // fell back; the bytes come back through _read()

            if(codePoint < 0x10000)
                cbuf[off + pos++] = (char)codePoint;
            else
            {
                // Outside the BMP: encode as a UTF-16 surrogate pair
                int offset = codePoint - 0x10000;
                char high = (char)(0xD800 + (offset >> 10));
                char low = (char)(0xDC00 + (offset & 0x3FF));
                cbuf[off + pos++] = high;
                if(pos < len)
                    cbuf[off + pos++] = low;
                else
                    m_PendingLow = low;
            }
        }
        return (pos > 0) ? pos : -1;
    }

    /**
     * Closes the underlying stream.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException
    {
        m_Reader.close();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment