remram44 · March 4, 2012 21:33
diff --git a/TextLoader.java b/TextLoader.java

 package common;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;

 /**
  * A UTF-8 decoder that falls back to Latin-1.
  *
  * The decoder tries to read UTF-8 until it finds an invalid sequence of bytes;
  * from then on every byte, including the bytes of the offending sequence, is
  * returned as the Latin-1 character with the same value, to the end of the
  * stream.
  *
  * Code points above U+FFFF are returned as a surrogate pair. A sequence that
  * decodes to a value above U+10FFFF counts as invalid. Overlong encodings are
  * accepted.
  */
 public class TextLoader extends Reader {

     private final InputStream m_Reader;
     // True while the input still looks like UTF-8; never set back to true
     private boolean m_UTF8 = true;
     // Characters already produced but not yet handed to the caller, kept in
     // delivery order in m_Waiting[m_WaitingPos .. m_NbWaiting-1]. Six slots
     // are enough: the longest sequence that is attempted has six bytes.
     private final char[] m_Waiting = new char[6];
     private int m_WaitingPos = 0;
     private int m_NbWaiting = 0;

     /**
      * Creates a decoder reading bytes from the given stream.
      *
      * @param reader the byte stream to decode; closed by {@link #close()}
      */
     public TextLoader(InputStream reader)
     {
         m_Reader = reader;
     }

     /**
      * Returns the total length of the UTF-8 sequence announced by a lead byte.
      *
      * @param lead a byte value in the range 0x80-0xFF
      * @return the sequence length (2 to 6), or 0 if the byte cannot start a
      *         sequence (a continuation byte, 0xFE or 0xFF)
      */
     private static int sequenceLength(int lead)
     {
         if((lead & 0xE0) == 0xC0) // 110x xxxx
             return 2;
         if((lead & 0xF0) == 0xE0) // 1110 xxxx
             return 3;
         if((lead & 0xF8) == 0xF0) // 1111 0xxx
             return 4;
         if((lead & 0xFC) == 0xF8) // 1111 10xx
             return 5;
         if((lead & 0xFE) == 0xFC) // 1111 110x
             return 6;
         return 0;
     }

     /**
      * Decodes the multi-byte sequence starting with the given lead byte and
      * stores the resulting characters in the waiting buffer.
      *
      * On success the buffer holds the decoded character (one or two UTF-16
      * units). On failure m_UTF8 is cleared and the buffer holds every byte
      * consumed so far, in stream order, as Latin-1 characters.
      *
      * @param lead the first byte of the sequence, in the range 0x80-0xFF
      * @throws IOException if the underlying stream fails
      */
     private void decode(int lead) throws IOException
     {
         m_WaitingPos = 0;
         m_NbWaiting = 0;

         // The raw bytes are recorded as they are consumed so that they can be
         // returned unchanged if the sequence turns out to be invalid
         m_Waiting[m_NbWaiting++] = (char)lead;

         int length = sequenceLength(lead);
         if(length == 0)
         {
             m_UTF8 = false;
             return;
         }

         // Payload bits of the lead byte: everything after the length marker
         int unicode = lead & (0xFF >> (length + 1));
         for(int i = 1; i < length; i++)
         {
             int c = m_Reader.read();
             if(c == -1)
             {
                 // Truncated sequence at the end of the stream
                 m_UTF8 = false;
                 return;
             }
             m_Waiting[m_NbWaiting++] = (char)c;
             if((c & 0xC0) != 0x80)
             {
                 // Not a continuation byte (10xx xxxx)
                 m_UTF8 = false;
                 return;
             }
             unicode = (unicode << 6) | (c & 0x3F);
         }

         if(!Character.isValidCodePoint(unicode))
         {
             m_UTF8 = false;
             return;
         }

         // Valid: replace the raw bytes with the decoded UTF-16 unit(s)
         m_NbWaiting = Character.toChars(unicode, m_Waiting, 0);
     }

     /**
      * Reads up to len characters into cbuf, starting at index off.
      *
      * Keeps reading from the underlying stream until len characters are
      * stored or the end of the stream is reached.
      *
      * @param cbuf destination buffer
      * @param off index in cbuf of the first character to store
      * @param len maximum number of characters to read
      * @return the number of characters stored, 0 if len is 0, or -1 if the
      *         end of the stream was reached before any character was read
      * @throws IndexOutOfBoundsException if off or len is negative, or if
      *         off + len exceeds cbuf.length
      * @throws IOException if the underlying stream fails
      */
     @Override
     public int read(char[] cbuf, int off, int len) throws IOException
     {
         if(off < 0 || len < 0 || len > cbuf.length - off)
             throw new IndexOutOfBoundsException();
         if(len == 0)
             return 0;

         int pos = 0;
         while(pos < len)
         {
             // Hand out characters left over from an earlier step first
             if(m_WaitingPos < m_NbWaiting)
             {
                 cbuf[off + pos++] = m_Waiting[m_WaitingPos++];
                 continue;
             }

             int c = m_Reader.read();
             if(c == -1)
                 break;

             if((c & 0x80) == 0 || !m_UTF8)
                 // ASCII, or Latin-1 mode: the byte value is the character
                 cbuf[off + pos++] = (char)c;
             else
                 // Fills the waiting buffer; delivered on the next iterations
                 decode(c);
         }
         return (pos > 0) ? pos : -1;
     }

     /**
      * Closes the underlying stream.
      *
      * @throws IOException if closing the underlying stream fails
      */
     @Override
     public void close() throws IOException
     {
         m_Reader.close();
     }

 }

	package common;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.Reader;

	/**
	 * A UTF-8 decoder that falls back to Latin-1.
	 *
	 * The decoder tries to read UTF-8 until it finds an invalid sequence of bytes;
	 * from then on every byte, including the bytes of the offending sequence, is
	 * returned as the Latin-1 character with the same value, to the end of the
	 * stream.
	 *
	 * Code points above U+FFFF are returned as a surrogate pair. A sequence that
	 * decodes to a value above U+10FFFF counts as invalid. Overlong encodings are
	 * accepted.
	 */
	public class TextLoader extends Reader {

	    private final InputStream m_Reader;
	    // True while the input still looks like UTF-8; never set back to true
	    private boolean m_UTF8 = true;
	    // Characters already produced but not yet handed to the caller, kept in
	    // delivery order in m_Waiting[m_WaitingPos .. m_NbWaiting-1]. Six slots
	    // are enough: the longest sequence that is attempted has six bytes.
	    private final char[] m_Waiting = new char[6];
	    private int m_WaitingPos = 0;
	    private int m_NbWaiting = 0;

	    /**
	     * Creates a decoder reading bytes from the given stream.
	     *
	     * @param reader the byte stream to decode; closed by {@link #close()}
	     */
	    public TextLoader(InputStream reader)
	    {
	        m_Reader = reader;
	    }

	    /**
	     * Returns the total length of the UTF-8 sequence announced by a lead byte.
	     *
	     * @param lead a byte value in the range 0x80-0xFF
	     * @return the sequence length (2 to 6), or 0 if the byte cannot start a
	     *         sequence (a continuation byte, 0xFE or 0xFF)
	     */
	    private static int sequenceLength(int lead)
	    {
	        if((lead & 0xE0) == 0xC0) // 110x xxxx
	            return 2;
	        if((lead & 0xF0) == 0xE0) // 1110 xxxx
	            return 3;
	        if((lead & 0xF8) == 0xF0) // 1111 0xxx
	            return 4;
	        if((lead & 0xFC) == 0xF8) // 1111 10xx
	            return 5;
	        if((lead & 0xFE) == 0xFC) // 1111 110x
	            return 6;
	        return 0;
	    }

	    /**
	     * Decodes the multi-byte sequence starting with the given lead byte and
	     * stores the resulting characters in the waiting buffer.
	     *
	     * On success the buffer holds the decoded character (one or two UTF-16
	     * units). On failure m_UTF8 is cleared and the buffer holds every byte
	     * consumed so far, in stream order, as Latin-1 characters.
	     *
	     * @param lead the first byte of the sequence, in the range 0x80-0xFF
	     * @throws IOException if the underlying stream fails
	     */
	    private void decode(int lead) throws IOException
	    {
	        m_WaitingPos = 0;
	        m_NbWaiting = 0;

	        // The raw bytes are recorded as they are consumed so that they can be
	        // returned unchanged if the sequence turns out to be invalid
	        m_Waiting[m_NbWaiting++] = (char)lead;

	        int length = sequenceLength(lead);
	        if(length == 0)
	        {
	            m_UTF8 = false;
	            return;
	        }

	        // Payload bits of the lead byte: everything after the length marker
	        int unicode = lead & (0xFF >> (length + 1));
	        for(int i = 1; i < length; i++)
	        {
	            int c = m_Reader.read();
	            if(c == -1)
	            {
	                // Truncated sequence at the end of the stream
	                m_UTF8 = false;
	                return;
	            }
	            m_Waiting[m_NbWaiting++] = (char)c;
	            if((c & 0xC0) != 0x80)
	            {
	                // Not a continuation byte (10xx xxxx)
	                m_UTF8 = false;
	                return;
	            }
	            unicode = (unicode << 6) | (c & 0x3F);
	        }

	        if(!Character.isValidCodePoint(unicode))
	        {
	            m_UTF8 = false;
	            return;
	        }

	        // Valid: replace the raw bytes with the decoded UTF-16 unit(s)
	        m_NbWaiting = Character.toChars(unicode, m_Waiting, 0);
	    }

	    /**
	     * Reads up to len characters into cbuf, starting at index off.
	     *
	     * Keeps reading from the underlying stream until len characters are
	     * stored or the end of the stream is reached.
	     *
	     * @param cbuf destination buffer
	     * @param off index in cbuf of the first character to store
	     * @param len maximum number of characters to read
	     * @return the number of characters stored, 0 if len is 0, or -1 if the
	     *         end of the stream was reached before any character was read
	     * @throws IndexOutOfBoundsException if off or len is negative, or if
	     *         off + len exceeds cbuf.length
	     * @throws IOException if the underlying stream fails
	     */
	    @Override
	    public int read(char[] cbuf, int off, int len) throws IOException
	    {
	        if(off < 0 || len < 0 || len > cbuf.length - off)
	            throw new IndexOutOfBoundsException();
	        if(len == 0)
	            return 0;

	        int pos = 0;
	        while(pos < len)
	        {
	            // Hand out characters left over from an earlier step first
	            if(m_WaitingPos < m_NbWaiting)
	            {
	                cbuf[off + pos++] = m_Waiting[m_WaitingPos++];
	                continue;
	            }

	            int c = m_Reader.read();
	            if(c == -1)
	                break;

	            if((c & 0x80) == 0 || !m_UTF8)
	                // ASCII, or Latin-1 mode: the byte value is the character
	                cbuf[off + pos++] = (char)c;
	            else
	                // Fills the waiting buffer; delivered on the next iterations
	                decode(c);
	        }
	        return (pos > 0) ? pos : -1;
	    }

	    /**
	     * Closes the underlying stream.
	     *
	     * @throws IOException if closing the underlying stream fails
	     */
	    @Override
	    public void close() throws IOException
	    {
	        m_Reader.close();
	    }

	}