-
-
Save lxneng/314123 to your computer and use it in GitHub Desktop.
Java vs. Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Reads from the | |
* stream <code>in</code> a representation | |
* of a Unicode character string encoded in | |
* <a href="DataInput.html#modified-utf-8">modified UTF-8</a> format; | |
* this string of characters is then returned as a <code>String</code>. | |
* The details of the modified UTF-8 representation | |
* are exactly the same as for the <code>readUTF</code> | |
* method of <code>DataInput</code>. | |
* | |
* @param in a data input stream. | |
* @return a Unicode string. | |
* @exception EOFException if the input stream reaches the end | |
* before all the bytes. | |
* @exception IOException the stream has been closed and the contained | |
* input stream does not support reading after close, or | |
* another I/O error occurs. | |
* @exception UTFDataFormatException if the bytes do not represent a | |
* valid modified UTF-8 encoding of a Unicode string. | |
* @see java.io.DataInputStream#readUnsignedShort() | |
*/ | |
public final static String readUTF(DataInput in) throws IOException { | |
int utflen = in.readUnsignedShort(); | |
byte[] bytearr = null; | |
char[] chararr = null; | |
if (in instanceof DataInputStream) { | |
DataInputStream dis = (DataInputStream)in; | |
if (dis.bytearr.length < utflen){ | |
dis.bytearr = new byte[utflen*2]; | |
dis.chararr = new char[utflen*2]; | |
} | |
chararr = dis.chararr; | |
bytearr = dis.bytearr; | |
} else { | |
bytearr = new byte[utflen]; | |
chararr = new char[utflen]; | |
} | |
int c, char2, char3; | |
int count = 0; | |
int chararr_count=0; | |
in.readFully(bytearr, 0, utflen); | |
while (count < utflen) { | |
c = (int) bytearr[count] & 0xff; | |
if (c > 127) break; | |
count++; | |
chararr[chararr_count++]=(char)c; | |
} | |
while (count < utflen) { | |
c = (int) bytearr[count] & 0xff; | |
switch (c >> 4) { | |
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: | |
/* 0xxxxxxx*/ | |
count++; | |
chararr[chararr_count++]=(char)c; | |
break; | |
case 12: case 13: | |
/* 110x xxxx 10xx xxxx*/ | |
count += 2; | |
if (count > utflen) | |
throw new UTFDataFormatException( | |
"malformed input: partial character at end"); | |
char2 = (int) bytearr[count-1]; | |
if ((char2 & 0xC0) != 0x80) | |
throw new UTFDataFormatException( | |
"malformed input around byte " + count); | |
chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | | |
(char2 & 0x3F)); | |
break; | |
case 14: | |
/* 1110 xxxx 10xx xxxx 10xx xxxx */ | |
count += 3; | |
if (count > utflen) | |
throw new UTFDataFormatException( | |
"malformed input: partial character at end"); | |
char2 = (int) bytearr[count-2]; | |
char3 = (int) bytearr[count-1]; | |
if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) | |
throw new UTFDataFormatException( | |
"malformed input around byte " + (count-1)); | |
chararr[chararr_count++]=(char)(((c & 0x0F) << 12) | | |
((char2 & 0x3F) << 6) | | |
((char3 & 0x3F) << 0)); | |
break; | |
default: | |
/* 10xx xxxx, 1111 xxxx */ | |
throw new UTFDataFormatException( | |
"malformed input around byte " + count); | |
} | |
} | |
// The number of chars produced may be less than utflen | |
return new String(chararr, 0, chararr_count); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import struct


def _read_exactly(fd, size):
    """Read up to ``size`` bytes from ``fd``, looping over short reads.

    Returns fewer than ``size`` bytes only when the stream ends first.
    """
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = fd.read(remaining)
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)


def read_utf(fd):
    """Python implementation of Java's DataInputStream.readUTF method.

    Reads a big-endian unsigned 16-bit byte count from ``fd`` followed by
    that many bytes, and decodes them as Java "modified UTF-8".

    Modified UTF-8 differs from standard UTF-8 in two ways that a plain
    ``decode('utf8')`` rejects: U+0000 is written as the two bytes C0 80,
    and characters outside the BMP are written as a surrogate pair, each
    half encoded as its own 3-byte group.

    :param fd: binary file-like object with a ``read(n)`` method.
    :returns: the decoded ``str``.
    :raises struct.error: if the 2-byte length prefix is incomplete.
    :raises EOFError: if the stream ends before all the announced bytes.
    :raises UnicodeDecodeError: if the bytes are not decodable.
    """
    length = struct.unpack('>H', _read_exactly(fd, 2))[0]
    data = _read_exactly(fd, length)
    if len(data) < length:
        raise EOFError(
            'expected %d bytes of string data, got %d' % (length, len(data)))
    # 0xC0 is never a legal byte in well-formed UTF-8, so the pair C0 80 can
    # only be Java's encoding of NUL; rewriting it cannot corrupt other text.
    data = data.replace(b'\xc0\x80', b'\x00')
    # 'surrogatepass' lets the individually encoded surrogates through ...
    text = data.decode('utf-8', 'surrogatepass')
    # ... and a round trip through UTF-16 joins each high/low pair into the
    # single code point it represents (unpaired surrogates are kept as-is).
    return text.encode('utf-16-le', 'surrogatepass').decode(
        'utf-16-le', 'surrogatepass')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment