Skip to content

Instantly share code, notes, and snippets.

@seymores
Created April 20, 2012 03:33
Show Gist options
  • Save seymores/2425692 to your computer and use it in GitHub Desktop.
Save seymores/2425692 to your computer and use it in GitHub Desktop.
Extract AndroidManifest.xml information.
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
class AndroidXMLDecompress {
// decompressXML -- Parse the 'compressed' binary form of Android XML docs
// such as for AndroidManifest.xml in .apk files
public static int endDocTag = 0x00100101;
public static int startTag = 0x00100102;
public static int endTag = 0x00100103;
static void prt(String str) {
//System.err.print(str);
}
public static String decompressXML(byte[] xml) {
StringBuilder finalXML = new StringBuilder();
// Compressed XML file/bytes starts with 24x bytes of data,
// 9 32 bit words in little endian order (LSB first):
// 0th word is 03 00 08 00
// 3rd word SEEMS TO BE: Offset at then of StringTable
// 4th word is: Number of strings in string table
// WARNING: Sometime I indiscriminently display or refer to word in
// little endian storage format, or in integer format (ie MSB first).
int numbStrings = LEW(xml, 4 * 4);
// StringIndexTable starts at offset 24x, an array of 32 bit LE offsets
// of the length/string data in the StringTable.
int sitOff = 0x24; // Offset of start of StringIndexTable
// StringTable, each string is represented with a 16 bit little endian
// character count, followed by that number of 16 bit (LE) (Unicode)
// chars.
int stOff = sitOff + numbStrings * 4; // StringTable follows
// StrIndexTable
// XMLTags, The XML tag tree starts after some unknown content after the
// StringTable. There is some unknown data after the StringTable, scan
// forward from this point to the flag for the start of an XML start
// tag.
int xmlTagOff = LEW(xml, 3 * 4); // Start from the offset in the 3rd
// word.
// Scan forward until we find the bytes: 0x02011000(x00100102 in normal
// int)
for (int ii = xmlTagOff; ii < xml.length - 4; ii += 4) {
if (LEW(xml, ii) == startTag) {
xmlTagOff = ii;
break;
}
} // end of hack, scanning for start of first start tag
// XML tags and attributes:
// Every XML start and end tag consists of 6 32 bit words:
// 0th word: 02011000 for startTag and 03011000 for endTag
// 1st word: a flag?, like 38000000
// 2nd word: Line of where this tag appeared in the original source file
// 3rd word: FFFFFFFF ??
// 4th word: StringIndex of NameSpace name, or FFFFFFFF for default NS
// 5th word: StringIndex of Element Name
// (Note: 01011000 in 0th word means end of XML document, endDocTag)
// Start tags (not end tags) contain 3 more words:
// 6th word: 14001400 meaning??
// 7th word: Number of Attributes that follow this tag(follow word 8th)
// 8th word: 00000000 meaning??
// Attributes consist of 5 words:
// 0th word: StringIndex of Attribute Name's Namespace, or FFFFFFFF
// 1st word: StringIndex of Attribute Name
// 2nd word: StringIndex of Attribute Value, or FFFFFFF if ResourceId
// used
// 3rd word: Flags?
// 4th word: str ind of attr value again, or ResourceId of value
// TMP, dump string table to tr for debugging
// tr.addSelect("strings", null);
// for (int ii=0; ii<numbStrings; ii++) {
// // Length of string starts at StringTable plus offset in StrIndTable
// String str = compXmlString(xml, sitOff, stOff, ii);
// tr.add(String.valueOf(ii), str);
// }
// tr.parent();
// Step through the XML tree element tags and attributes
int off = xmlTagOff;
int indent = 0;
int startTagLineNo = -2;
while (off < xml.length) {
int tag0 = LEW(xml, off);
// int tag1 = LEW(xml, off+1*4);
int lineNo = LEW(xml, off + 2 * 4);
// int tag3 = LEW(xml, off+3*4);
int nameNsSi = LEW(xml, off + 4 * 4);
int nameSi = LEW(xml, off + 5 * 4);
if (tag0 == startTag) { // XML START TAG
int tag6 = LEW(xml, off + 6 * 4); // Expected to be 14001400
int numbAttrs = LEW(xml, off + 7 * 4); // Number of Attributes
// to follow
// int tag8 = LEW(xml, off+8*4); // Expected to be 00000000
off += 9 * 4; // Skip over 6+3 words of startTag data
String name = compXmlString(xml, sitOff, stOff, nameSi);
// tr.addSelect(name, null);
startTagLineNo = lineNo;
// Look for the Attributes
StringBuffer sb = new StringBuffer();
for (int ii = 0; ii < numbAttrs; ii++) {
int attrNameNsSi = LEW(xml, off); // AttrName Namespace Str
// Ind, or FFFFFFFF
int attrNameSi = LEW(xml, off + 1 * 4); // AttrName String
// Index
int attrValueSi = LEW(xml, off + 2 * 4); // AttrValue Str
// Ind, or
// FFFFFFFF
int attrFlags = LEW(xml, off + 3 * 4);
int attrResId = LEW(xml, off + 4 * 4); // AttrValue
// ResourceId or dup
// AttrValue StrInd
off += 5 * 4; // Skip over the 5 words of an attribute
String attrName = compXmlString(xml, sitOff, stOff,
attrNameSi);
String attrValue = "";
if (attrValueSi != -1) {
attrValue = compXmlString(xml, sitOff, stOff, attrValueSi)
} else {
if (attrResId == -1)
attrValue = "resourceID 0x" + Integer.toHexString(attrResId);
else
attrValue = Integer.valueOf(Integer.toHexString(attrResId), 16).intValue();
//System.out.println(attrName + " >>> " + attrValue.split("0x")[1]);
}
// attrValue = Integer.valueOf(Integer.toHexString(attrResId), 16).intValue();
sb.append(" " + attrName + "=\"" + attrValue + "\"");
// tr.add(attrName, attrValue);
}
finalXML.append("<" + name + sb + ">");
prtIndent(indent, "<" + name + sb + ">");
indent++;
} else if (tag0 == endTag) { // XML END TAG
indent--;
off += 6 * 4; // Skip over 6 words of endTag data
String name = compXmlString(xml, sitOff, stOff, nameSi);
finalXML.append("</" + name + ">");
prtIndent(indent, "</" + name + "> (line " + startTagLineNo
+ "-" + lineNo + ")");
// tr.parent(); // Step back up the NobTree
} else if (tag0 == endDocTag) { // END OF XML DOC TAG
break;
} else {
prt(" Unrecognized tag code '" + Integer.toHexString(tag0)
+ "' at offset " + off);
break;
}
} // end of while loop scanning tags and attributes of XML tree
//prt(" end at offset " + off);
return finalXML.toString();
} // end of decompressXML
public static String compXmlString(byte[] xml, int sitOff, int stOff, int strInd) {
if (strInd < 0)
return null;
int strOff = stOff + LEW(xml, sitOff + strInd * 4);
return compXmlStringAt(xml, strOff);
}
public static String spaces = " ";
public static void prtIndent(int indent, String str) {
prt(spaces.substring(0, Math.min(indent * 2, spaces.length())) + str);
}
// compXmlStringAt -- Return the string stored in StringTable format at
// offset strOff. This offset points to the 16 bit string length, which
// is followed by that number of 16 bit (Unicode) chars.
public static String compXmlStringAt(byte[] arr, int strOff) {
int strLen = arr[strOff + 1] << 8 & 0xff00 | arr[strOff] & 0xff;
byte[] chars = new byte[strLen];
for (int ii = 0; ii < strLen; ii++) {
chars[ii] = arr[strOff + 2 + ii * 2];
}
return new String(chars); // Hack, just use 8 byte chars
} // end of compXmlStringAt
// LEW -- Return value of a Little Endian 32 bit word from the byte array
// at offset off.
public static int LEW(byte[] arr, int off) {
return arr[off + 3] << 24 & 0xff000000 | arr[off + 2] << 16 & 0xff0000 | arr[off + 1] << 8 & 0xff00 | arr[off] & 0xFF;
} // end of LEW
public static void main(String[] args) throws IOException {
String fileName = args[0];
InputStream is = null;
ZipFile zip = null;
if (fileName.endsWith(".apk") || fileName.endsWith(".zip")) {
zip = new ZipFile(fileName);
ZipEntry mft = zip.getEntry("AndroidManifest.xml");
is = zip.getInputStream(mft);
} else {
is = new FileInputStream(fileName);
}
byte[] buf = new byte[10240];
int bytesRead = is.read(buf);
is.close();
if (zip != null) {
zip.close();
}
String xml = AndroidXMLDecompress.decompressXML(buf);
System.out.println(xml);
}
}
println "************************************* "
def is = new FileInputStream("/tmp/AndroidManifest.xml")
byte[] buf = new byte[10240];
int bytesRead = is.read(buf);
is.close();
String xml = AndroidXMLDecompress.decompressXML(buf);
System.out.println(xml);
@maxupeng
Copy link

It works!!! Thank U!!!

@packmad
Copy link

packmad commented Dec 21, 2015

works like a charm, tnks!

@vassav
Copy link

vassav commented Sep 4, 2017

Fix for decompress file with UTF16 text:

        public string CompXmlStringAt(byte[] arr, int strOff)
        {
            int strLen = arr[strOff + 1] << 8 & 0xff00 | arr[strOff] & 0xff;
            byte[] chars = new byte[strLen];
            var lst=new List<byte>();
            for (int ii = 0; ii < strLen; ii++)
            {
                //chars[ii] = arr[strOff + 2 + ii * 2];
                lst.Add(arr[strOff + 2 + ii * 2]);
                lst.Add(arr[strOff + 2 + ii * 2+1]);
            }
            return System.Text.Encoding.GetEncoding("UTF-16").GetString(lst.ToArray());  // Hack, just use 8 byte chars
        } // end of CompXmlStringAt

@ricpeter321
Copy link

GOD BLESS YOU!!!

@codehasan
Copy link

So after decompressing, how to recompress the xml string

@Rainb0wCodes
Copy link

@vassav I'm quite sure it takes more edits than that to port the entire code to C#

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment