Created
November 13, 2013 22:07
-
-
Save dwf/7457323 to your computer and use it in GitHub Desktop.
Undo a textual hex dump (from the Firefox cache).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Reverses a hex dump (of the format shown in the Firefox cache). | |
| The format is as follows: | |
| <OFFSET> <BYTE> [<BYTE> [<BYTE> [...]]] [render] | |
| Where | |
| - <OFFSET> is 8 hex digits, possibly followed by a colon | |
| - each <BYTE> is 2 lowercase hex digits (maximum 16 per line) | |
| - [render] is ASCII rendering of the bytes, ignored. | |
| - <OFFSET> and the first <BYTE> are separated by two spaces, | |
| as are each pair of consecutive <BYTE>s | |
| """ | |
| __author__ = "David Warde-Farley" | |
| __license__ = "3-clause BSD" | |
| __email__ = "d.warde.farley" + chr(64) + "gmail" + chr(46) + "com" | |
| import argparse | |
| import sys | |
| import re | |
# Building blocks of the per-line pattern.  Every byte is captured in its own
# group so that match.groups() yields the hex pairs of a row in order.
HEX_BYTE = '([0-9a-f]{2})'
OFFSET_REGEX = "[0-9a-f]{8}"
MAX_BYTES = 16
# Columns may be separated by one or two spaces: the module docstring
# describes two-space separators, while `hexdump -C` uses two spaces after
# the offset and in the middle of the row but one space elsewhere.  The
# repeat is deliberately bounded (not " +") so that the optional groups of a
# short row stop inside the padding instead of running on into the ASCII
# rendering and decoding it as data.
_SEPARATOR = " {1,2}"
BYTES_REGEX = (HEX_BYTE + _SEPARATOR
               + (HEX_BYTE + "?" + _SEPARATOR) * (MAX_BYTES - 1))
LINE_REGEX = OFFSET_REGEX + ":?" + _SEPARATOR + BYTES_REGEX


def unhexdump(infile, outfile):
    """
    Reads lines from infile, decodes them according to the dump format,
    writes bytes to outfile.

    infile is any iterable of lines; each line may be `str` or `bytes`
    (for example a file opened with 'r' or with 'rb').  Lines that do not
    match LINE_REGEX are skipped silently.

    outfile is any object with a `write` method.  The decoded data is
    written with a single call:
    - binary streams receive `bytes`;
    - text streams that expose an underlying `.buffer` (such as
      sys.stdout on Python 3) receive `bytes` through that buffer, so
      values above 0x7f are not re-encoded;
    - text-only streams (such as io.StringIO) receive a `str` with one
      character per decoded byte.
    """
    line_pattern = re.compile(LINE_REGEX)
    decoded = bytearray()
    for line in infile:
        if isinstance(line, bytes):
            # latin-1 maps every byte to exactly one code point, so this can
            # never fail, whatever the rendering column contains.
            line = line.decode('latin-1')
        match = line_pattern.match(line)
        if match:
            # Groups of bytes that are absent from a short row are None.
            decoded.extend(int(pair, 16) for pair in match.groups()
                           if pair is not None)
    payload = bytes(decoded)
    # Prefer the raw byte stream underneath a text wrapper when there is one.
    sink = getattr(outfile, 'buffer', outfile)
    try:
        sink.write(payload)
    except TypeError:
        # Text-only sink: fall back to one character per byte.
        sink.write(payload.decode('latin-1'))
if __name__ == "__main__":
    # The first line of the module docstring doubles as the --help summary.
    summary = __doc__.strip().split('\n')[0]
    cli = argparse.ArgumentParser(description=summary)
    # Positional and optional: falls back to standard input when omitted.
    cli.add_argument(
        'input',
        nargs='?',
        type=argparse.FileType('rb'),
        default=sys.stdin,
        help="An optional filename to read from (default=stdin)",
    )
    # Destination of the decoded bytes: standard output unless -O is given.
    cli.add_argument(
        '-O', '--output',
        required=False,
        type=argparse.FileType('wb'),
        default=sys.stdout,
        help="An optional filename to which output "
             "will be written (default=stdout)",
    )
    options = cli.parse_args()
    unhexdump(options.input, options.output)
When a file contains repeated byte patterns, hexdump prints a * alone on a line to indicate that one or more lines identical to the line above were omitted, e.g.
$ printf "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" | hexdump -C
00000000 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 |AAAAAAAAAAAAAAAA|
*
00000020 41 41 41 41 41 41 41 41 41 41 41 |AAAAAAAAAAA|
0000002b
Python isn't my first language but I coded up the following hack to handle this situation and fill in the missing repeated byte patterns.
def unhexdump(infile, outfile):
    """
    Reads lines from infile, decodes them according to the dump format,
    writes bytes to outfile.

    In addition to plain data rows (matched with the module-level
    LINE_REGEX), this variant understands the `*` line that hexdump prints
    in place of rows identical to the row above.  The omitted rows are
    rebuilt by repeating the previous row until the offset of the next
    line is reached.  That next line may be a data row or the offset-only
    line that ends a dump, so squeezed rows at the very end of the data
    are restored as well.  A `*` that appears before any data row has
    nothing to repeat and is ignored.

    infile is any iterable of `str` or `bytes` lines.  outfile receives
    the result in one `write` call: `bytes` for binary streams (or for the
    `.buffer` of a text stream that has one), otherwise a `str` with one
    character per decoded byte.
    """
    row_pattern = re.compile(LINE_REGEX)
    offset_pattern = re.compile("(" + OFFSET_REGEX + ")")
    # An offset with nothing after it: the closing line of a hexdump listing.
    end_pattern = re.compile(OFFSET_REGEX + r":?\s*$")

    decoded = bytearray()
    prev_offset = None        # offset of the most recent data row
    prev_row = bytearray()    # decoded bytes of that row
    fill_repeat = False       # True after a `*` until the next offset is seen

    for line in infile:
        if isinstance(line, bytes):
            # latin-1 maps every byte to one code point; decoding cannot fail.
            line = line.decode("latin-1")
        if line.startswith("*"):
            fill_repeat = prev_offset is not None
            continue
        row = row_pattern.match(line)
        if row is None and end_pattern.match(line) is None:
            continue  # neither a data row nor the closing offset line

        offset = int(offset_pattern.match(line).group(1), 16)
        if fill_repeat:
            fill_repeat = False
            # Bytes missing between the end of the previous row and here.
            gap = offset - (prev_offset + len(prev_row))
            if gap > 0:
                repeats, remainder = divmod(gap, len(prev_row))
                decoded.extend(prev_row * repeats + prev_row[:remainder])
        if row is not None:
            # Groups of bytes that are absent from a short row are None.
            prev_row = bytearray(int(pair, 16) for pair in row.groups()
                                 if pair is not None)
            prev_offset = offset
            decoded.extend(prev_row)

    payload = bytes(decoded)
    # Prefer the raw byte stream underneath a text wrapper when there is one.
    sink = getattr(outfile, "buffer", outfile)
    try:
        sink.write(payload)
    except TypeError:
        # Text-only sink: fall back to one character per byte.
        sink.write(payload.decode("latin-1"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice code, I changed your REGEX just a bit, and now it works for a few other Hexdump formats.