Created
May 11, 2019 09:57
-
-
Save marpie/a5973afe0cbc16be351e262cf22ad72f to your computer and use it in GitHub Desktop.
Uses PyPDF2 to parse the MIPS Architecture manuals and creates Ghidra compatible idx entries.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" parseMIPSpdf.py | |
Uses PyPDF2 to parse the MIPS Instruction documentation and creates | |
a Ghidra compatible idx. | |
PDF Sources: | |
https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00087-2B-MIPS64BIS-AFP-6.06.pdf | |
https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00594-2B-microMIPS64-AFP-6.05.pdf | |
https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MIPS_Architecture_microMIPS32_InstructionSet_AFP_P_MD00582_06.04.pdf | |
Command Line: | |
parseMIPSpdf.py MD00087-2B-MIPS64BIS-AFP-6.06.pdf 44 517 > pages_MD00087.txt | |
parseMIPSpdf.py MD00594-2B-microMIPS64-AFP-6.05.pdf 69 452 > pages_MD00594.txt | |
parseMIPSpdf.py MIPS_Architecture_microMIPS32_InstructionSet_AFP_P_MD00582_06.04.pdf 65 368 > pages_MD00582.txt | |
Hints: | |
    After running the script, inspect stderr. Pay special attention to the
    following warning:
    - WARNING:root:Content scraping failed for page: XX
      This means that PyPDF2 returned no content object; the page needs to
      be inspected by the user and added to the output manually.
      Most likely the page is embedded as an image - this is the case for
      several pages in MD00087 and MD00594.
Author: marpie ([email protected]) | |
Last Update: 20190511 | |
Created: 20160511 | |
""" | |
from __future__ import print_function | |
import sys | |
import typing | |
import logging | |
try: | |
import PyPDF2 | |
except ModuleNotFoundError: | |
sys.stderr.write("PyPDF2 missing! Install via: pip3 install PyPDF2\n") | |
sys.exit(1) | |
# Version Information
__version__ = "0.0.1"
__program__ = "parseMIPSpdf v{}".format(__version__)
__author__ = "marpie"
__email__ = "[email protected]"
__license__ = "BSD License"
__copyright__ = "Copyright 2019, a12d404.net"
# Development stage, one of:
# ("Prototype", "Development", "Testing", "Production")
__status__ = "Prototype"

# Logging threshold handed to logging.basicConfig() by main().
LOG_LEVEL = logging.WARNING
# Source: MD00087-2B-MIPS64BIS-AFP-6.06.pdf
# Condition mnemonics that prepareCondFunctions() substitutes for the
# ".cond." / "<cond>" / ".condn." placeholders. The entries are kept in the
# order they were transcribed, including the repeated 'sne', 'lt' and 'le'.
COND_TABLE = [
    # Table 3.9 Comparing CMP.condn.fmt, IEEE 754-2008, C.cond.fmt, and MSA FP compares
    'f', 'af', 't', 'at',
    'un', 'or', 'eq', 'neq',
    'une', 'ueq', 'ogl', 'ne',
    'olt', 'lt', 'uge', 'ult',
    'oge', 'ole', 'le', 'ugt',
    'ule', 'ogt',
    # Table 3.2 FPU Comparisons With Special Operand Exceptions for QNaNs
    'sf', 'saf', 'st', 'sat',
    'ngle', 'sun', 'gle', 'sor',
    'seq', 'sne', 'sune',
    'ngl', 'sueq', 'gl', 'sne',
    'lt', 'slt', 'nlt', 'suge',
    'nge', 'sult', 'ge', 'soge',
    'le', 'sle', 'nle', 'sugt',
    'ngt', 'sule', 'gt', 'sogt',
]
#SCRIPT_PATH = os.path.dirname( os.path.realpath( __file__ ) ) | |
def eprint(*args, **kwargs):
    """Forward all arguments to print(), writing to standard error."""
    # sys.stderr is looked up on every call so a redirected stream is honoured.
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def extractTextLines(pdf: PyPDF2.PdfFileReader, page: PyPDF2.pdf.PageObject) -> typing.Optional[typing.List[str]]:
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and collect the text fragments they draw.

    This works well for some PDF files, but poorly for others, depending on
    the generator used. Do not rely on the order of the returned fragments.

    :param pdf: reader handed to ``ContentStream`` when the page contents
        still need to be wrapped in one.
    :param page: page object whose contents are scanned.
    :return: list with one entry per text-showing operator (an empty string
        is added for every ``T*`` and ``'`` operator), or ``None`` when the
        page has no contents.

    Source & (c): <PyPDF2.pdf>
    """
    logging.info("Func: extractTextLines()")
    content = page.getContents()
    if not content:
        return None
    if not isinstance(content, PyPDF2.pdf.ContentStream):
        content = PyPDF2.pdf.ContentStream(content, pdf)

    # Note: we check all strings are TextStringObjects. ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    textString = PyPDF2.generic.TextStringObject

    text = []
    for operands, operator in content.operations:
        if operator == b"Tj":
            fragment = operands[0]
            if isinstance(fragment, textString):
                text.append(fragment)
        elif operator == b"T*":
            text.append("")
        elif operator == b"'":
            text.append("")
            fragment = operands[0]
            if isinstance(fragment, textString):
                text.append(fragment)
        elif operator == b'"':
            # For this operator the string is the third operand.
            fragment = operands[2]
            if isinstance(fragment, textString):
                text.append(fragment)
        elif operator == b"TJ":
            # The operand is an array mixing strings with other objects;
            # only the strings are joined into one fragment.
            text.append("".join(
                part for part in operands[0] if isinstance(part, textString)
            ))
    return text
def parseFunctionTitle(title: str) -> typing.List[str]:
    """Split a title line into the lower-cased function names it lists.

    The returned names can still contain ".fmt" or other aliases.

    Recognised shapes, checked in this order:
      * "b{le,ge,gt,lt,eq,ne}zalc"       -> one name per alternative inside
        the first "{...}" pair
      * "crc32b, crc32h, crc32w, crc32d" -> split on commas, parts stripped
      * "aui daui dahi dati"             -> split on spaces
      * anything else                    -> the title itself

    A "}" that only occurs before the first "{" is not treated as a brace
    group; such a title falls through to the other rules instead of raising.

    :param title: raw title text.
    :return: list of function names; empty for an empty title.
    """
    logging.info("Func: parseFunctionTitle(title: '{}')".format(title))
    if not title:
        return []

    braceOpen = title.find('{')
    braceClose = title.find('}', braceOpen + 1) if braceOpen >= 0 else -1
    if braceClose >= 0:
        # Sample: "b{le,ge,gt,lt,eq,ne}zalc"
        prefix = title[:braceOpen]
        alternatives = title[braceOpen + 1:braceClose]
        suffix = title[braceClose + 1:]
        return [
            "{}{}{}".format(prefix, part.strip(), suffix).lower()
            for part in alternatives.split(',')
        ]

    if "," in title:
        # Sample: "crc32b, crc32h, crc32w, crc32d"
        return [part.strip().lower()
                for part in title.split(',') if part.strip()]

    if " " in title:
        # Sample: "aui daui dahi dati"
        return [part.lower() for part in title.split(' ') if part]

    return [title.lower()]
def extractFmtFunctions(function, parsed):
    """Collect the concrete spellings of a ".fmt" function found in ``parsed``.

    The last three characters of ``function`` are cut off (for "add.fmt" that
    leaves "add."); every lower-cased line of ``parsed`` starting with that
    prefix contributes its first space-delimited word to the result.
    """
    logging.info("Func: extractFmtFunctions()")
    prefix = function[:-3].lower()
    logging.debug("functionBase={}".format(repr(prefix)))
    matches = []
    for candidate in (entry.lower() for entry in parsed):
        if not candidate.startswith(prefix):
            continue
        logging.debug("line -> {}".format(repr(candidate)))
        mnemonic = candidate.partition(" ")[0]
        matches.append(mnemonic)
    return matches
def prepareCondFunctions(function: str) -> typing.List[str]:
    """Expand ``function`` into one name per entry of COND_TABLE.

    In each copy the placeholders ".cond.", "<cond>" and ".condn." are
    replaced, in that order, by the condition wrapped in dots.
    """
    logging.info("Func: prepareCondFunctions()")
    expanded = []
    for cond in COND_TABLE:
        dotted = ".{}.".format(cond)
        name = function
        for placeholder in (".cond.", "<cond>", ".condn."):
            name = name.replace(placeholder, dotted)
        expanded.append(name)
    return expanded
def isConditionalFunction(func: str) -> bool:
    """Tell whether ``func`` contains one of the condition placeholders."""
    placeholders = (".cond.", "<cond>", ".condn.")
    return any(marker in func for marker in placeholders)
def parseFunctionsUsingFormatSection(pageNo: int, lines: typing.List[str]) -> typing.List[str]:
    """Derive the function names of a page from its "Format:" section.

    The lines are lower-cased, stripped and scanned with a small state
    machine (blank lines are ignored):

      0 = searching for a line containing "format:"
      1 = the next line is the function title (see parseFunctionTitle)
      2 = collecting the following lines until one contains ":" (which is
          taken as the start of the next section)

    :param pageNo: page number; currently unused.
    :param lines: text fragments of the page.
    :return: list of function names with ".fmt" and condition placeholders
        expanded; empty when no title was found.
    """
    SEARCHING, TITLE, ENTRIES = 0, 1, 2

    state = SEARCHING
    preParsedFunctions = []
    rawLines = []
    for line in lines:
        line = line.lower().strip()
        if not line:
            continue
        if state == SEARCHING:
            if "format:" in line:
                logging.debug("state=1")
                state = TITLE
        elif state == TITLE:
            logging.debug("state before 2")
            preParsedFunctions = parseFunctionTitle(line)
            if len(preParsedFunctions) < 1:
                logging.error("No functions found in format section!")
            logging.debug("state=2")
            state = ENTRIES
            logging.debug("preParsedFunctions(len={})={}".format(
                len(preParsedFunctions),
                repr(preParsedFunctions)
            ))
        else:
            if ":" in line:
                # we reached a new section, bail!
                logging.debug("new section!")
                break
            rawLines.append(line)
    logging.debug("rawLines(len={})={}".format(
        len(rawLines),
        repr(rawLines)
    ))

    if len(preParsedFunctions) < 1:
        logging.debug("quick-exit--1")
        return []

    firstTitle = preParsedFunctions[0]
    if (len(rawLines) < 2) and len(preParsedFunctions) > 1:
        logging.debug("quick-exit--2")
        function = firstTitle.split(" ")[0]
        if isConditionalFunction(function):
            return prepareCondFunctions(function)
        return [function]
    # rawLines can be empty here (a single title directly followed by the
    # next section), so it must be checked before rawLines[0] is read.
    if (".fmt" not in firstTitle) and (
            not rawLines or firstTitle not in rawLines[0]):
        logging.debug("quick-exit--3")
        return [firstTitle.split(" ")[0]]

    # resolve .fmt
    resolved = []
    for function in preParsedFunctions:
        logging.debug("function - {}".format(function))
        if function.endswith(".fmt"):
            resolved += extractFmtFunctions(function, rawLines)
        else:
            resolved.append(function)

    # resolve .cond.
    functions = []
    for function in resolved:
        if isConditionalFunction(function):
            functions += prepareCondFunctions(function)
        else:
            functions.append(function)
    return functions
def parsePage(pdfReader: PyPDF2.PdfFileReader, pageNo: int) -> typing.Tuple[bool, typing.Optional[typing.List[typing.Tuple[str, int]]]]:
    """Extract the functions documented on one page.

    :param pdfReader: open PDF reader.
    :param pageNo: 1-based page number (the reader is asked for
        ``pageNo-1``).
    :return: ``(True, [(function_name, pageNo), ...])`` when at least one
        function was found, otherwise ``(False, None)``.
    """
    logging.info("Func: parsePage(pageNo: {})".format(pageNo))
    pageObj = pdfReader.getPage(pageNo-1)
    try:
        parsed = extractTextLines(pdfReader, pageObj)
    except Exception:
        # Keep going with the next page, but leave a trace of what went
        # wrong; KeyboardInterrupt/SystemExit are deliberately not caught.
        logging.exception(
            "Text extraction raised for page: {}".format(pageNo))
        return False, None

    if parsed is None:
        logging.warning("Content scraping failed for page: {}".format(pageNo))
    if (not parsed) or len(parsed) < 2:
        return False, None

    functions = parseFunctionsUsingFormatSection(pageNo, parsed)
    if len(functions) < 1:
        return False, None
    return True, [(functionName.lower(), pageNo,) for functionName in functions]
def parsePdf(filename: str, startPage: int, endPage: int) -> bool:
    """parsePdf parses the instruction pages and prints the index.

    Every function is printed once to stdout as "name,page", using the first
    page it was found on. Pages that yield nothing are reported on stderr
    and skipped.

    :param filename: path of the PDF file.
    :param startPage: first page to parse (1-based, inclusive).
    :param endPage: last page to parse (1-based, inclusive).
    :return: True once all pages in the range have been processed.
    """
    logging.info("Func: parsePdf()")
    alreadyParsedFunctions = set()
    # The with-statement closes the file, also when an exception escapes.
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # iterate instruction pages
        for pageNo in range(startPage, endPage+1):
            res, out = parsePage(pdfReader, pageNo)
            if not res:
                eprint("[E] Parsing failed for page {}!".format(pageNo))
                continue
            for func, funcPage in out:
                if func in alreadyParsedFunctions:
                    # already emitted for an earlier page
                    continue
                alreadyParsedFunctions.add(func)
                print('{},{}'.format(func, funcPage))
    # The script entry point exits with `not main(...)`, so success has to
    # be reported explicitly.
    return True
def main(argv: list):
    """Command line entry point.

    :param argv: ``[program, pdf-file, start-page, end-page]``.
    :return: False when the arguments are unusable, otherwise the result of
        parsePdf().
    """
    logging.basicConfig(level=LOG_LEVEL)
    if len(argv) != 4:
        eprint("{} [pdf-file] [start-page] [end-page]".format(argv[0]))
        return False
    filename = argv[1]
    try:
        startPage = int(argv[2])
        endPage = int(argv[3])
    except ValueError:
        # Report bad page numbers like any other usage error instead of
        # dying with a traceback.
        eprint("[E] start-page and end-page must be integers!")
        eprint("{} [pdf-file] [start-page] [end-page]".format(argv[0]))
        return False
    return parsePdf(filename, startPage, endPage)
if __name__ == "__main__":
    import sys
    # A truthy result from main() becomes exit status 0, a falsy one 1.
    failed = not main(sys.argv)
    sys.exit(failed)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment