Created
October 3, 2018 16:49
-
-
Save pgoodman/f7ab19623bef9414ffdc6f7abbf551a3 to your computer and use it in GitHub Desktop.
Transform new LLVM IR into old LLVM IR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import logging | |
| import os | |
| import re | |
| from subprocess import call | |
| import sys | |
| def _convert_data(data): | |
| ''' | |
| 'data' is a string of the 'disassembled' bitcode (could be | |
| read in from file/stdin) | |
| Our tooling requires LLVM 3.5 bitcode. Mcsema uses LLVM <later version> | |
| bitcode. These substitutions remove information from LLVM 3.8 bitcode to | |
| make it compatible with llvm3.5. Additional transformations are added as | |
| necessary. | |
| returns a string of the disassembled bitcode suitable for running through | |
| llvm-as | |
| ''' | |
| #data = re.sub('inbounds [%A-Za-z0-9]+,', '', data) | |
| # Get rid of volatile, etc. | |
| data = re.sub('load (half|float|double|x86_fp80|fp128),', 'load', data) | |
| data = re.sub('load volatile', 'load', data) | |
| data = re.sub('load volatile (half|float|double|x86_fp80|fp128),', 'load', data) | |
| # Remove the leading non-pointer type in loads. | |
| data = re.sub('load [^*,]+?,', 'load ', data) | |
| # Remove the leading non-pointer type in GEPs. | |
| data = re.sub('= getelementptr [^*,]+?,', '= getelementptr ', data) | |
| # Remove the non-leading pointer type in constant expression GEPs. | |
| data = re.sub('getelementptr inbounds \([^*,]+?,', 'getelementptr inbounds (', data) | |
| data = re.sub('getelementptr \([^*,]+?,', 'getelementptr (', data) | |
| # Add in a `*` for calls through function pointers. | |
| data = re.sub('call ([^ ]+) (\([^)]+\)) %([a-zA-Z0-9.]+)\(', r"call \1 \2 *%\3(", data) | |
| # fix calling syntax for varargs functions | |
| #data = re.sub('call void \(\.\.\.\) @', 'call void (...)* @', data) | |
| # Remove TBAA annotations | |
| data = re.sub(', !tbaa ![0-9]+', '', data) | |
| # Remove Range annotations | |
| data = re.sub(', !range ![0-9]+', '', data) | |
| # Remove metadatas with nested metadata | |
| data = re.sub(r'^![0-9]+ = !{!.*$', '', data) | |
| # Remove complex metadatas. | |
| data = re.sub(r'^![0-9]+ = ![^}]+.*', '', data) | |
| # Remove multi node metadata entries | |
| data = re.sub('!.* = !{.*,.*}', '', data) | |
| # Add the right type to metadata (these are basically the mcsema_real_eip). | |
| data = re.sub('= !{i64', ' = i64 !{i64', data) | |
| data = re.sub('= !{i32', ' = i32 !{i32', data) | |
| # put types in front of metadata we care about | |
| # remove attributes not in llvm 3.5 | |
| data = re.sub('local_unnamed_addr', '', data) | |
| data = re.sub('source_filename = .*', '', data) | |
| # remove null refs or medata refs in medata | |
| data = re.sub('!.* = !{(null|!["0-9]).*', '', data) | |
| # remove debug metadata | |
| data = re.sub('!.* = (distinct )?!DI.*', '', data) | |
| # remove any empty stragglers | |
| data = re.sub('!.* = !{}', '', data) | |
| # remove attributes not in llvm 3.5 | |
| data = re.sub('"no-signed-zeros-fp-math"="true" ', '', data) | |
| data = re.sub('"no-signed-zeros-fp-math"="false" ', '', data) | |
| data = re.sub('"no-trapping-math"="true" ', '', data) | |
| data = re.sub('"no-trapping-math"="false" ', '', data) | |
| data = re.sub('"stack-protector-buffer-size"="8" ', '', data) | |
| data = re.sub('"unsafe-fp-math"="true" ', '', data) | |
| data = re.sub('"unsafe-fp-math"="false" ', '', data) | |
| data = re.sub('"use-soft-float"="true" ', '', data) | |
| data = re.sub('"use-soft-float"="false" ', '', data) | |
| data = re.sub(' norecurse ', ' ', data) | |
| data = re.sub('"correctly-rounded-divide-sqrt-fp-math"="true"', ' ', data) | |
| data = re.sub('"correctly-rounded-divide-sqrt-fp-math"="false"', ' ', data) | |
| data = re.sub('"disable-tail-calls"="true"', ' ', data) | |
| data = re.sub('"disable-tail-calls"="false"', ' ', data) | |
| data = re.sub('"less-precise-fpmad"="true"', ' ', data) | |
| data = re.sub('"less-precise-fpmad"="false"', ' ', data) | |
| data = re.sub('"no-frame-pointer-elim"="true"', ' ', data) | |
| data = re.sub('"no-frame-pointer-elim"="false"', ' ', data) | |
| data = re.sub('"no-infs-fp-math"="true"', ' ', data) | |
| data = re.sub('"no-infs-fp-math"="false"', ' ', data) | |
| data = re.sub('"no-nans-fp-math"="true"', ' ', data) | |
| data = re.sub('"no-nans-fp-math"="false"', ' ', data) | |
| data = re.sub('"no-frame-pointer-elim"="true"', ' ', data) | |
| data = re.sub('"no-frame-pointer-elim"="false"', ' ', data) | |
| data = re.sub('"no-frame-pointer-elim-non-leaf"', ' ', data) | |
| data = re.sub('"no-jump-tables"="true"', ' ', data) | |
| data = re.sub('"no-jump-tables"="false"', ' ', data) | |
| # Remove empty attribute sets. | |
| data = re.sub('attributes #[0-9]+ = \{[ ]*\}', '', data) | |
| return data | |
if __name__ == '__main__':
    # Command-line entry point: read modern LLVM IR from --modern_input,
    # convert it, and write the LLVM 3.5-compatible text to --legacy_output.
    logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s %(message)s')

    import argparse

    parser = argparse.ArgumentParser(description='Transform modern LLVM IR into old LLVM IR.')
    parser.add_argument('--modern_input', type=str, required=True,
                        help='Path to input LLVM IR file.')
    parser.add_argument('--legacy_output', type=str, required=True,
                        help='Path to output LLVM IR file in LLVM 3.5-compatible format.')
    args = parser.parse_args()

    # Read and convert the whole input BEFORE opening the output. Opening the
    # output in 'w' mode truncates it immediately, so doing that first would
    # wipe the input when both arguments name the same file, and would leave
    # an empty output behind if reading or converting failed.
    with open(args.modern_input, 'r') as input_ir:
        legacy_ir = _convert_data(input_ir.read())

    with open(args.legacy_output, 'w') as output_ir:
        output_ir.write(legacy_ir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment