Skip to content

Instantly share code, notes, and snippets.

@pgoodman
Created October 3, 2018 16:49
Show Gist options
  • Select an option

  • Save pgoodman/f7ab19623bef9414ffdc6f7abbf551a3 to your computer and use it in GitHub Desktop.

Select an option

Save pgoodman/f7ab19623bef9414ffdc6f7abbf551a3 to your computer and use it in GitHub Desktop.
Transform new LLVM IR into old LLVM IR
import logging
import os
import re
from subprocess import call
import sys
# Function attributes that are not in LLVM 3.5.  Each entry includes its own
# trailing space and is deleted outright, in this order.
_ATTRS_REMOVED_WITH_TRAILING_SPACE = (
    '"no-signed-zeros-fp-math"="true" ',
    '"no-signed-zeros-fp-math"="false" ',
    '"no-trapping-math"="true" ',
    '"no-trapping-math"="false" ',
    '"stack-protector-buffer-size"="8" ',
    '"unsafe-fp-math"="true" ',
    '"unsafe-fp-math"="false" ',
    '"use-soft-float"="true" ',
    '"use-soft-float"="false" ',
)

# Function attributes that are not in LLVM 3.5.  Each entry is replaced by a
# single space (so neighbouring attributes stay separated), in this order.
_ATTRS_REPLACED_BY_SPACE = (
    '"correctly-rounded-divide-sqrt-fp-math"="true"',
    '"correctly-rounded-divide-sqrt-fp-math"="false"',
    '"disable-tail-calls"="true"',
    '"disable-tail-calls"="false"',
    '"less-precise-fpmad"="true"',
    '"less-precise-fpmad"="false"',
    '"no-frame-pointer-elim"="true"',
    '"no-frame-pointer-elim"="false"',
    '"no-infs-fp-math"="true"',
    '"no-infs-fp-math"="false"',
    '"no-nans-fp-math"="true"',
    '"no-nans-fp-math"="false"',
    '"no-frame-pointer-elim-non-leaf"',
    '"no-jump-tables"="true"',
    '"no-jump-tables"="false"',
)


def _convert_data(data):
    """Rewrite disassembled modern LLVM IR into LLVM 3.5-compatible IR.

    'data' is a string of the 'disassembled' bitcode (could be read in from
    a file or stdin).

    Our tooling requires LLVM 3.5 bitcode. Mcsema uses LLVM <later version>
    bitcode. These textual substitutions remove information from LLVM 3.8
    bitcode to make it compatible with LLVM 3.5. Additional transformations
    are added as necessary.

    The rewrite is purely regex/text based and applied in a fixed order;
    the order matters because later patterns rely on the output of earlier
    ones. Removed lines are replaced by empty lines, not deleted.

    Returns a string of the disassembled bitcode suitable for running
    through llvm-as.
    """
    # --- Loads -----------------------------------------------------------
    # Drop the explicit result type of floating-point loads.
    data = re.sub(r'load (half|float|double|x86_fp80|fp128),', 'load', data)
    # Get rid of `volatile`; whatever leading type remains is stripped by
    # the generic load rule just below.
    data = re.sub(r'load volatile', 'load', data)
    # Remove the leading non-pointer type in loads.  The class excludes `*`,
    # so loads whose result type is itself a pointer are left untouched.
    data = re.sub(r'load [^*,]+?,', 'load ', data)

    # --- getelementptr ---------------------------------------------------
    # Remove the leading non-pointer type in GEP instructions.
    data = re.sub(r'= getelementptr [^*,]+?,', '= getelementptr ', data)
    # Remove the leading non-pointer type in constant expression GEPs.
    data = re.sub(r'getelementptr inbounds \([^*,]+?,',
                  'getelementptr inbounds (', data)
    data = re.sub(r'getelementptr \([^*,]+?,', 'getelementptr (', data)

    # --- Calls -----------------------------------------------------------
    # Add in a `*` for calls through function pointers.  The callee name
    # accepts every character LLVM allows in an identifier (letters, digits,
    # `.`, `_`, `$`, `-`).  Direct calls to `@` functions are not rewritten.
    data = re.sub(r'call ([^ ]+) (\([^)]+\)) %([a-zA-Z0-9._$-]+)\(',
                  r'call \1 \2 *%\3(', data)

    # --- Instruction-level metadata attachments --------------------------
    # Remove TBAA annotations.
    data = re.sub(r', !tbaa ![0-9]+', '', data)
    # Remove range annotations.
    data = re.sub(r', !range ![0-9]+', '', data)

    # --- Metadata definitions --------------------------------------------
    # NOTE(review): the next two patterns are anchored with `^`/`$` but are
    # applied without re.MULTILINE, so they can only match at the very start
    # of the input.  This is kept as-is on purpose: with multiline matching
    # the second pattern would delete every numbered metadata node,
    # including the ones that are retyped a few lines further down.
    # Remove metadata with nested metadata.
    data = re.sub(r'^![0-9]+ = !{!.*$', '', data)
    # Remove complex metadata.
    data = re.sub(r'^![0-9]+ = ![^}]+.*', '', data)
    # Remove multi-node metadata entries.
    data = re.sub(r'!.* = !{.*,.*}', '', data)
    # Add the right type to metadata (these are basically the
    # mcsema_real_eip).
    data = re.sub(r'= !{i64', ' = i64 !{i64', data)
    data = re.sub(r'= !{i32', ' = i32 !{i32', data)

    # --- Module/global level keywords not in LLVM 3.5 --------------------
    data = re.sub(r'local_unnamed_addr', '', data)
    data = re.sub(r'source_filename = .*', '', data)

    # --- Remaining metadata cleanup --------------------------------------
    # Remove null refs or metadata refs in metadata.
    data = re.sub(r'!.* = !{(null|!["0-9]).*', '', data)
    # Remove debug metadata.
    data = re.sub(r'!.* = (distinct )?!DI.*', '', data)
    # Remove any empty stragglers.
    data = re.sub(r'!.* = !{}', '', data)

    # --- Function attributes not in LLVM 3.5 -----------------------------
    # All attribute spellings are literal text, so plain string replacement
    # is sufficient.
    for attribute in _ATTRS_REMOVED_WITH_TRAILING_SPACE:
        data = data.replace(attribute, '')
    data = data.replace(' norecurse ', ' ')
    for attribute in _ATTRS_REPLACED_BY_SPACE:
        data = data.replace(attribute, ' ')

    # Remove attribute sets that ended up empty after the removals above.
    data = re.sub(r'attributes #[0-9]+ = \{[ ]*\}', '', data)
    return data
if __name__ == '__main__':
    # Command-line entry point: read modern IR from one file, write the
    # LLVM 3.5-compatible rewrite to another.
    import argparse

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)-15s %(levelname)s %(message)s',
    )

    cli = argparse.ArgumentParser(
        description='Transform modern LLVM IR into old LLVM IR.',
    )
    cli.add_argument(
        '--modern_input',
        type=str,
        required=True,
        help='Path to input LLVM IR file.',
    )
    cli.add_argument(
        '--legacy_output',
        type=str,
        required=True,
        help='Path to output LLVM IR file in LLVM 3.5-compatible format.',
    )
    options = cli.parse_args()

    # The input is opened first, then the output; both are closed on exit.
    with open(options.modern_input, 'r') as source_file, \
            open(options.legacy_output, 'w') as target_file:
        modern_ir = source_file.read()
        target_file.write(_convert_data(modern_ir))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment