pgoodman · October 3, 2018 16:49
diff --git a/transformations.py b/transformations.py
 import logging
 import os
 import re
 from subprocess import call
 import sys

 def _convert_data(data):
    '''
    'data' is a string of the 'disassembled' bitcode (could be 
    read in from file/stdin)

    Our tooling requires LLVM 3.5 bitcode. Mcsema uses LLVM <later version>
    bitcode. These substitutions remove information from LLVM 3.8 bitcode to
    make it compatible with llvm3.5. Additional transformations are added as
    necessary.

    returns a string of the disassembled bitcode suitable for running through
    llvm-as
    '''

    #data = re.sub('inbounds [%A-Za-z0-9]+,', '', data)
    
    # Get rid of volatile, etc.
    data = re.sub('load (half|float|double|x86_fp80|fp128),', 'load', data)
    data = re.sub('load volatile', 'load', data)
    data = re.sub('load volatile (half|float|double|x86_fp80|fp128),', 'load', data)

    # Remove the leading non-pointer type in loads.
    data = re.sub('load [^*,]+?,', 'load ', data)

    # Remove the leading non-pointer type in GEPs.
    data = re.sub('= getelementptr [^*,]+?,', '= getelementptr ', data)

    # Remove the non-leading pointer type in constant expression GEPs.
    data = re.sub('getelementptr inbounds \([^*,]+?,', 'getelementptr inbounds (', data)
    data = re.sub('getelementptr \([^*,]+?,', 'getelementptr (', data)

    # Add in a `*` for calls through function pointers.
    data = re.sub('call ([^ ]+) (\([^)]+\)) %([a-zA-Z0-9.]+)\(', r"call \1 \2 *%\3(", data)

    # fix calling syntax for varargs functions
    #data = re.sub('call void \(\.\.\.\) @', 'call void (...)* @', data)


    # Remove TBAA annotations
    data = re.sub(', !tbaa ![0-9]+', '', data)

    # Remove Range annotations
    data = re.sub(', !range ![0-9]+', '', data)

    # Remove metadatas with nested metadata
    data = re.sub(r'^![0-9]+ = !{!.*$', '', data)

    # Remove complex metadatas.
    data = re.sub(r'^![0-9]+ = ![^}]+.*', '', data)

    # Remove multi node metadata entries
    data = re.sub('!.* = !{.*,.*}', '', data)

    # Add the right type to metadata (these are basically the mcsema_real_eip).
    data = re.sub('= !{i64', ' = i64 !{i64', data)
    data = re.sub('= !{i32', ' = i32 !{i32', data)


    # put types in front of metadata we care about
    # remove attributes not in llvm 3.5
    data = re.sub('local_unnamed_addr', '', data)
    data = re.sub('source_filename = .*', '', data)


    # remove null refs or medata refs in medata
    data = re.sub('!.* = !{(null|!["0-9]).*', '', data)
    # remove debug metadata
    data = re.sub('!.* = (distinct )?!DI.*', '', data)
    # remove any empty stragglers
    data = re.sub('!.* = !{}', '', data)
    # remove attributes not in llvm 3.5
    data = re.sub('"no-signed-zeros-fp-math"="true" ', '', data)
    data = re.sub('"no-signed-zeros-fp-math"="false" ', '', data)
    data = re.sub('"no-trapping-math"="true" ', '', data)
    data = re.sub('"no-trapping-math"="false" ', '', data)
    data = re.sub('"stack-protector-buffer-size"="8" ', '', data)
    data = re.sub('"unsafe-fp-math"="true" ', '', data)
    data = re.sub('"unsafe-fp-math"="false" ', '', data)
    data = re.sub('"use-soft-float"="true" ', '', data)
    data = re.sub('"use-soft-float"="false" ', '', data)
    data = re.sub(' norecurse ', ' ', data)
    data = re.sub('"correctly-rounded-divide-sqrt-fp-math"="true"', ' ', data)
    data = re.sub('"correctly-rounded-divide-sqrt-fp-math"="false"', ' ', data)
    data = re.sub('"disable-tail-calls"="true"', ' ', data)
    data = re.sub('"disable-tail-calls"="false"', ' ', data)
    data = re.sub('"less-precise-fpmad"="true"', ' ', data)
    data = re.sub('"less-precise-fpmad"="false"', ' ', data)
    data = re.sub('"no-frame-pointer-elim"="true"', ' ', data)
    data = re.sub('"no-frame-pointer-elim"="false"', ' ', data)
    data = re.sub('"no-infs-fp-math"="true"', ' ', data)
    data = re.sub('"no-infs-fp-math"="false"', ' ', data)
    data = re.sub('"no-nans-fp-math"="true"', ' ', data)
    data = re.sub('"no-nans-fp-math"="false"', ' ', data)
    data = re.sub('"no-frame-pointer-elim"="true"', ' ', data)
    data = re.sub('"no-frame-pointer-elim"="false"', ' ', data)
    data = re.sub('"no-frame-pointer-elim-non-leaf"', ' ', data)
    data = re.sub('"no-jump-tables"="true"', ' ', data)
    data = re.sub('"no-jump-tables"="false"', ' ', data)


    # Remove empty attribute sets.
    data = re.sub('attributes #[0-9]+ = \{[ ]*\}', '', data)

    return data

 if __name__ == '__main__':
    # Script entry point: read modern LLVM IR from one path and write the
    # converted IR to another.
    logging.basicConfig(format='%(asctime)-15s %(levelname)s %(message)s',
                        level=logging.INFO)

    import argparse
    cli = argparse.ArgumentParser(
        description='Transform modern LLVM IR into old LLVM IR.')
    cli.add_argument('--modern_input', required=True, type=str,
                     help='Path to input LLVM IR file.')
    cli.add_argument('--legacy_output', required=True, type=str,
                     help='Path to output LLVM IR file in LLVM 3.5-compatible format.')
    options = cli.parse_args()

    # Both files are opened (input first) before any data is read.
    with open(options.modern_input, 'r') as modern_ir, \
            open(options.legacy_output, 'w') as legacy_ir:
        converted = _convert_data(modern_ir.read())
        legacy_ir.write(converted)
	import logging
	import os
	import re
	from subprocess import call
	import sys

	def _convert_data(data):
	'''
	'data' is a string of the 'disassembled' bitcode (could be
	read in from file/stdin)

	Our tooling requires LLVM 3.5 bitcode. Mcsema uses LLVM <later version>
	bitcode. These substitutions remove information from LLVM 3.8 bitcode to
	make it compatible with llvm3.5. Additional transformations are added as
	necessary.

	returns a string of the disassembled bitcode suitable for running through
	llvm-as
	'''

	#data = re.sub('inbounds [%A-Za-z0-9]+,', '', data)

	# Get rid of volatile, etc.
	data = re.sub('load (half\|float\|double\|x86_fp80\|fp128),', 'load', data)
	data = re.sub('load volatile', 'load', data)
	data = re.sub('load volatile (half\|float\|double\|x86_fp80\|fp128),', 'load', data)

	# Remove the leading non-pointer type in loads.
	data = re.sub('load [^*,]+?,', 'load ', data)

	# Remove the leading non-pointer type in GEPs.
	data = re.sub('= getelementptr [^*,]+?,', '= getelementptr ', data)

	# Remove the non-leading pointer type in constant expression GEPs.
	data = re.sub('getelementptr inbounds \([^*,]+?,', 'getelementptr inbounds (', data)
	data = re.sub('getelementptr \([^*,]+?,', 'getelementptr (', data)

	# Add in a `*` for calls through function pointers.
	data = re.sub('call ([^ ]+) (\([^)]+\)) %([a-zA-Z0-9.]+)\(', r"call \1 \2 *%\3(", data)

	# fix calling syntax for varargs functions
	#data = re.sub('call void \(\.\.\.\) @', 'call void (...)* @', data)


	# Remove TBAA annotations
	data = re.sub(', !tbaa ![0-9]+', '', data)

	# Remove Range annotations
	data = re.sub(', !range ![0-9]+', '', data)

	# Remove metadatas with nested metadata
	data = re.sub(r'^![0-9]+ = !{!.*$', '', data)

	# Remove complex metadatas.
	data = re.sub(r'^![0-9]+ = ![^}]+.*', '', data)

	# Remove multi node metadata entries
	data = re.sub('!.* = !{.,.}', '', data)

	# Add the right type to metadata (these are basically the mcsema_real_eip).
	data = re.sub('= !{i64', ' = i64 !{i64', data)
	data = re.sub('= !{i32', ' = i32 !{i32', data)


	# put types in front of metadata we care about
	# remove attributes not in llvm 3.5
	data = re.sub('local_unnamed_addr', '', data)
	data = re.sub('source_filename = .*', '', data)


	# remove null refs or medata refs in medata
	data = re.sub('!.* = !{(null\|!["0-9]).*', '', data)
	# remove debug metadata
	data = re.sub('!.* = (distinct )?!DI.*', '', data)
	# remove any empty stragglers
	data = re.sub('!.* = !{}', '', data)
	# remove attributes not in llvm 3.5
	data = re.sub('"no-signed-zeros-fp-math"="true" ', '', data)
	data = re.sub('"no-signed-zeros-fp-math"="false" ', '', data)
	data = re.sub('"no-trapping-math"="true" ', '', data)
	data = re.sub('"no-trapping-math"="false" ', '', data)
	data = re.sub('"stack-protector-buffer-size"="8" ', '', data)
	data = re.sub('"unsafe-fp-math"="true" ', '', data)
	data = re.sub('"unsafe-fp-math"="false" ', '', data)
	data = re.sub('"use-soft-float"="true" ', '', data)
	data = re.sub('"use-soft-float"="false" ', '', data)
	data = re.sub(' norecurse ', ' ', data)
	data = re.sub('"correctly-rounded-divide-sqrt-fp-math"="true"', ' ', data)
	data = re.sub('"correctly-rounded-divide-sqrt-fp-math"="false"', ' ', data)
	data = re.sub('"disable-tail-calls"="true"', ' ', data)
	data = re.sub('"disable-tail-calls"="false"', ' ', data)
	data = re.sub('"less-precise-fpmad"="true"', ' ', data)
	data = re.sub('"less-precise-fpmad"="false"', ' ', data)
	data = re.sub('"no-frame-pointer-elim"="true"', ' ', data)
	data = re.sub('"no-frame-pointer-elim"="false"', ' ', data)
	data = re.sub('"no-infs-fp-math"="true"', ' ', data)
	data = re.sub('"no-infs-fp-math"="false"', ' ', data)
	data = re.sub('"no-nans-fp-math"="true"', ' ', data)
	data = re.sub('"no-nans-fp-math"="false"', ' ', data)
	data = re.sub('"no-frame-pointer-elim"="true"', ' ', data)
	data = re.sub('"no-frame-pointer-elim"="false"', ' ', data)
	data = re.sub('"no-frame-pointer-elim-non-leaf"', ' ', data)
	data = re.sub('"no-jump-tables"="true"', ' ', data)
	data = re.sub('"no-jump-tables"="false"', ' ', data)


	# Remove empty attribute sets.
	data = re.sub('attributes #[0-9]+ = \{[ ]*\}', '', data)

	return data

	if __name__ == '__main__':
	    # Script entry point: read modern LLVM IR from --modern_input, convert
	    # it, and write the result to --legacy_output.
	    logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s %(message)s')

	    import argparse
	    parser = argparse.ArgumentParser(description='Transform modern LLVM IR into old LLVM IR.')
	    parser.add_argument('--modern_input', type=str, required=True,
	                        help='Path to input LLVM IR file.')

	    parser.add_argument('--legacy_output', type=str, required=True,
	                        help='Path to output LLVM IR file in LLVM 3.5-compatible format.')

	    args = parser.parse_args()

	    # The input is opened first, so a missing input path fails before the
	    # output file is created or truncated.
	    with open(args.modern_input, 'r') as input_ir:
	        with open(args.legacy_output, 'w') as output_ir:
	            output_ir.write(_convert_data(input_ir.read()))
No results found