moyix · July 26, 2017 00:07
diff --git a/panda_taint.sh b/panda_taint.sh
 # We're going to do a simple demo of using PANDA to do a dynamic taint
 # analysis of a program that parses a file. The program we're using is
 # the "who" utility, which parses a binary log file (utmp).
 # This assumes that you've got a build of PANDA:
 ls ~/git/panda/build/i386-softmmu/qemu-system-i386
 # To start off, we'll create a recording of running who. We can do this
 # using PANDA's run_debian.py script, which will automatically download a
 # 32-bit Linux image and run a command in it. It will even copy in any
 # files needed for you.
 ~/git/panda/panda/scripts/run_debian.py who /var/run/utmp 
 # We now have a recording of running "who" on the *host's* /var/run/utmp
 ls
 ls replays/who/
 # We can replay it
 ~/git/panda/build/i386-softmmu/qemu-system-i386 -replay replays/who/who
 # Now we can run our taint analysis. This is done using the "file_taint"
 # plugin. Taint analysis is expensive, so it helps to restrict our
 # attention to just the subset of the replay that contains the behavior
 # we're interested in.
 # We can figure out where the file we care about gets opened using the
 # "notaint" option of file_taint, but first let's get help on the plugin:
 ~/git/panda/build/i386-softmmu/qemu-system-i386 -replay replays/who/who -panda file_taint:help
 # Since intercepting open() is an OS-specific behavior, we need to tell
 # PANDA what OS we're running in the guest. For the image used by
 # run_debian.py, it's "linux-32-lava32".
 # We also need to run this with a filename, so that we can see at what
 # instruction we open the file we're interested in:
 ~/git/panda/build/i386-softmmu/qemu-system-i386 -replay replays/who/who -panda file_taint:notaint,filename=utmp -os linux-32-lava32
 # So we can see that the first time we see the file we want to taint is
 # around instruction 10373977. Now we can run PANDA's taint analysis for
 # real. But wait! What analysis do we actually want to do?
 #
 # The file_taint plugin applies taint labels, but doesn't say when to do
 # taint *queries*. For that, we can turn to other plugins, such as
 # tainted_branch and tainted_instr.
 #
 # The latter -- tainted_instr -- is what we'll use. It queries taint
 # whenever an instruction that handles tainted data is seen, and logs
 # info about it in pandalog format to the file given in the -pandalog
 # argument.
 #
 # We will also use the "pos" option (for "positional labels") to
 # file_taint, so that it creates one label per byte in the input file.
 # The analysis can take a while -- on my system, about 1.5 minutes.
 # Larger files and more complex programs can take much longer.

 ~/git/panda/build/i386-softmmu/qemu-system-i386 -replay replays/who/who -panda file_taint:filename=utmp,pos,first_instr=10373977 -os linux-32-lava32 -panda tainted_instr -pandalog utmp.plog

 # We can see the contents of the pandalog with the plog_reader.py script
 ~/git/panda/panda/scripts/plog_reader.py utmp.plog 
 # Note that this may be a little slow, however
 # As an alternative, I've written a multithreaded pandalog taint parser in C++
 git clone https://github.com/moyix/pandalog_taint_parser
 cd pandalog_taint_parser/
 ls
 # You'll need Google Protocol Buffers and zlib installed to build it
 make
 ./label_pcs ../utmp.plog 
 # And now we have a nice file that maps each input label (i.e. each byte of the input file) to all the program counters that handled it
 ls
 less -S label_pcs.txt
 # What do we do with those PC's? Well, one obvious thing is to go look
 # at them in IDA Pro. One could probably automate this fairly easily.
 #
 # Note that the addresses shown here are the ones that were observed in
 # the guest system -- so to map them back to an address in the appropriate
 # library or program, you'll need to figure out where that program or
 # library was loaded in the guest. This is left as an exercise for the
 # reader, but the OSI get_libraries function may help.
 #
 # That's all for now! Enjoy PANDA's taint analysis! You can do lots of            
 # cool stuff with it. One thing we didn't talk about at all is the TCN            
 # feature -- this tracks the amount of computation done on the data               
 # that you've tainted. Here's a sneak peek...
 ./label_tcn ../utmp.plog                                                          
 less label_tcn.txt                                                                
 awk '$2 != 0' label_tcn.txt | less                                                
 # This gives us a way to measure what the most "important" (or, at least,         
 # the most "computed on") bytes of an input file are.                             
 # Ok, that really is all for now....
	# We're going to do a simple demo of using PANDA to do a dynamic taint
	# analysis of a program that parses a file. The program we're using is
	# the "who" utility, which parses a binary log file (utmp).
	# This assumes that you've got a build of PANDA:
	ls ~/git/panda/build/i386-softmmu/qemu-system-i386
	# To start off, we'll create a recording of running who. We can do this
	# using PANDA's run_debian.py script, which will automatically download a
	# 32-bit Linux image and run a command in it. It will even copy in any
	# files needed for you.
	~/git/panda/panda/scripts/run_debian.py who /var/run/utmp
	# We now have a recording of running "who" on the host's /var/run/utmp
	ls
	ls replays/who/
	# We can replay it
	~/git/panda/build/i386-softmmu/qemu-system-i386 -replay replays/who/who
	# Now we can run our taint analysis. This is done using the "file_taint"
	# plugin. Taint analysis is expensive, so it helps to restrict our
	# attention to just the subset of the replay that contains the behavior
	# we're interested in.
	# We can figure out where the file we care about gets opened using the
	# "notaint" option of file_taint, but first let's get help on the plugin:
	~/git/panda/build/i386-softmmu/qemu-system-i386 -replay replays/who/who -panda file_taint:help
	# Since intercepting open() is an OS-specific behavior, we need to tell
	# PANDA what OS we're running in the guest. For the image used by
	# run_debian.py, it's "linux-32-lava32".
	# We also need to run this with a filename, so that we can see at what
	# instruction we open the file we're interested in:
	~/git/panda/build/i386-softmmu/qemu-system-i386 -replay replays/who/who -panda file_taint:notaint,filename=utmp -os linux-32-lava32
	# So we can see that the first time we see the file we want to taint is
	# around instruction 10373977. Now we can run PANDA's taint analysis for
	# real. But wait! What analysis do we actually want to do?
	#
	# The file_taint plugin applies taint labels, but doesn't say when to do
	# taint queries. For that, we can turn to other plugins, such as
	# tainted_branch and tainted_instr.
	#
	# The latter -- tainted_instr -- is what we'll use. It queries taint
	# whenever an instruction that handles tainted data is seen, and logs
	# info about it in pandalog format to the file given in the -pandalog
	# argument.
	#
	# We will also use the "pos" option (for "positional labels") to
	# file_taint, so that it creates one label per byte in the input file.
	# The analysis can take a while -- on my system, about 1.5 minutes.
	# Larger files and more complex programs can take much longer.

	~/git/panda/build/i386-softmmu/qemu-system-i386 -replay replays/who/who -panda file_taint:filename=utmp,pos,first_instr=10373977 -os linux-32-lava32 -panda tainted_instr -pandalog utmp.plog

	# We can see the contents of the pandalog with the plog_reader.py script
	~/git/panda/panda/scripts/plog_reader.py utmp.plog
	# Note that this may be a little slow, however
	# As an alternative, I've written a multithreaded pandalog taint parser in C++
	git clone https://github.com/moyix/pandalog_taint_parser
	cd pandalog_taint_parser/
	ls
	# You'll need Google Protocol Buffers and zlib installed to build it
	make
	./label_pcs ../utmp.plog
	# And now we have a nice file that maps each input label (i.e. each byte of the input file) to all the program counters that handled it
	ls
	less -S label_pcs.txt
	# What do we do with those PC's? Well, one obvious thing is to go look
	# at them in IDA Pro. One could probably automate this fairly easily.
	#
	# Note that the addresses shown here are the ones that were observed in
	# the guest system -- so to map them back to an address in the appropriate
	# library or program, you'll need to figure out where that program or
	# library was loaded in the guest. This is left as an exercise for the
	# reader, but the OSI get_libraries function may help.
	#
	# That's all for now! Enjoy PANDA's taint analysis! You can do lots of
	# cool stuff with it. One thing we didn't talk about at all is the TCN
	# feature -- this tracks the amount of computation done on the data
	# that you've tainted. Here's a sneak peek...
	# Run the TCN tool on the pandalog, then page through label_tcn.txt.
	./label_tcn ../utmp.plog
	less label_tcn.txt
	# Show only the rows whose second field is nonzero. The pipe must stay
	# unescaped: a backslash-escaped pipe would be handed to awk as a literal
	# "|" file operand (followed by "less") instead of paging awk's output.
	awk '$2 != 0' label_tcn.txt | less
	# This gives us a way to measure what the most "important" (or, at least,
	# the most "computed on") bytes of an input file are.
	# Ok, that really is all for now....