Skip to content

Instantly share code, notes, and snippets.

@williballenthin
Last active August 16, 2021 13:37
Show Gist options
  • Save williballenthin/ea1fef4984ea31ae92e333f04e9d0110 to your computer and use it in GitHub Desktop.
Save williballenthin/ea1fef4984ea31ae92e333f04e9d0110 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
'''
A simplified FLOSS implementation that only supports stackstrings.
requirements:
- yara-python
- unicorn
author: Willi Ballenthin
email: [email protected]
'''
import os
import re
import sys
import zipfile
import os.path
import logging
import collections
import yara
import unicorn
import argparse
logger = logging.getLogger(__name__)
# via: https://gist.github.com/williballenthin/8e3913358a7996eab9b96bd57fc59df2
# the body of a regex character class covering the printable ASCII bytes
# (space through tilde) plus tab.
# ascii_strings() and unicode_strings() interpolate it between `[` and `]`,
# which is why the regex metacharacters in it are backslash-escaped.
ASCII_BYTE = b' !\"#\$%&\'\(\)\*\+,-\./0123456789:;<=>\?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]\^_`abcdefghijklmnopqrstuvwxyz\{\|\}\\\~\t'
def ascii_strings(buf, n=4):
    '''
    find runs of printable ASCII bytes in a buffer and decode them.

    Args:
      buf (bytes): the raw bytes to search.
      n (int): the minimum number of characters in a run, default: 4.

    Yields:
      str: each run of at least `n` printable characters, in buffer order.
    '''
    pattern = re.compile(b'([%s]{%d,})' % (ASCII_BYTE, n))
    yield from (hit.group().decode('ascii') for hit in pattern.finditer(buf))
def unicode_strings(buf, n=4):
    '''
    extract UTF-16LE encoded strings from the given bytes.

    only characters from the printable ASCII set are recognized, that is,
    each character is one printable byte followed by a NULL byte.

    Args:
      buf (bytes): the raw bytes to search.
      n (int): the minimum string length in characters, default: 4.

    Yields:
      str: the extracted string.
    '''
    uni_re = re.compile(b'((?:[%s]\x00){%d,})' % (ASCII_BYTE, n))
    for match in uni_re.finditer(buf):
        try:
            # decode explicitly as little-endian: the plain 'utf-16' codec
            # falls back to the host's native byte order when there is no BOM,
            # which would mis-decode these strings on a big-endian machine.
            string = match.group().decode('utf-16-le')
        except UnicodeDecodeError:
            # defensive: skip anything that is not valid UTF-16LE after all.
            continue
        yield string
def align(value, alignment):
    '''
    round a value up to the nearest multiple of an alignment.
    a value that is already a multiple is returned unchanged.

    Args:
      value (int): the value to round up.
      alignment (int): the increment to round up to.

    Returns:
      int: the smallest multiple of `alignment` that is not less than `value`.
    '''
    remainder = value % alignment
    if not remainder:
        return value

    padding = alignment - remainder
    return value + padding
# address at which emulate_stackstrings() maps and starts executing the instruction bytes.
CODE_ADDR = 0x00401000
# lowest address of the stack region that emulate_stackstrings() maps and later reads back.
STACK_ADDR = 0x69690000
def emulate_stackstrings(insns):
    '''
    emulate the given instructions and return the stack memory region.

    its assumed that the instructions simply manipulate the stack memory.
    the instructions are executed as 64-bit x86 code mapped at CODE_ADDR;
    the code region is sized to fit them, rounded up to whole pages.
    the state of general purpose registers, aside from RSP and RBP, are undefined.
    the active stack frame region between RBP and RSP is 0x1000 bytes in size.

    emulation errors (for example, an access to unmapped memory) are logged at
    debug level and otherwise ignored: whatever was written to the stack before
    the error is still returned.

    Args:
      insns (bytes): the raw stack string creation instructions.
        when empty, nothing is emulated and the untouched stack is returned.

    Returns:
      bytes-like: the 0x3000 bytes of stack memory after emulation.
    '''
    page_size = 0x1000
    stack_size = 3 * page_size

    emu = unicorn.Uc(unicorn.UC_ARCH_X86, unicorn.UC_MODE_64)

    logger.debug('mapping instructions')
    # align() returns 0 for an empty buffer and a zero-sized mapping is rejected,
    # so always reserve at least one page for the code.
    code_size = align(len(insns), page_size) or page_size
    emu.mem_map(CODE_ADDR, code_size)

    # stack layout:
    #
    #   min-addr -> STACK_ADDR
    #   rsp ------> STACK_ADDR + 0x1000
    #   rbp ------> STACK_ADDR + 0x2000
    #   max-addr -> STACK_ADDR + 0x3000
    logger.debug('mapping stack')
    emu.mem_map(STACK_ADDR, stack_size)
    emu.reg_write(unicorn.x86_const.UC_X86_REG_RSP, STACK_ADDR + page_size)
    emu.reg_write(unicorn.x86_const.UC_X86_REG_RBP, STACK_ADDR + 2 * page_size)

    if insns:
        emu.mem_write(CODE_ADDR, insns)
        try:
            logger.debug('emulating instructions')
            emu.emu_start(CODE_ADDR, CODE_ADDR + len(insns))
        except unicorn.UcError as e:
            # best effort: keep whatever reached the stack before the fault.
            logger.debug('emulation error: %s', str(e))

    logger.debug('fetching stack region')
    return emu.mem_read(STACK_ADDR, stack_size)
def _iter_string_hits(match):
    '''
    yield (offset, string name, matched bytes) for each string hit of one YARA match,
    accepting both the tuple layout and the StringMatch/instances layout of `match.strings`.
    '''
    for entry in match.strings:
        instances = getattr(entry, 'instances', None)
        if instances is None:
            # older yara-python: plain (offset, identifier, data) tuples.
            offset, sname, data = entry
            yield offset, sname, data
        else:
            # yara-python 4.3+: one object per string, with one instance per hit.
            for instance in instances:
                yield instance.offset, entry.identifier, instance.matched_data


def match_yara(rules, buf):
    '''
    match the given YARA rules against the given buffer.
    ignore overlapping hits for the same rule; that is, greedily take the first hit.
    a hit is considered overlapping when its start offset falls inside the bytes
    of a hit that was already reported for the same rule.

    Args:
      rules (yara.Rules): the compiled YARA rules.
      buf (bytes): the bytes to search.

    Yields:
      tuple[str, str, int, bytes]: the matched rule name, string name, hit offset, and hit bytes.
    '''
    # rule name -> every buffer offset already covered by a reported hit.
    seen = collections.defaultdict(set)
    for match in rules.match(data=buf):
        covered = seen[match.rule]
        for offset, sname, s in _iter_string_hits(match):
            if offset in covered:
                continue
            yield match.rule, sname, offset, s
            covered.update(range(offset, offset + len(s)))
def extract_stackstrings(rules, buf):
    '''
    recover stack strings from a buffer using YARA hits and emulation.

    each YARA hit is treated as a candidate run of stack string creation instructions.
    the hit bytes are emulated, and the resulting stack region is searched
    first for ASCII strings and then for UTF-16LE strings.

    Args:
      rules (yara.Rules): the compiled YARA rules.
      buf (bytes): the bytes to search.

    Yields:
      tuple[int, str]: the instructions start offset and decoded string.
    '''
    logger.debug('matching YARA rules...')
    for _rule_name, _string_name, offset, insns in match_yara(rules, buf):
        stack = emulate_stackstrings(insns)
        # ASCII results come before the UTF-16LE results for each hit.
        for extractor in (ascii_strings, unicode_strings):
            for decoded in extractor(stack):
                yield offset, decoded
# a file to scan: `path` is a display path, `f` is a file object opened for binary reading.
FileEntry = collections.namedtuple('FileEntry', ['path', 'f'])


def _enumerate_zip_members(path, password):
    '''yield a FileEntry for each member of the zip archive at `path`, named `<path>:<member>`.'''
    with zipfile.ZipFile(path, 'r') as z:
        z.setpassword(password.encode('ascii'))
        for name in z.namelist():
            with z.open(name, 'r') as f:
                yield FileEntry(path='%s:%s' % (path, name), f=f)


def enumerate_files(path, password):
    '''
    generate the files found at or under the given path.

    `path` may be a single file or a directory, which is walked recursively.
    files whose name ends with `.zip` are not yielded themselves;
    instead, each of their members is yielded.

    each yielded file object is closed as soon as the generator is advanced,
    so it must be read before requesting the next entry.

    Args:
      path (str): file system path to a file or directory.
      password (str): the password used to decrypt zip members; must be ASCII.

    Yields:
      FileEntry: the display path and an open binary file object.
        for zip members the path is `<zip path>:<member name>`.
    '''
    if os.path.isfile(path):
        if path.lower().endswith('.zip'):
            yield from _enumerate_zip_members(path, password)
        else:
            # os.walk() yields nothing for a plain file, so handle it here
            # rather than silently scanning nothing.
            with open(path, 'rb') as f:
                yield FileEntry(path=path, f=f)
        return

    for root, _dirs, files in os.walk(path):
        for filename in files:
            file_path = os.path.join(root, filename)
            if filename.lower().endswith('.zip'):
                yield from _enumerate_zip_members(file_path, password)
            else:
                with open(file_path, 'rb') as f:
                    yield FileEntry(path=file_path, f=f)
def _open_input(path):
    '''yield a single FileEntry for `path`, opened for binary reading and closed once consumed.'''
    with open(path, 'rb') as f:
        yield FileEntry(path=path, f=f)


def main(argv=None):
    '''
    command line entry point: scan the input for stack strings and print them.

    each result is printed as `<path>:0x<offset>: <string>`, where the offset is
    the position in the file of the instructions that build the string.

    Args:
      argv (list[str]): the command line arguments, excluding the program name.
        default: `sys.argv[1:]`.

    Returns:
      int: the process exit code, always 0.
    '''
    if argv is None:
        argv = sys.argv[1:]

    # by default, the rules are expected to sit next to this script.
    cd = os.path.dirname(__file__)

    parser = argparse.ArgumentParser(description='Extract stack strings from raw x86 blobs.')
    parser.add_argument("input", type=str, help="Path to input file")
    parser.add_argument("--yara", type=str, help="Path to file containing YARA rules",
                        default=os.path.join(cd, "stackstrings.yara"))
    parser.add_argument("-r", "--recursive", action="store_true",
                        help="Enable recursive scanning of files")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Enable debug logging")
    parser.add_argument("-q", "--quiet", action="store_true",
                        help="Disable all output but errors")
    args = parser.parse_args(args=argv)

    if args.verbose:
        level = logging.DEBUG
    elif args.quiet:
        level = logging.ERROR
    else:
        level = logging.INFO
    logging.basicConfig(level=level)
    # basicConfig() does nothing when the root logger is already configured,
    # so set the level explicitly as well.
    logging.getLogger().setLevel(level)

    logger.debug('compiling YARA rules...')
    rules = yara.compile(args.yara)

    if args.recursive:
        logger.debug('collecting files to scan...')
        files = enumerate_files(args.input, password='infected')
    else:
        # read the input as raw bytes (the same way the recursive path does),
        # and close the handle once it has been scanned.
        files = _open_input(args.input)

    for entry in files:
        logger.debug('scanning file: %s', entry.path)
        buf = entry.f.read()
        for offset, string in extract_stackstrings(rules, buf):
            print('%s:0x%x: %s' % (entry.path, offset, string))

    return 0
if __name__ == "__main__":
    # run the command line interface and hand its return value to the shell as the exit status.
    exit_status = main()
    sys.exit(exit_status)
// match x86 byte sequences that look like a string being assembled on the stack,
// one `mov <memory>, <immediate>` instruction at a time.
rule stack_strings
{
meta:
author = "William Ballenthin"
email = "[email protected]"
license = "Apache 2.0"
copyright = "FireEye, Inc"
description = "Match x86 that appears to be stack string creation."
strings:
// stack string near the frame pointer.
// the compiler may choose to use a single byte offset from $bp.
// like: mov byte ptr [ebp-10h], 25h
//
// regex explanation:
//   4 or more times:
//     byte C6 (mov)
//     byte 45 ($bp-relative, one-byte offset)
//     any byte (the offset from $bp)
//     printable ascii (the immediate constant)
//   then once:
//     byte C6 (mov)
//     byte 45 ($bp-relative, one-byte offset)
//     any byte (the offset from $bp)
//     byte 00 (the immediate constant, null terminator)
$ss_small_bp = /(\xC6\x45.[a-zA-Z0-9 -~]){4,}\xC6\x45.\x00/
// stack strings further away from the frame pointer.
// the compiler may choose to use a four-byte offset from $bp.
// like: mov byte ptr [ebp-D80h], 5Ch
// we restrict the offset to negative values close to the frame pointer:
// the upper bytes of the offset must be FF FF Fx.
//
// regex explanation:
//   4 or more times:
//     byte C6 (mov)
//     byte 85 ($bp-relative, four-byte offset)
//     any byte (LSB of the offset from $bp)
//     byte 0xF0-0xFF (second LSB of the offset from $bp)
//     byte FF (second MSB)
//     byte FF (MSB of the offset from $bp)
//     printable ascii (the immediate constant)
//   then once:
//     byte C6 (mov)
//     byte 85 ($bp-relative, four-byte offset)
//     any byte (LSB of the offset from $bp)
//     byte 0xF0-0xFF (second LSB of the offset from $bp)
//     byte FF (second MSB)
//     byte FF (MSB of the offset from $bp)
//     byte 00 (the immediate constant, null terminator)
$ss_big_bp = /(\xC6\x85.[\xF0-\xFF]\xFF\xFF[a-zA-Z0-9 -~]){4,}\xC6\x85.[\xF0-\xFF]\xFF\xFF\x00/
// stack string near the stack pointer.
// the compiler may choose to use a single byte offset from $sp.
// like: mov byte ptr [esp+0Bh], 24h
//
// regex explanation:
//   4 or more times:
//     byte C6 (mov)
//     byte 44 ($sp-relative, one-byte offset)
//     byte 24 ($sp-relative, one-byte offset)
//     any byte (the offset from $sp)
//     printable ascii (the immediate constant)
//   then once:
//     byte C6 (mov)
//     byte 44 ($sp-relative, one-byte offset)
//     byte 24 ($sp-relative, one-byte offset)
//     any byte (the offset from $sp)
//     byte 00 (the immediate constant, null terminator)
$ss_small_sp = /(\xC6\x44\x24.[a-zA-Z0-9 -~]){4,}\xC6\x44\x24.\x00/
// stack strings further away from the stack pointer.
// the compiler may choose to use a four-byte offset from $sp.
// like: mov byte ptr [esp+0DDh], 49h
// we restrict the offset to be within 0xFFF (4095) of the stack pointer.
//
// regex explanation:
//   4 or more times:
//     byte C6 (mov)
//     byte 84 ($sp-relative, four-byte offset)
//     byte 24 ($sp-relative, four-byte offset)
//     any byte (LSB of the offset from $sp)
//     byte 0x00-0x0F (second LSB of the offset from $sp)
//     byte 00 (second MSB)
//     byte 00 (MSB of the offset from $sp)
//     printable ascii (the immediate constant)
//   then once:
//     byte C6 (mov)
//     byte 84 ($sp-relative, four-byte offset)
//     byte 24 ($sp-relative, four-byte offset)
//     any byte (LSB of the offset from $sp)
//     byte 0x00-0x0F (second LSB of the offset from $sp)
//     byte 00 (second MSB)
//     byte 00 (MSB of the offset from $sp)
//     byte 00 (the immediate constant, null terminator)
$ss_big_sp = /(\xC6\x84\x24.[\x00-\x0F]\x00\x00[a-zA-Z0-9 -~]){4,}\xC6\x84\x24.[\x00-\x0F]\x00\x00\x00/
// ascii string written four characters at a time via a register-relative dword mov.
// like:
// .text:10001A34 C7 06 57 61 69 74 mov dword ptr [esi], 74696157h
// .text:10001A3A C7 46 04 46 6F 72 53 mov dword ptr [esi+4], 53726F46h
// .text:10001A41 C7 46 08 69 6E 67 6C mov dword ptr [esi+8], 6C676E69h
// .text:10001A48 C7 46 0C 65 4F 62 6A mov dword ptr [esi+0Ch], 6A624F65h
// .text:10001A4F C7 46 10 65 63 74 00 mov dword ptr [esi+10h], 746365h
//
// regex explanation:
//   2 or more times:
//     byte C7 (mov)
//     one or two arbitrary bytes (the destination operand encoding)
//     four printable ascii bytes (the immediate constant)
$ss_small_reg_dword_ascii = /(\xC7.?.[a-zA-Z0-9 -~][a-zA-Z0-9 -~][a-zA-Z0-9 -~][a-zA-Z0-9 -~]){2,}/
// unicode (UTF-16LE) string written two characters at a time via a dword mov.
// like:
// .text:0000000180008993 C7 44 24 64 6D 00 6C 00 mov dword ptr [rsp+64h], 6C006Dh
// .text:000000018000899B C7 44 24 68 2E 00 64 00 mov dword ptr [rsp+68h], 64002Eh
// .text:00000001800089A3 C7 44 24 6C 6C 00 6C 00 mov dword ptr [rsp+6Ch], 6C006Ch
// or like:
// .text:0000000180008916 C7 45 08 65 00 73 00 mov dword ptr [rbp+8], 730065h
// .text:000000018000891D C7 45 0C 2E 00 64 00 mov dword ptr [rbp+0Ch], 64002Eh
// .text:0000000180008924 C7 45 10 6C 00 6C 00 mov dword ptr [rbp+10h], 6C006Ch
//
// regex explanation:
//   2 or more times:
//     byte C7 (mov)
//     one to three arbitrary bytes (the destination operand encoding)
//     printable ascii, byte 00, printable ascii, byte 00 (the immediate constant)
$ss_small_reg_dword_uni = /(\xC7.?.?.[a-zA-Z0-9 -~]\x00[a-zA-Z0-9 -~]\x00){2,}/
condition:
// any one of the patterns above is enough to report a hit.
$ss_small_bp or $ss_big_bp or $ss_small_sp or $ss_big_sp or $ss_small_reg_dword_ascii or $ss_small_reg_dword_uni
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment