Skip to content

Instantly share code, notes, and snippets.

@eliask
Last active February 3, 2018 17:07
Show Gist options
  • Save eliask/090922ff040f2f372149e5b8490186ca to your computer and use it in GitHub Desktop.
Save eliask/090922ff040f2f372149e5b8490186ca to your computer and use it in GitHub Desktop.
Parsing strace (network related) output with Python and parsy
'''
Parse (network-related) strace output with Python and parsy.

Warning: it turns out that parsy is very slow :(

Usage:

    import strace_grammar as grammar

    for line in strace_output:
        # Not interested in this strace metadata:
        if line.startswith('strace:'):
            continue
        parsed = grammar.line.parse(line)
        print(parsed)
'''
from parsy import string, regex, seq, alt, generate
# TODO: could return semantic namedtuples instead of anonymous tuples
from collections import namedtuple
# Optional run of whitespace; every token parser below swallows it after
# itself so that the grammar rules never have to mention spacing.
whitespace = regex(r'\s*')


def lexeme(p):
    """Return a parser that runs ``p`` and then discards trailing whitespace."""
    return p << whitespace


# Punctuation tokens, grouped by bracket pair.
lbrace, rbrace = lexeme(string('{')), lexeme(string('}'))
lbrack, rbrack = lexeme(string('[')), lexeme(string(']'))
lparen, rparen = lexeme(string('(')), lexeme(string(')'))
equals = lexeme(string('='))
comma = lexeme(string(','))

# Optionally negative decimal integer, converted to ``int``.
number = lexeme(regex(r'-?[0-9]+').map(int))
# Double-quoted string literals as printed by strace: runs of plain
# characters interleaved with C-style backslash escapes.
# NB: on Python 2 this could be simplified with str.decode('string_escape').

# Longest run of characters that needs no special handling (anything except
# the closing quote and the escape introducer).
string_part = regex(r'[^"\\]+')

# A single backslash escape, decoded to the character it stands for.
string_esc = string('\\') >> (
    string('\\')
    | string('/')
    | string('"')
    | string('b').result('\b')
    | string('v').result('\v')
    | string('f').result('\f')
    | string('n').result('\n')
    | string('r').result('\r')
    | string('t').result('\t')
    | regex(r'u[0-9a-fA-F]{4}').map(lambda s: chr(int(s[1:], 16)))
    # strace prints non-printable bytes as 1-3 digit *octal* escapes, as C
    # does (\33 is ESC, \177 is DEL), so only octal digits may be consumed
    # and they must be decoded in base 8.  Decoding them as decimal yields
    # the wrong character and swallows a following literal '8' or '9'.
    | regex(r'[0-7]{1,3}').map(lambda s: chr(int(s, 8)))
    | regex(r'x[0-9a-fA-F]{1,4}').map(lambda s: chr(int(s[1:], 16)))
)

# Complete quoted string; the result is the decoded contents without the
# surrounding quotes.  Trailing whitespace after the closing quote is dropped.
quoted = lexeme(string('"') >> (string_part | string_esc).many().concat() << string('"'))
# Dotted-quad IPv4 address such as 127.0.0.1, kept as a string (octet ranges
# are not validated).  The pattern must be a raw string: '\.' in a normal
# string literal is an invalid escape sequence that newer Pythons warn about.
ip_addr = lexeme(regex(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'))

# Lower-case identifier; used for syscall/function names and struct field names.
key_symbol = lexeme(regex(r"[a-z][a-z_0-9]*"))

# Upper-case identifier such as a constant or errno name.
symbol = lexeme(regex(r"[A-Z][A-Z_0-9]*"))

pipe = lexeme(string('|'))

# One or more upper-case names / hex constants joined by '|', returned as the
# raw matched text.
# NOTE: unlike the other tokens this one is not wrapped in `lexeme`, and
# because `[A-Z_0-9]+` also matches a bare run of digits, a non-negative
# integer offered to this parser is returned as a string rather than an int.
enum = regex(r'(0x[0-9a-fA-F]+|[A-Z_0-9]+)(\|(0x[0-9a-fA-F]+|[A-Z_0-9]+))*')

# A scalar literal: an integer or an upper-case symbol.
literal = number | symbol
# `array` and `value` refer to each other, so this rule is written in
# `generate` form: `value` is only looked up when the parser actually runs.
@generate
def array():
    """Parse a ``[...]`` list of comma-separated values into a Python list."""
    items = yield lbrack >> value.sep_by(comma) << rbrack
    return items
@generate
def record_array():
    """Parse a ``{...}`` group of comma-separated values into a Python list."""
    # Written in `generate` form because `value` is defined further down.
    items = yield lbrace >> value.sep_by(comma) << rbrace
    return items
@generate
def object_pair1():
    """Parse a ``name=value`` field and return it as a ``(name, value)`` tuple."""
    name = yield key_symbol << equals
    parsed = yield value
    return (name, parsed)
@generate
def object_pair_func():
    """Parse a function-call field, keyed by the function's name.

    Returns ``(name, (name, args))``: the key is the called function's name
    and the value is the complete result of the ``func`` parser.
    """
    call = yield func
    name, _call_args = call
    return (name, call)
# The "..." marker that stands in for omitted fields.
truncated = lexeme(string("..."))

# One entry of a `{...}` structure.  Alternatives are tried in order: a
# function-call field, then a `name=value` field, then the truncation marker,
# which is reported as the pair (None, None).
object_pair = alt(
    lexeme(object_pair_func),
    lexeme(object_pair1),
    truncated.result((None, None)),
)
@generate
def arguments():
    """Parse a comma-separated list of values, dropping trailing whitespace."""
    # `generate` form defers the lookup of `value`, which is defined later.
    return (yield lexeme(value.sep_by(comma)))
@generate
def func():
    """Parse a call such as ``name(arg, ...)`` into ``(name, [args])``."""
    name = yield key_symbol
    params = yield lparen >> arguments << rparen
    return (name, params)
# An ampersand followed by a lower-case identifier, kept as raw text.
ref = lexeme(regex(r'&[a-z][a-z0-9_]*'))

# A bracketed "N->M" pair of integers such as [16->16], kept as raw text.
# The pattern must be a raw string: '\[' and '\]' in a normal string literal
# are invalid escape sequences that newer Pythons warn about.
mapping = lexeme(regex(r'\[[0-9]+->[0-9]+\]'))

# A `{...}` structure of comma-separated entries, collected into a dict.
# A truncation marker inside the braces contributes a None -> None entry.
keyval = lbrace >> object_pair.sep_by(comma).map(dict) << rbrace
@generate
def value():
    """Parse any single argument or field value.

    Alternatives are tried strictly in the order listed and the first match
    wins, so the order is significant: for example ``enum`` is tried before
    ``literal``, and ``record_array`` before ``keyval``.
    """
    parsed = yield alt(
        ip_addr,
        enum,
        symbol,
        quoted,
        ref,
        func,
        literal,
        mapping,
        record_array,
        keyval,
        array,
    )
    return parsed
# Marker that ends a call interrupted before it returned.
unfinished = lexeme(string('<unfinished ...>'))

# Human-readable explanation that appears in parentheses after an errno name.
reason_text = regex(r'[a-zA-Z0-9_ ]+')

# Return value followed by an errno symbol and its parenthesised explanation.
retval_error = seq(number, symbol, lparen.then(reason_text).skip(rparen))

# Return value followed by the "<unfinished ...>" marker.
retval_unfinished = seq(number, unfinished)

# The two decorated forms start with a number too, so they have to be tried
# before falling back to a bare number.
retval = alt(retval_error, retval_unfinished, number)
# A call printed in full: name, parenthesised arguments, "=" and return value.
syscall_complete = seq(
    key_symbol,
    lparen.then(arguments).skip(rparen),
    equals.then(retval),
)

# A call cut short: name, the arguments printed so far (possibly followed by
# a dangling comma) and the "<unfinished ...>" marker instead of ") = ...".
syscall_partial = seq(
    key_symbol,
    lparen.then(arguments).skip(comma.optional()),
    unfinished,
)
# "+++ exited with N +++": the result is the exit status N.
syscall_exit = (
    string('+++ exited with ')
    .then(number)
    .skip(lexeme(string('+++')))
)

# "<... name resumed> args) = retval": the tail of a previously cut-off call.
syscall_resumed = seq(
    string('<... ').then(key_symbol).skip(lexeme(string('resumed>'))),
    arguments.optional().skip(rparen),
    equals.then(retval),
)

# Any of the supported line bodies, tried in this order.
syscall = alt(syscall_complete, syscall_partial, syscall_resumed, syscall_exit)
# "[pid N]" prefix; the result is N as an int.
pid = regex(r'\[pid +').then(number).skip(string(']'))

# A whole line: the pid prefix, exactly one space, then the line body.
line = seq(pid, string(" ").then(syscall))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment