Last active
February 3, 2018 17:07
-
-
Save eliask/090922ff040f2f372149e5b8490186ca to your computer and use it in GitHub Desktop.
Parsing strace (network related) output with Python and parsy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Warning: Turns out parsy is very slow :(

Usage:

    import strace_grammar as grammar

    for line in strace_output:
        # not interested in this strace metadata:
        if line.startswith('strace:'):
            continue
        parsed = grammar.line.parse(line)
        print(parsed)
'''
from parsy import string, regex, seq, alt, generate | |
# TODO: could return semantic namedtuples instead of anonymous tuples
from collections import namedtuple | |
# Zero or more whitespace characters; skipped after every token.
whitespace = regex(r'\s*')


def lexeme(p):
    """Return *p* followed by optional whitespace, keeping only *p*'s result."""
    return p << whitespace


# Single-character punctuation tokens.
lbrace = lexeme(string('{'))
rbrace = lexeme(string('}'))
lbrack = lexeme(string('['))
rbrack = lexeme(string(']'))
lparen = lexeme(string('('))
rparen = lexeme(string(')'))
equals = lexeme(string('='))
comma = lexeme(string(','))

# Optionally negative decimal integer, converted to int.
number = lexeme(regex(r'-?[0-9]+').map(int))
# NB: could be simplified with Python 2's str.decode('string_escape').
# Run of ordinary characters: anything except a double quote or a backslash.
string_part = regex(r'[^"\\]+')
# One escape sequence: a backslash followed by an escape code, mapped to the
# single character it denotes.
string_esc = string('\\') >> (
    string('\\')
    | string('/')
    | string('"')
    | string('b').result('\b')
    | string('v').result('\v')
    | string('f').result('\f')
    | string('n').result('\n')
    | string('r').result('\r')
    | string('t').result('\t')
    | regex(r'u[0-9a-fA-F]{4}').map(lambda s: chr(int(s[1:], 16)))
    # strace writes non-printable bytes as C-style *octal* escapes
    # (\0, \33, \177), so the digits must be read in base 8, not base 10.
    | regex(r'[0-7]{1,3}').map(lambda s: chr(int(s, 8)))
    | regex(r'x[0-9a-fA-F]{1,4}').map(lambda s: chr(int(s[1:], 16)))
)
# Double-quoted string literal; the result is the decoded contents without
# the surrounding quotes (an empty literal yields '').
quoted = lexeme(string('"') >> (string_part | string_esc).many().concat() << string('"'))
# Dotted-quad IPv4 address, kept as a string.  The pattern is a raw string so
# that `\.` reaches the regex engine instead of being an invalid Python
# string escape.
ip_addr = lexeme(regex(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'))
# Lower-case identifier: syscall names and struct field names.
key_symbol = lexeme(regex(r"[a-z][a-z_0-9]*"))
# Upper-case identifier: constants such as errno names.
symbol = lexeme(regex(r"[A-Z][A-Z_0-9]*"))
pipe = lexeme(string('|'))
# One or more hex numbers / upper-case constants joined by '|', kept as the
# raw matched string.  Note that the pattern also matches a single bare
# constant or a plain run of digits.  Wrapped in `lexeme` like every other
# token so trailing whitespace is skipped consistently.
enum = lexeme(regex(r'(0x[0-9a-fA-F]+|[A-Z_0-9]+)(\|(0x[0-9a-fA-F]+|[A-Z_0-9]+))*'))
literal = number | symbol
# `array` and `value` refer to each other, so `array` is written in the
# `generate` form, which looks `value` up only when the parser actually runs.
@generate
def array():
    """Parse `[v1, v2, ...]` and return the list of parsed values."""
    items = yield lbrack >> value.sep_by(comma)
    yield rbrack
    return items
@generate
def record_array():
    """Parse `{v1, v2, ...}` (brace-delimited values) and return them as a list."""
    yield lbrace
    items = yield value.sep_by(comma) << rbrace
    return items
@generate
def object_pair1():
    """Parse a `key=value` struct field and return it as a `(key, value)` tuple."""
    field_name = yield key_symbol << equals
    field_value = yield value
    return (field_name, field_value)
@generate
def object_pair_func():
    """Parse a function-call-shaped struct field, e.g. `name(args)`.

    Returns `(name, (name, args))`: the call's name is used as the key and
    the whole `(name, args)` tuple produced by `func` as the value.
    """
    call = yield func
    name, _ = call
    return (name, call)
# Literal "..." marker standing in for omitted struct fields.
truncated = lexeme(string("..."))
# One entry inside `{...}`: a call-shaped field, a `key=value` field, or the
# truncation marker (which produces the placeholder pair `(None, None)`).
object_pair = alt(
    lexeme(object_pair_func),
    lexeme(object_pair1),
    truncated.result((None, None)),
)
@generate
def arguments():
    """Parse a comma-separated (possibly empty) list of values, then trailing whitespace."""
    parsed_args = yield value.sep_by(comma) << whitespace
    return parsed_args
@generate
def func():
    """Parse `name(arg, ...)` and return `(name, [args])`."""
    name = yield key_symbol
    call_args = yield lparen >> arguments << rparen
    return (name, call_args)
# Address-of reference such as `&name`, kept as the raw string.
ref = lexeme(regex(r'&[a-z][a-z0-9_]*'))
# `[N->M]` mapping, kept as the raw string.  Raw string literal so that
# `\[` / `\]` are regex escapes rather than invalid Python string escapes.
mapping = lexeme(regex(r'\[[0-9]+->[0-9]+\]'))
# `{pair, pair, ...}` struct, returned as a dict built from the parsed
# pairs (a truncation marker contributes the key None).
keyval = lbrace >> object_pair.sep_by(comma).map(dict) << rbrace
@generate
def value():
    """Parse any single argument value and return its parsed form.

    Alternatives are tried in order and the first match wins.
    NOTE(review): `enum` comes before `symbol` and `literal`, and its pattern
    also matches bare upper-case constants and unsigned digit runs, so those
    inputs are returned as strings by `enum`; only negative numbers reach
    `number`. Confirm this is intended before reordering.
    """
    parsed = yield (
        ip_addr
        | enum
        | symbol
        | quoted
        | ref
        | func
        | literal
        | mapping
        | record_array
        | keyval
        | array
    )
    return parsed
# Marker strace prints when a call is interrupted by output from another process.
unfinished = lexeme(string('<unfinished ...>'))
# Free-form text inside the parentheses after an errno symbol.
reason_text = regex(r'[a-zA-Z0-9_ ]+')
# `<number> <SYMBOL> (<reason text>)` -> [number, symbol, reason]
retval_error = seq(number, symbol, lparen.then(reason_text).skip(rparen))
# `<number> <unfinished ...>` -> [number, marker]
retval_unfinished = seq(number, unfinished)
# A return value: tried most-specific first, falling back to a bare number.
retval = alt(retval_error, retval_unfinished, number)
# `name(args) = retval` -> [name, args, retval]
syscall_complete = seq(
    key_symbol,
    lparen.then(arguments).skip(rparen),
    equals.then(retval),
)
# `name(args[,] <unfinished ...>` -> [name, args, marker]
syscall_partial = seq(
    key_symbol,
    lparen.then(arguments).skip(comma.optional()),
    unfinished,
)
# `+++ exited with N +++` -> N
syscall_exit = string('+++ exited with ').then(number).skip(lexeme(string('+++')))
# `<... name resumed> [args]) = retval` -> [name, args, retval]
syscall_resumed = seq(
    string('<... ').then(key_symbol).skip(lexeme(string('resumed>'))),
    arguments.optional().skip(rparen),
    equals.then(retval),
)
# Any of the syscall line shapes above, tried in this order.
syscall = alt(syscall_complete, syscall_partial, syscall_resumed, syscall_exit)
# `[pid N]` prefix -> N
pid = regex(r'\[pid +').then(number).skip(string(']'))
# A full output line: the pid prefix, one space, then a syscall record.
# Produces [pid, syscall_result].
line = seq(pid, string(" ").then(syscall))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment