Created
June 27, 2014 11:51
-
-
Save tbicr/cd584138ce183839946f to your computer and use it in GitHub Desktop.
pyparsing example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:977a530fe07ef0f070bef944a04bc6a483233573b0bf17ae77e06a0d82e02e52" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Pyparsing - Python parsing library\n", | |
"\n", | |
"Pavel Tysliatski\n", | |
"\n", | |
"Expansa Group" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Common\n", | |
"\n", | |
"Site: http://pyparsing.wikispaces.com/\n", | |
"\n", | |
"Common information: http://pyparsing.wikispaces.com/HowToUsePyparsing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from pyparsing import *\n", | |
"from string import *" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## ParserElement subclasses" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Basic subclasses\n", | |
"\n", | |
" Literal\n", | |
" Word\n", | |
" Regex\n", | |
" SkipTo\n", | |
"\n", | |
"etc." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Expression subclasses\n", | |
"\n", | |
" And - +\n", | |
" MatchFirst - |\n", | |
" Or - ^\n", | |
" Optional\n", | |
" ZeroOrMore\n", | |
" OneOrMore\n", | |
"\n", | |
"etc." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Positional subclasses\n", | |
"\n", | |
" StringStart\n", | |
" StringEnd\n", | |
" LineStart\n", | |
" LineEnd" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Converter subclasses\n", | |
"\n", | |
" Suppress\n", | |
"\n", | |
"etc." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Parse actions\n", | |
"\n", | |
" setParseAction\n", | |
" addParseAction" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Basic example\n", | |
"\n", | |
"Build a parser for the pattern:\n", | |
"\n", | |
" [0-9]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def action(s, loc, toks):\n", | |
" print('s', type(s), s)\n", | |
" print('loc', type(loc), loc)\n", | |
" print('toks', type(toks), toks)\n", | |
" return ['<'] + list(toks) + ['>']\n", | |
"\n", | |
"\n", | |
"pattern = Suppress('[') + Word(digits) + '-' + Word(digits).setParseAction(action) + Suppress(']')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Parsing methods" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### parseString" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.parseString('[0-9]')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"s <class 'str'> [0-9]\n", | |
"loc <class 'int'> 3\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"(['0', '-', '<', '9', '>'], {})" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# >>> pattern.parseString('test[0-9]test')\n", | |
"# ParseException: Expected \"[\" (at char 0), (line:1, col:1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### searchString" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.searchString('test[0-9]test')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"s <class 'str'> test[0-9]test\n", | |
"loc <class 'int'> 7\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"([(['0', '-', '<', '9', '>'], {})], {})" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.searchString('test[0-9]test[0-9]')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"s <class 'str'> test[0-9]test[0-9]\n", | |
"loc <class 'int'> 7\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n", | |
"s <class 'str'> test[0-9]test[0-9]\n", | |
"loc <class 'int'> 16\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"([(['0', '-', '<', '9', '>'], {}), (['0', '-', '<', '9', '>'], {})], {})" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### scanString" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"list(pattern.scanString('test[0-9]test'))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"s <class 'str'> test[0-9]test\n", | |
"loc <class 'int'> 7\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"[((['0', '-', '<', '9', '>'], {}), 4, 9)]" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"list(pattern.scanString('test[0-9]test[0-9]'))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"s <class 'str'> test[0-9]test[0-9]\n", | |
"loc <class 'int'> 7\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n", | |
"s <class 'str'> test[0-9]test[0-9]\n", | |
"loc <class 'int'> 16\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"[((['0', '-', '<', '9', '>'], {}), 4, 9),\n", | |
" ((['0', '-', '<', '9', '>'], {}), 13, 18)]" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### transformString" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.transformString('test[0-9]test[0-9]')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"s <class 'str'> test[0-9]test[0-9]\n", | |
"loc <class 'int'> 7\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n", | |
"s <class 'str'> test[0-9]test[0-9]\n", | |
"loc <class 'int'> 16\n", | |
"toks <class 'pyparsing.ParseResults'> ['9']\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"'test0-<9>test0-<9>'" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## More complex example\n", | |
"\n", | |
"Build an iterator that expands a template of the form:\n", | |
"\n", | |
" text_block[numeric_block][numeric_block]text_block...\n", | |
" \n", | |
"For example:\n", | |
"\n", | |
" >>> te\\[\\]st\\\\\\\\[0-9]test[0-9]\n", | |
" te[]st\\\\0test0\n", | |
" te[]st\\\\0test1\n", | |
" te[]st\\\\0test2\n", | |
" ...\n", | |
" te[]st\\\\9test7\n", | |
" te[]st\\\\9test8\n", | |
" te[]st\\\\9test9" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Parser" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"text = SkipTo(StringEnd() | '[')\n", | |
"numeric = Suppress('[') + Word(digits) + Suppress('-') + Word(digits) + Suppress(']')\n", | |
"pattern = ZeroOrMore(text | numeric)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def skip_empty(toks):\n", | |
" if not toks[0]:\n", | |
" raise ParseException('must be not empty')\n", | |
" \n", | |
"\n", | |
"text.addParseAction(skip_empty)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"SkipTo:({StringEnd | \"[\"})" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.parseString('test[0-9]test[0-9]')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
"(['test', '0', '9', 'test', '0', '9'], {})" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.parseString('te\\[\\]st\\\\\\\\[0-9]test[0-9]')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"(['te\\\\'], {})" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Escaping" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"escape = (Literal('\\\\\\\\').addParseAction(replaceWith('\\\\')) | \n", | |
" Literal('\\\\[').addParseAction(replaceWith('[')) |\n", | |
" Literal('\\\\]').addParseAction(replaceWith(']')))\n", | |
"text = SkipTo(StringEnd() | '[', ignore=escape).setParseAction(skip_empty)\n", | |
"pattern = ZeroOrMore(text | numeric)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.parseString('te\\[\\]st\\\\\\\\[0-9]test[0-9]')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 15, | |
"text": [ | |
"(['te\\\\[\\\\]st\\\\\\\\', '0', '9', 'test', '0', '9'], {})" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def unescape(toks):\n", | |
" return [escape.transformString(item) for item in toks]\n", | |
"\n", | |
"\n", | |
"text.addParseAction(unescape)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 16, | |
"text": [ | |
"SkipTo:({StringEnd | \"[\"})" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.parseString('te\\[\\]st\\\\\\\\[0-9]test[0-9]')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 17, | |
"text": [ | |
"(['te[]st\\\\', '0', '9', 'test', '0', '9'], {})" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Iterator" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def text_block(toks):\n", | |
" return [iter(toks)]\n", | |
" \n", | |
" \n", | |
"def numeric_block(toks):\n", | |
" from_value, to_value = toks\n", | |
" return [map(str, range(int(from_value), int(to_value) + 1))]\n", | |
"\n", | |
"\n", | |
"text.addParseAction(text_block)\n", | |
"numeric.addParseAction(numeric_block)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 18, | |
"text": [ | |
"{Suppress:(\"[\") W:(0123...) Suppress:(\"-\") W:(0123...) Suppress:(\"]\")}" | |
] | |
} | |
], | |
"prompt_number": 18 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"pattern.parseString('te\\[\\]st\\\\\\\\[0-9]test[0-9]')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 19, | |
"text": [ | |
"([<list_iterator object at 0x7f35380c7fd0>, <map object at 0x7f35381252b0>, <list_iterator object at 0x7f3538125278>, <map object at 0x7f3538100668>], {})" | |
] | |
} | |
], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from itertools import product\n", | |
"\n", | |
"\n", | |
"def iterator(string):\n", | |
" return (''.join(items) for items in product(*pattern.parseString(string)))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 20 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"len(list(iterator('te\\[\\]st\\\\\\\\[0-9]test[0-9]')))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 21, | |
"text": [ | |
"100" | |
] | |
} | |
], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"list(iterator('te\\[\\]st\\\\\\\\[0-9]test[0-9]'))[:3]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 22, | |
"text": [ | |
"['te[]st\\\\0test0', 'te[]st\\\\0test1', 'te[]st\\\\0test2']" | |
] | |
} | |
], | |
"prompt_number": 22 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"list(iterator('te\\[\\]st\\\\\\\\[0-9]test[0-9]'))[-3:]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 23, | |
"text": [ | |
"['te[]st\\\\9test7', 'te[]st\\\\9test8', 'te[]st\\\\9test9']" | |
] | |
} | |
], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Questions?\n", | |
"\n", | |
"[email protected]" | |
] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment