Created
June 28, 2014 17:04
-
-
Save davidfischer/4041a861cfa306cad9ac to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:db14aefced4d44bcfaa24db2b4e1775c5ae2c2c0a09af719ed0649d1e14dd53f" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Standard library XML modules\n", | |
"# Fairly fast but not really complete\n", | |
"from xml.etree import ElementTree\n", | |
"from xml.dom import minidom\n", | |
"\n", | |
"# Third party XML processing (builds on libxml2)\n", | |
"# The elementtree library in the standard library is an older\n", | |
"# version ported from this library\n", | |
"# Fast and fairly complete\n", | |
"import lxml\n", | |
"\n", | |
"# BeautifulSoup (primarily for HTML not XML - relies on lxml for XML)\n", | |
"# pip install beautifulsoup4\n", | |
"import bs4\n", | |
"\n", | |
"# Patch XML parsers for untrusted input\n", | |
"# https://docs.python.org/2/library/xml.html#xml-vulnerabilities\n", | |
"import defusedxml\n", | |
"\n", | |
"# Helper for fetching remote data over HTTP\n", | |
"import requests" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"resp = requests.get('http://gdx.mlb.com/components/game/mlb/year_2014/month_06/day_26/scoreboard.xml')\n", | |
"text = resp.content\n", | |
"print text[:1000]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<?xml version=\"1.0\" encoding=\"UTF-8\"?><!--Copyright 2014 MLB Advanced Media, L.P. Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt-->\n", | |
"<scoreboard date=\"20140626\" last_modified=\"\">\n", | |
" <go_game>\n", | |
" <game id=\"2014_06_26_atlmlb_houmlb_1\" league=\"NA\" status=\"FINAL\" start_time=\"2:10PM\"\n", | |
" home_code=\"\"/>\n", | |
" <team name=\"Astros\" code=\"\">\n", | |
" <gameteam R=\"6\" H=\"8\" E=\"1\"/>\n", | |
" </team>\n", | |
" <team name=\"Braves\" code=\"\">\n", | |
" <gameteam R=\"1\" H=\"6\" E=\"0\"/>\n", | |
" </team>\n", | |
" <w_pitcher wins=\"8\" losses=\"5\">\n", | |
" <pitcher name=\"J. Cosart\"/>\n", | |
" </w_pitcher>\n", | |
" <l_pitcher wins=\"2\" losses=\"5\">\n", | |
" <pitcher name=\"M. Minor\"/>\n", | |
" </l_pitcher>\n", | |
" <sv_pitcher saves=\"0\">\n", | |
" <pitcher name=\". \"/>\n", | |
" </sv_pitcher>\n", | |
" </go_game>\n", | |
" <go_game>\n", | |
" <game id=\"2014_06_26_minmlb_anamlb_1\" league=\"AA\" status=\"FINAL\" start_time=\"3:35PM\"\n", | |
" home_code=\"\"/>\n", | |
" <team name=\"Angels\" \n" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"et = ElementTree.fromstring(text)\n", | |
"for elem in et.iter('game'):\n", | |
" print elem.attrib" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"{'status': 'FINAL', 'league': 'NA', 'start_time': '2:10PM', 'id': '2014_06_26_atlmlb_houmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'AA', 'start_time': '3:35PM', 'id': '2014_06_26_minmlb_anamlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '7:05PM', 'id': '2014_06_26_miamlb_phimlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '7:05PM', 'id': '2014_06_26_nynmlb_pitmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'AA', 'start_time': '7:07PM', 'id': '2014_06_26_chamlb_tormlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'AA', 'start_time': '8:05PM', 'id': '2014_06_26_detmlb_texmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '8:05PM', 'id': '2014_06_26_wasmlb_chnmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '8:10PM', 'id': '2014_06_26_colmlb_milmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '10:10PM', 'id': '2014_06_26_slnmlb_lanmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '10:15PM', 'id': '2014_06_26_cinmlb_sfnmlb_1', 'home_code': ''}\n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"dom = minidom.parseString(text)\n", | |
"for game in dom.getElementsByTagName('game'):\n", | |
" print game.attributes['id'].value, game.attributes['status'].value" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"2014_06_26_atlmlb_houmlb_1 FINAL\n", | |
"2014_06_26_minmlb_anamlb_1 FINAL\n", | |
"2014_06_26_miamlb_phimlb_1 FINAL\n", | |
"2014_06_26_nynmlb_pitmlb_1 FINAL\n", | |
"2014_06_26_chamlb_tormlb_1 FINAL\n", | |
"2014_06_26_detmlb_texmlb_1 FINAL\n", | |
"2014_06_26_wasmlb_chnmlb_1 FINAL\n", | |
"2014_06_26_colmlb_milmlb_1 FINAL\n", | |
"2014_06_26_slnmlb_lanmlb_1 FINAL\n", | |
"2014_06_26_cinmlb_sfnmlb_1 FINAL\n" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Similar to the built-in ElementTree\n", | |
"# but with more features such as XML schema validation and more complete XPath\n", | |
"root = lxml.etree.XML(text)\n", | |
"doc = lxml.etree.ElementTree(root)\n", | |
"\n", | |
"# Find National League games\n", | |
"for game in root.xpath('//game[@league=\"NN\"]'):\n", | |
" print game.attrib" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '7:05PM', 'id': '2014_06_26_miamlb_phimlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '7:05PM', 'id': '2014_06_26_nynmlb_pitmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '8:05PM', 'id': '2014_06_26_wasmlb_chnmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '8:10PM', 'id': '2014_06_26_colmlb_milmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '10:10PM', 'id': '2014_06_26_slnmlb_lanmlb_1', 'home_code': ''}\n", | |
"{'status': 'FINAL', 'league': 'NN', 'start_time': '10:15PM', 'id': '2014_06_26_cinmlb_sfnmlb_1', 'home_code': ''}\n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"soup = bs4.BeautifulSoup(text, 'xml')\n", | |
"for game in soup.find_all('game'):\n", | |
" print game" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<game home_code=\"\" id=\"2014_06_26_atlmlb_houmlb_1\" league=\"NA\" start_time=\"2:10PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_minmlb_anamlb_1\" league=\"AA\" start_time=\"3:35PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_miamlb_phimlb_1\" league=\"NN\" start_time=\"7:05PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_nynmlb_pitmlb_1\" league=\"NN\" start_time=\"7:05PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_chamlb_tormlb_1\" league=\"AA\" start_time=\"7:07PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_detmlb_texmlb_1\" league=\"AA\" start_time=\"8:05PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_wasmlb_chnmlb_1\" league=\"NN\" start_time=\"8:05PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_colmlb_milmlb_1\" league=\"NN\" start_time=\"8:10PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_slnmlb_lanmlb_1\" league=\"NN\" start_time=\"10:10PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_cinmlb_sfnmlb_1\" league=\"NN\" start_time=\"10:15PM\" status=\"FINAL\"/>\n" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# CSS like selectors\n", | |
"for game in soup.select('game[league=\"AA\"]'):\n", | |
" print game" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<game home_code=\"\" id=\"2014_06_26_minmlb_anamlb_1\" league=\"AA\" start_time=\"3:35PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_chamlb_tormlb_1\" league=\"AA\" start_time=\"7:07PM\" status=\"FINAL\"/>\n", | |
"<game home_code=\"\" id=\"2014_06_26_detmlb_texmlb_1\" league=\"AA\" start_time=\"8:05PM\" status=\"FINAL\"/>\n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"laughs = \"\"\"<?xml version=\"1.0\"?>\n", | |
"<!DOCTYPE lolz [\n", | |
" <!ENTITY lol \"lol\">\n", | |
" <!ELEMENT lolz (#PCDATA)>\n", | |
" <!ENTITY lol1 \"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n", | |
" <!ENTITY lol2 \"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n", | |
" <!ENTITY lol3 \"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n", | |
" <!ENTITY lol4 \"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n", | |
" <!ENTITY lol5 \"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n", | |
" <!ENTITY lol6 \"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n", | |
" <!ENTITY lol7 \"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n", | |
" <!ENTITY lol8 \"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n", | |
" <!ENTITY lol9 \"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n", | |
"]>\n", | |
"<lolz>&lol9;</lolz>\"\"\"\n", | |
"\n", | |
"# If I ran this, it would consume all of the memory on my machine\n", | |
"#dom = parseString(laughs)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# This is \"safe\" but will throw an error\n", | |
"import defusedxml.minidom\n", | |
"defusedxml.minidom.parseString(laughs)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "EntitiesForbidden", | |
"evalue": "EntitiesForbidden(name='lol', system_id=None, public_id=None)", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mEntitiesForbidden\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-9-b24528b40419>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# This is \"safe\" but will throw an error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdefusedxml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mminidom\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mdefusedxml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mminidom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparseString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlaughs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32m/Users/dfischer/Projects/envs/pyxml/lib/python2.7/site-packages/defusedxml/minidom.pyc\u001b[0m in \u001b[0;36mparseString\u001b[0;34m(string, parser, forbid_dtd, forbid_entities, forbid_external)\u001b[0m\n\u001b[1;32m 33\u001b[0m return _expatbuilder.parseString(string, forbid_dtd=forbid_dtd,\n\u001b[1;32m 34\u001b[0m \u001b[0mforbid_entities\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforbid_entities\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m forbid_external=forbid_external)\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m return _do_pulldom_parse(_pulldom.parseString, (string,),\n", | |
"\u001b[0;32m/Users/dfischer/Projects/envs/pyxml/lib/python2.7/site-packages/defusedxml/expatbuilder.pyc\u001b[0m in \u001b[0;36mparseString\u001b[0;34m(string, namespaces, forbid_dtd, forbid_entities, forbid_external)\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0mforbid_entities\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforbid_entities\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m forbid_external=forbid_external)\n\u001b[0;32m--> 110\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mbuilder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparseString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[0;32m/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/expatbuilder.pyc\u001b[0m in \u001b[0;36mparseString\u001b[0;34m(self, string)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 223\u001b[0;31m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mParse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 224\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setup_subset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mParseEscape\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/Users/dfischer/Projects/envs/pyxml/lib/python2.7/site-packages/defusedxml/expatbuilder.pyc\u001b[0m in \u001b[0;36mdefused_entity_decl\u001b[0;34m(self, name, is_parameter_entity, value, base, sysid, pubid, notation_name)\u001b[0m\n\u001b[1;32m 33\u001b[0m def defused_entity_decl(self, name, is_parameter_entity, value, base,\n\u001b[1;32m 34\u001b[0m sysid, pubid, notation_name):\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mEntitiesForbidden\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msysid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpubid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnotation_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m def defused_unparsed_entity_decl(self, name, base, sysid, pubid,\n", | |
"\u001b[0;31mEntitiesForbidden\u001b[0m: EntitiesForbidden(name='lol', system_id=None, public_id=None)" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment