Skip to content

Instantly share code, notes, and snippets.

@davidfischer
Created June 28, 2014 17:04
Show Gist options
  • Save davidfischer/4041a861cfa306cad9ac to your computer and use it in GitHub Desktop.
Save davidfischer/4041a861cfa306cad9ac to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:db14aefced4d44bcfaa24db2b4e1775c5ae2c2c0a09af719ed0649d1e14dd53f"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Standard library XML modules\n",
"# Fairly fast but not really complete\n",
"from xml.etree import ElementTree\n",
"from xml.dom import minidom\n",
"\n",
"# Third party XML processing (builds on libxml2)\n",
"# lxml.etree follows the API of the standard library's ElementTree\n",
"# (and extends it), implemented on top of libxml2\n",
"# Fast and fairly complete\n",
"import lxml\n",
"\n",
"# BeautifulSoup (primarily for HTML not XML - relies on lxml for XML)\n",
"# pip install beautifulsoup4\n",
"import bs4\n",
"\n",
"# Hardened drop-in replacements for the XML parsers, for untrusted input\n",
"# https://docs.python.org/2/library/xml.html#xml-vulnerabilities\n",
"import defusedxml\n",
"\n",
"# Helper for fetching remote data over HTTP\n",
"import requests"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"resp = requests.get('http://gdx.mlb.com/components/game/mlb/year_2014/month_06/day_26/scoreboard.xml')\n",
"text = resp.content\n",
"print text[:1000]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<?xml version=\"1.0\" encoding=\"UTF-8\"?><!--Copyright 2014 MLB Advanced Media, L.P. Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt-->\n",
"<scoreboard date=\"20140626\" last_modified=\"\">\n",
" <go_game>\n",
" <game id=\"2014_06_26_atlmlb_houmlb_1\" league=\"NA\" status=\"FINAL\" start_time=\"2:10PM\"\n",
" home_code=\"\"/>\n",
" <team name=\"Astros\" code=\"\">\n",
" <gameteam R=\"6\" H=\"8\" E=\"1\"/>\n",
" </team>\n",
" <team name=\"Braves\" code=\"\">\n",
" <gameteam R=\"1\" H=\"6\" E=\"0\"/>\n",
" </team>\n",
" <w_pitcher wins=\"8\" losses=\"5\">\n",
" <pitcher name=\"J. Cosart\"/>\n",
" </w_pitcher>\n",
" <l_pitcher wins=\"2\" losses=\"5\">\n",
" <pitcher name=\"M. Minor\"/>\n",
" </l_pitcher>\n",
" <sv_pitcher saves=\"0\">\n",
" <pitcher name=\". \"/>\n",
" </sv_pitcher>\n",
" </go_game>\n",
" <go_game>\n",
" <game id=\"2014_06_26_minmlb_anamlb_1\" league=\"AA\" status=\"FINAL\" start_time=\"3:35PM\"\n",
" home_code=\"\"/>\n",
" <team name=\"Angels\" \n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"et = ElementTree.fromstring(text)\n",
"for elem in et.iter('game'):\n",
" print elem.attrib"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{'status': 'FINAL', 'league': 'NA', 'start_time': '2:10PM', 'id': '2014_06_26_atlmlb_houmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'AA', 'start_time': '3:35PM', 'id': '2014_06_26_minmlb_anamlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '7:05PM', 'id': '2014_06_26_miamlb_phimlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '7:05PM', 'id': '2014_06_26_nynmlb_pitmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'AA', 'start_time': '7:07PM', 'id': '2014_06_26_chamlb_tormlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'AA', 'start_time': '8:05PM', 'id': '2014_06_26_detmlb_texmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '8:05PM', 'id': '2014_06_26_wasmlb_chnmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '8:10PM', 'id': '2014_06_26_colmlb_milmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '10:10PM', 'id': '2014_06_26_slnmlb_lanmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '10:15PM', 'id': '2014_06_26_cinmlb_sfnmlb_1', 'home_code': ''}\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dom = minidom.parseString(text)\n",
"for game in dom.getElementsByTagName('game'):\n",
" print game.attributes['id'].value, game.attributes['status'].value"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"2014_06_26_atlmlb_houmlb_1 FINAL\n",
"2014_06_26_minmlb_anamlb_1 FINAL\n",
"2014_06_26_miamlb_phimlb_1 FINAL\n",
"2014_06_26_nynmlb_pitmlb_1 FINAL\n",
"2014_06_26_chamlb_tormlb_1 FINAL\n",
"2014_06_26_detmlb_texmlb_1 FINAL\n",
"2014_06_26_wasmlb_chnmlb_1 FINAL\n",
"2014_06_26_colmlb_milmlb_1 FINAL\n",
"2014_06_26_slnmlb_lanmlb_1 FINAL\n",
"2014_06_26_cinmlb_sfnmlb_1 FINAL\n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Similar to the built-in ElementTree\n",
"# but with more features such as XML schema validation and more complete XPath\n",
"root = lxml.etree.XML(text)\n",
"doc = lxml.etree.ElementTree(root)\n",
"\n",
"# Find National League games\n",
"for game in root.xpath('//game[@league=\"NN\"]'):\n",
" print game.attrib"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{'status': 'FINAL', 'league': 'NN', 'start_time': '7:05PM', 'id': '2014_06_26_miamlb_phimlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '7:05PM', 'id': '2014_06_26_nynmlb_pitmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '8:05PM', 'id': '2014_06_26_wasmlb_chnmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '8:10PM', 'id': '2014_06_26_colmlb_milmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '10:10PM', 'id': '2014_06_26_slnmlb_lanmlb_1', 'home_code': ''}\n",
"{'status': 'FINAL', 'league': 'NN', 'start_time': '10:15PM', 'id': '2014_06_26_cinmlb_sfnmlb_1', 'home_code': ''}\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"soup = bs4.BeautifulSoup(text, 'xml')\n",
"for game in soup.find_all('game'):\n",
" print game"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<game home_code=\"\" id=\"2014_06_26_atlmlb_houmlb_1\" league=\"NA\" start_time=\"2:10PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_minmlb_anamlb_1\" league=\"AA\" start_time=\"3:35PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_miamlb_phimlb_1\" league=\"NN\" start_time=\"7:05PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_nynmlb_pitmlb_1\" league=\"NN\" start_time=\"7:05PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_chamlb_tormlb_1\" league=\"AA\" start_time=\"7:07PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_detmlb_texmlb_1\" league=\"AA\" start_time=\"8:05PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_wasmlb_chnmlb_1\" league=\"NN\" start_time=\"8:05PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_colmlb_milmlb_1\" league=\"NN\" start_time=\"8:10PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_slnmlb_lanmlb_1\" league=\"NN\" start_time=\"10:10PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_cinmlb_sfnmlb_1\" league=\"NN\" start_time=\"10:15PM\" status=\"FINAL\"/>\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# CSS-like selectors\n",
"for game in soup.select('game[league=\"AA\"]'):\n",
" print game"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"<game home_code=\"\" id=\"2014_06_26_minmlb_anamlb_1\" league=\"AA\" start_time=\"3:35PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_chamlb_tormlb_1\" league=\"AA\" start_time=\"7:07PM\" status=\"FINAL\"/>\n",
"<game home_code=\"\" id=\"2014_06_26_detmlb_texmlb_1\" league=\"AA\" start_time=\"8:05PM\" status=\"FINAL\"/>\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"laughs = \"\"\"<?xml version=\"1.0\"?>\n",
"<!DOCTYPE lolz [\n",
" <!ENTITY lol \"lol\">\n",
" <!ELEMENT lolz (#PCDATA)>\n",
" <!ENTITY lol1 \"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n",
" <!ENTITY lol2 \"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n",
" <!ENTITY lol3 \"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n",
" <!ENTITY lol4 \"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n",
" <!ENTITY lol5 \"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n",
" <!ENTITY lol6 \"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n",
" <!ENTITY lol7 \"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n",
" <!ENTITY lol8 \"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n",
" <!ENTITY lol9 \"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n",
"]>\n",
"<lolz>&lol9;</lolz>\"\"\"\n",
"\n",
"# If I ran this, it would consume all of the memory on my machine\n",
"#dom = minidom.parseString(laughs)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# This is \"safe\" but will throw an error\n",
"import defusedxml.minidom\n",
"defusedxml.minidom.parseString(laughs)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "EntitiesForbidden",
"evalue": "EntitiesForbidden(name='lol', system_id=None, public_id=None)",
"output_type": "pyerr",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mEntitiesForbidden\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-b24528b40419>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# This is \"safe\" but will throw an error\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdefusedxml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mminidom\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mdefusedxml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mminidom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparseString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlaughs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Users/dfischer/Projects/envs/pyxml/lib/python2.7/site-packages/defusedxml/minidom.pyc\u001b[0m in \u001b[0;36mparseString\u001b[0;34m(string, parser, forbid_dtd, forbid_entities, forbid_external)\u001b[0m\n\u001b[1;32m 33\u001b[0m return _expatbuilder.parseString(string, forbid_dtd=forbid_dtd,\n\u001b[1;32m 34\u001b[0m \u001b[0mforbid_entities\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforbid_entities\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m forbid_external=forbid_external)\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m return _do_pulldom_parse(_pulldom.parseString, (string,),\n",
"\u001b[0;32m/Users/dfischer/Projects/envs/pyxml/lib/python2.7/site-packages/defusedxml/expatbuilder.pyc\u001b[0m in \u001b[0;36mparseString\u001b[0;34m(string, namespaces, forbid_dtd, forbid_entities, forbid_external)\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0mforbid_entities\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforbid_entities\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m forbid_external=forbid_external)\n\u001b[0;32m--> 110\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mbuilder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparseString\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/xml/dom/expatbuilder.pyc\u001b[0m in \u001b[0;36mparseString\u001b[0;34m(self, string)\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 223\u001b[0;31m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mParse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 224\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setup_subset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 225\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mParseEscape\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/dfischer/Projects/envs/pyxml/lib/python2.7/site-packages/defusedxml/expatbuilder.pyc\u001b[0m in \u001b[0;36mdefused_entity_decl\u001b[0;34m(self, name, is_parameter_entity, value, base, sysid, pubid, notation_name)\u001b[0m\n\u001b[1;32m 33\u001b[0m def defused_entity_decl(self, name, is_parameter_entity, value, base,\n\u001b[1;32m 34\u001b[0m sysid, pubid, notation_name):\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mEntitiesForbidden\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbase\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msysid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpubid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnotation_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m def defused_unparsed_entity_decl(self, name, base, sysid, pubid,\n",
"\u001b[0;31mEntitiesForbidden\u001b[0m: EntitiesForbidden(name='lol', system_id=None, public_id=None)"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment