Created
September 28, 2016 09:26
-
-
Save kokes/b97c8324ba664400714a78f5561340fc to your computer and use it in GitHub Desktop.
performance difference between BeautifulSoup and lxml.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Parsing HTML into tables with `pandas.read_html` is not as fast as it could be, especially when it relies on a parser other than `lxml`.\n", | |
"\n", | |
"Possible reasons for slowness:\n", | |
"- `lxml` bails out because strict mode is on, so parsing falls back on the slower `bs4`\n", | |
"- a regexp match is performed even when the default pattern (`r'.+'`) is set, which is unnecessary\n", | |
"- within a row, one can iterate over the children (and check each element's name) instead of searching\n", | |
"\n", | |
"I have not profiled this with `%prun` enough to say how much each factor weighs, but not relying on `lxml` seems to be the main one." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"from urllib.request import urlretrieve\n", | |
"import lxml.html\n", | |
"import re" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def fp(blob):\n", | |
" whs = re.compile(r'[\\r\\n]+|\\s{2,}') # github.com/pydata/pandas/blob/37f95cef85834207db0930e863341efb285e38a2/pandas/io/html.py#L65\n", | |
" bd = lxml.html.parse(blob).getroot()\n", | |
"\n", | |
" tables = []\n", | |
" tbls = bd.findall('.//table')\n", | |
"\n", | |
" for tb in tbls:\n", | |
" dt = []\n", | |
" trs = tb.findall('.//tr') # './/' as it might be within a tbody/thead/tfoot\n", | |
" for tr in trs:\n", | |
" row = []\n", | |
" for ch in tr.getchildren():\n", | |
" assert ch.tag in ['th', 'td']\n", | |
" cn = whs.sub(' ', ch.text_content().strip())\n", | |
" row.append(cn if len(cn) > 0 else np.nan)\n", | |
" dt.append(row)\n", | |
"\n", | |
" tables.append(pd.DataFrame(dt))\n", | |
" \n", | |
" return tables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"('states.html', <http.client.HTTPMessage at 0x93e9a50>)" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"urlretrieve(url, 'states.html')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 1.31 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"hh = pd.read_html('states.html')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 44.4 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit hh2 = fp('states.html')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"('bls.html', <http.client.HTTPMessage at 0x987aa70>)" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"urlretrieve('http://www.bls.gov/news.release/cpi.t01.htm', 'bls.html')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 217 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"cp = pd.read_html('bls.html') # treats <th> properly, unlike my dummy code" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100 loops, best of 3: 7.83 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"cp2 = fp('bls.html')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### lxml\n", | |
"With a properly formatted document, `pandas` can rely on `lxml`, which makes `read_html` faster. A gap to the hand-rolled parser remains, but it could be wholly attributed to the much better handling of edge cases in `pandas`." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"html = \"\"\"<html>\n", | |
"<body>\n", | |
"<table>\n", | |
"%body%\n", | |
"</table>\n", | |
"</body>\n", | |
"</html>\n", | |
"\n", | |
"\"\"\"\n", | |
"cols = 10\n", | |
"rows = 1000\n", | |
"\n", | |
"rw = ['<td>value%s</td>' % ('</td><td>value'*(cols-1)) for j in range(rows)]\n", | |
"\n", | |
"htb = html.replace('%body%', '<tr>%s</tr>' % ('</tr>\\n<tr>'.join(rw)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from io import StringIO" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fh = StringIO(htb)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 93.1 ms per loop\n", | |
"10 loops, best of 3: 189 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit fp(fh)\n", | |
"\n", | |
"%timeit pd.read_html(fh)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment