Skip to content

Instantly share code, notes, and snippets.

@kokes
Created September 28, 2016 09:26
Show Gist options
  • Save kokes/b97c8324ba664400714a78f5561340fc to your computer and use it in GitHub Desktop.
Save kokes/b97c8324ba664400714a78f5561340fc to your computer and use it in GitHub Desktop.
performance difference between BeautifulSoup and lxml.html
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Parsing HTML into tables is not as fast as it could be, especially when relying on libraries other than `lxml`.\n",
"\n",
"Possible reasons for slowness:\n",
"- `lxml` bails out because strict mode is on, so parsing falls back on the slower `bs4`\n",
"- a regexp match is performed even when the default pattern (r'.+') is in use, which is unnecessary\n",
"- within a row, one can iterate over the child elements (checking each element's name) instead of searching\n",
"\n",
"I have not profiled this with `%prun` enough to say how much each of these reasons contributes, but not relying on `lxml` seems to be the main factor."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from urllib.request import urlretrieve\n",
"import lxml.html\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def fp(blob):\n",
" whs = re.compile(r'[\\r\\n]+|\\s{2,}') # github.com/pydata/pandas/blob/37f95cef85834207db0930e863341efb285e38a2/pandas/io/html.py#L65\n",
" bd = lxml.html.parse(blob).getroot()\n",
"\n",
" tables = []\n",
" tbls = bd.findall('.//table')\n",
"\n",
" for tb in tbls:\n",
" dt = []\n",
" trs = tb.findall('.//tr') # './/' as it might be within a tbody/thead/tfoot\n",
" for tr in trs:\n",
" row = []\n",
" for ch in tr.getchildren():\n",
" assert ch.tag in ['th', 'td']\n",
" cn = whs.sub(' ', ch.text_content().strip())\n",
" row.append(cn if len(cn) > 0 else np.nan)\n",
" dt.append(row)\n",
"\n",
" tables.append(pd.DataFrame(dt))\n",
" \n",
" return tables"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('states.html', <http.client.HTTPMessage at 0x93e9a50>)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"urlretrieve(url, 'states.html')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 1.31 s per loop\n"
]
}
],
"source": [
"%%timeit\n",
"hh = pd.read_html('states.html')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 44.4 ms per loop\n"
]
}
],
"source": [
"%timeit hh2 = fp('states.html')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"('bls.html', <http.client.HTTPMessage at 0x987aa70>)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"urlretrieve('http://www.bls.gov/news.release/cpi.t01.htm', 'bls.html')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 217 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"cp = pd.read_html('bls.html') # treats <th> properly, unlike my dummy code"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 7.83 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"cp2 = fp('bls.html')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### lxml\n",
"Relying on lxml in a properly formatted document makes things faster. A gap remains, but it could be wholly attributed to the much better handling of edge cases in `pandas`."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"html = \"\"\"<html>\n",
"<body>\n",
"<table>\n",
"%body%\n",
"</table>\n",
"</body>\n",
"</html>\n",
"\n",
"\"\"\"\n",
"cols = 10\n",
"rows = 1000\n",
"\n",
"rw = ['<td>value%s</td>' % ('</td><td>value'*(cols-1)) for j in range(rows)]\n",
"\n",
"htb = html.replace('%body%', '<tr>%s</tr>' % ('</tr>\\n<tr>'.join(rw)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from io import StringIO"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fh = StringIO(htb)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 93.1 ms per loop\n",
"10 loops, best of 3: 189 ms per loop\n"
]
}
],
"source": [
"%timeit fp(fh)\n",
"\n",
"%timeit pd.read_html(fh)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment