kokes · September 28, 2016 09:26
diff --git a/read_html.ipynb b/read_html.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Parsing HTML into tables is not as fast as it could be, especially when relying on libraries other than `lxml`.\n",
    "\n",
    "Possible reasons for slowness:\n",
    "- lxml bails, because the strict mode is on, falls back on slower bs4\n",
    "- there is a regexp match even if the default is set (r'.+') - unnecessary\n",
    "- when in rows, one can iterate (and check element name) instead of searching\n",
    "\n",
    "I have not investigated `%prun` much to give an idea of weights. But it seems that not relying on `lxml` is the main factor."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from urllib.request import urlretrieve\n",
    "import lxml.html\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def fp(blob):\n",
    "    whs = re.compile(r'[\\r\\n]+|\\s{2,}') # github.com/pydata/pandas/blob/37f95cef85834207db0930e863341efb285e38a2/pandas/io/html.py#L65\n",
    "    bd = lxml.html.parse(blob).getroot()\n",
    "\n",
    "    tables = []\n",
    "    tbls = bd.findall('.//table')\n",
    "\n",
    "    for tb in tbls:\n",
    "        dt = []\n",
    "        trs = tb.findall('.//tr') # './/' as it might be within a tbody/thead/tfoot\n",
    "        for tr in trs:\n",
    "            row = []\n",
    "            for ch in tr.getchildren():\n",
    "                assert ch.tag in ['th', 'td']\n",
    "                cn = whs.sub(' ', ch.text_content().strip())\n",
    "                row.append(cn if len(cn) > 0 else np.nan)\n",
    "            dt.append(row)\n",
    "\n",
    "        tables.append(pd.DataFrame(dt))\n",
    "    \n",
    "    return tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('states.html', <http.client.HTTPMessage at 0x93e9a50>)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urlretrieve(url, 'states.html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loop, best of 3: 1.31 s per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "hh = pd.read_html('states.html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 loops, best of 3: 44.4 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit hh2 = fp('states.html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('bls.html', <http.client.HTTPMessage at 0x987aa70>)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urlretrieve('http://www.bls.gov/news.release/cpi.t01.htm', 'bls.html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loop, best of 3: 217 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "cp = pd.read_html('bls.html') # treats <th> properly, unlike the my dummy code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 loops, best of 3: 7.83 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "cp2 = fp('bls.html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### lxml\n",
    "Relying on lxml in a properly formatted document makes things faster. A gap remains, but it could be wholly attributed to the much better handling of edge cases in `pandas`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "html = \"\"\"<html>\n",
    "<body>\n",
    "<table>\n",
    "%body%\n",
    "</table>\n",
    "</body>\n",
    "</html>\n",
    "\n",
    "\"\"\"\n",
    "cols = 10\n",
    "rows = 1000\n",
    "\n",
    "rw = ['<td>value%s</td>' % ('</td><td>value'*(cols-1)) for j in range(rows)]\n",
    "\n",
    "htb = html.replace('%body%', '<tr>%s</tr>' % ('</tr>\\n<tr>'.join(rw)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from io import StringIO"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fh = StringIO(htb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 loops, best of 3: 93.1 ms per loop\n",
      "10 loops, best of 3: 189 ms per loop\n"
     ]
    }
   ],
   "source": [
    "%timeit fp(fh)\n",
    "\n",
    "%timeit pd.read_html(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Parsing HTML into tables is not as fast as it could be, especially when relying on libraries other than `lxml`.\n",
	"\n",
	"Possible reasons for slowness:\n",
	"- lxml bails, because the strict mode is on, falls back on slower bs4\n",
	"- there is a regexp match even if the default is set (r'.+') - unnecessary\n",
	"- when in rows, one can iterate (and check element name) instead of searching\n",
	"\n",
	"I have not investigated `%prun` much to give an idea of weights. But it seems that not relying on `lxml` is the main factor."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from urllib.request import urlretrieve\n",
	"import lxml.html\n",
	"import re"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def fp(blob):\n",
	" whs = re.compile(r'[\\r\\n]+\|\\s{2,}') # github.com/pydata/pandas/blob/37f95cef85834207db0930e863341efb285e38a2/pandas/io/html.py#L65\n",
	" bd = lxml.html.parse(blob).getroot()\n",
	"\n",
	" tables = []\n",
	" tbls = bd.findall('.//table')\n",
	"\n",
	" for tb in tbls:\n",
	" dt = []\n",
	" trs = tb.findall('.//tr') # './/' as it might be within a tbody/thead/tfoot\n",
	" for tr in trs:\n",
	" row = []\n",
	" for ch in tr.getchildren():\n",
	" assert ch.tag in ['th', 'td']\n",
	" cn = whs.sub(' ', ch.text_content().strip())\n",
	" row.append(cn if len(cn) > 0 else np.nan)\n",
	" dt.append(row)\n",
	"\n",
	" tables.append(pd.DataFrame(dt))\n",
	" \n",
	" return tables"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"('states.html', <http.client.HTTPMessage at 0x93e9a50>)"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"urlretrieve(url, 'states.html')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1 loop, best of 3: 1.31 s per loop\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"hh = pd.read_html('states.html')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"10 loops, best of 3: 44.4 ms per loop\n"
	]
	}
	],
	"source": [
	"%timeit hh2 = fp('states.html')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"('bls.html', <http.client.HTTPMessage at 0x987aa70>)"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"urlretrieve('http://www.bls.gov/news.release/cpi.t01.htm', 'bls.html')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1 loop, best of 3: 217 ms per loop\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"cp = pd.read_html('bls.html') # treats <th> properly, unlike the my dummy code"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"100 loops, best of 3: 7.83 ms per loop\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"cp2 = fp('bls.html')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### lxml\n",
	"Relying on lxml in a properly formatted document makes things faster. A gap remains, but it could be wholly attributed to the much better handling of edge cases in `pandas`."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"html = \"\"\"<html>\n",
	"<body>\n",
	"<table>\n",
	"%body%\n",
	"</table>\n",
	"</body>\n",
	"</html>\n",
	"\n",
	"\"\"\"\n",
	"cols = 10\n",
	"rows = 1000\n",
	"\n",
	"rw = ['<td>value%s</td>' % ('</td><td>value'*(cols-1)) for j in range(rows)]\n",
	"\n",
	"htb = html.replace('%body%', '<tr>%s</tr>' % ('</tr>\\n<tr>'.join(rw)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from io import StringIO"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"fh = StringIO(htb)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"10 loops, best of 3: 93.1 ms per loop\n",
	"10 loops, best of 3: 189 ms per loop\n"
	]
	}
	],
	"source": [
	"%timeit fp(fh)\n",
	"\n",
	"%timeit pd.read_html(fh)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}