parsel HtmlParser benchmark

A benchmark of parsel.Selector using its default HTML parser (lxml.etree.HTMLParser) against lxml.html.HTMLParser, run over a few thousand pages crawled from a random sample of Alexa top-1M domains. The gist contains the benchmark notebook, a script that samples domains, and the Scrapy spider that saves the pages.
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import parsel\n",
    "from parsel import selector\n",
    "from lxml import etree, html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "selector._ctgroup['html_html'] = {\n",
    "    '_parser': html.HTMLParser,\n",
    "    '_csstranslator': selector.HTMLTranslator(),\n",
    "    '_tostring_method': 'html'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 3703 pages.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l pages.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.28 s, sys: 788 ms, total: 5.06 s\n",
      "Wall time: 5.13 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "def load():\n",
    "    pages = []\n",
    "    with open('pages.json', 'rt') as f:\n",
    "        for line in f:\n",
    "            if line.strip() in '[]':\n",
    "                continue\n",
    "            try:\n",
    "                page = json.loads(line.strip().rstrip(','))\n",
    "            except json.JSONDecodeError:\n",
    "                print(line)\n",
    "                break\n",
    "            pages.append(page)\n",
    "    return pages\n",
    "\n",
    "pages = load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def parse(pages, type):\n",
    "    return [parsel.Selector(p['html'], type=type) for p in pages]\n",
    "\n",
    "def run_xpath(selectors):\n",
    "    for sel in selectors:\n",
    "        sel.xpath('//p').extract()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loop, best of 5: 12 s per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r5\n",
    "selectors = parse(pages, 'html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "selectors = parse(pages, 'html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loop, best of 5: 1.15 s per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r5\n",
    "run_xpath(selectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loop, best of 5: 12.4 s per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r5\n",
    "selectors = parse(pages, 'html_html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "selectors = parse(pages, 'html_html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 loop, best of 5: 1.21 s per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit -r5\n",
    "run_xpath(selectors)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
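Both parsers come out roughly even on this sample: with the default 'html' type (lxml.etree.HTMLParser), parsing ~3700 pages takes about 12 s and extracting //p from the already-parsed trees about 1.15 s; with lxml.html.HTMLParser registered as 'html_html', the same steps take about 12.4 s and 1.21 s.

The registration in cell 2 pokes a new entry into selector._ctgroup, a private parsel registry, so it may break with other parsel versions. A minimal standalone sketch of the same trick, assuming the registry keeps the layout used in the notebook:

import parsel
from parsel import selector
from lxml import html

# Register lxml.html.HTMLParser under a new selector type name.
# selector._ctgroup is private parsel API; this dict layout is an
# assumption based on the parsel version used in the notebook.
selector._ctgroup['html_html'] = {
    '_parser': html.HTMLParser,
    '_csstranslator': selector.HTMLTranslator(),
    '_tostring_method': 'html',
}

sel = parsel.Selector('<html><body><p>hi</p></body></html>', type='html_html')
print(sel.xpath('//p/text()').extract())  # ['hi']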
#!/usr/bin/env bash
# Print a random sample of $1 domain names from the Alexa top-1M list.
# Install noise and unzip output go to stderr so only domains hit stdout.
pip install subsample csvkit 1>&2
curl http://s3.amazonaws.com/alexa-static/top-1m.csv.zip > top-1m.csv.zip
unzip ./top-1m.csv.zip 1>&2
subsample --sample-size "$1" top-1m.csv | csvcut -c 2
rm ./top-1m.csv
rm ./top-1m.csv.zip
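Usage, matching the docstring in savehtml.py below: ./get-random-domains.sh 1000 > urls.txt writes 1000 randomly sampled domain names (the second column of the Alexa CSV) to urls.txt.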
# -*- coding: utf-8 -*-
"""
1. Create a urls.txt file with URLs, one per line::

       ./get-random-domains.sh 1000 > urls.txt

2. Run the spider to get page contents::

       scrapy runspider savehtml.py -a urls=urls.txt -o pages.json -L INFO
"""
import random

import scrapy
from scrapy.utils.url import guess_scheme
from scrapy.linkextractors import LinkExtractor


class SavehtmlSpider(scrapy.Spider):
    name = "savehtml"
    requests_per_domain = 5
    custom_settings = {
        'CONCURRENT_REQUESTS': 50,
        'REACTOR_THREADPOOL_MAXSIZE': 20,
        'AJAXCRAWL_ENABLED': True,
    }

    def start_requests(self):
        self.le = LinkExtractor(canonicalize=False)
        with open(self.urls, 'rt') as f:
            for line in f:
                if not line.strip():
                    continue
                # urls.txt contains bare domains; guess_scheme turns
                # them into full URLs.
                url = guess_scheme(line.strip())
                yield scrapy.Request(url, self.parse)

    def parse(self, response):
        # Skip non-text responses (images, PDFs, ...), which have no .text.
        if not hasattr(response, 'text'):
            return
        # Follow up to requests_per_domain - 1 random links from the
        # landing page, then save the landing page itself.
        links = self.le.extract_links(response)
        n_links = min(len(links), int(self.requests_per_domain) - 1)
        links = random.sample(links, n_links)
        for link in links:
            yield scrapy.Request(link.url, self.parse_other)
        yield {'url': response.url, 'html': response.text}

    def parse_other(self, response):
        if not hasattr(response, 'text'):
            return
        yield {'url': response.url, 'html': response.text}
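A note on the output format: with -o pages.json, Scrapy's JSON feed exporter writes a single JSON array, brackets on their own lines and one item per line, which is why the notebook's load() skips the '[' and ']' lines and strips trailing commas. If the crawl finishes cleanly, the file is valid JSON and can be loaded in one call; a minimal sketch, assuming pages.json fits in memory:

import json

# pages.json is one JSON array written by Scrapy's feed exporter;
# json.load works only if the export finished cleanly (the notebook's
# line-by-line loader tolerates a truncated file).
with open('pages.json', 'rt') as f:
    pages = json.load(f)

print(len(pages))        # number of saved pages (~3700 in the run above)
print(sorted(pages[0]))  # ['html', 'url']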