parsel HtmlParser benchmark
get-random-domains.sh

#!/usr/bin/env bash
# Fetch the Alexa top-1M domain list and print a random sample of $1 domains,
# one per line (column 2 of the CSV is the domain name).
pip install subsample csvkit 1>&2
curl http://s3.amazonaws.com/alexa-static/top-1m.csv.zip > top-1m.csv.zip
unzip ./top-1m.csv.zip 1>&2
subsample --sample-size "$1" top-1m.csv | csvcut -c 2
rm ./top-1m.csv
rm ./top-1m.csv.zip
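If you'd rather not install subsample and csvkit, the same sampling step can be done in pure Python. This is a sketch only; the file name sample_domains.py is not part of the gist, it just mirrors what the shell pipeline does with a single reservoir-sampling pass:

# sample_domains.py -- hypothetical pure-Python equivalent of
# get-random-domains.sh; not part of the original gist.
import csv
import random
import sys

def sample_domains(path, k):
    """Reservoir-sample k rows from the CSV, keeping the domain column.

    top-1m.csv rows look like ``rank,domain``; returning row[1] mirrors
    ``csvcut -c 2`` in the shell version.
    """
    reservoir = []
    with open(path, newline='') as f:
        for i, row in enumerate(csv.reader(f)):
            if i < k:
                reservoir.append(row[1])
            else:
                j = random.randrange(i + 1)
                if j < k:
                    reservoir[j] = row[1]
    return reservoir

if __name__ == '__main__':
    for domain in sample_domains('top-1m.csv', int(sys.argv[1])):
        print(domain)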
savehtml.py

# -*- coding: utf-8 -*-
"""
1. Create urls.txt file with urls, one url per line::

       ./get-random-domains.sh 1000 > urls.txt

2. Run the spider to save page contents::

       scrapy runspider savehtml.py -a urls=urls.txt -o pages.json -L INFO

"""
import random

import scrapy
from scrapy.utils.url import guess_scheme
from scrapy.linkextractors import LinkExtractor


class SavehtmlSpider(scrapy.Spider):
    name = "savehtml"
    requests_per_domain = 5
    custom_settings = {
        'CONCURRENT_REQUESTS': 50,
        'REACTOR_THREADPOOL_MAXSIZE': 20,
        'AJAXCRAWL_ENABLED': True,
    }

    def start_requests(self):
        self.le = LinkExtractor(canonicalize=False)
        with open(self.urls, 'rt') as f:
            for line in f:
                if not line.strip():
                    continue
                # entries in urls.txt are bare domains; add a scheme
                url = guess_scheme(line.strip())
                yield scrapy.Request(url, self.parse)

    def parse(self, response):
        if not hasattr(response, 'text'):  # skip non-text responses
            return
        # follow a few random links to collect more pages per domain
        links = self.le.extract_links(response)
        n_links = min(len(links), int(self.requests_per_domain) - 1)
        for link in random.sample(links, n_links):
            yield scrapy.Request(link.url, self.parse_other)
        yield {'url': response.url, 'html': response.text}

    def parse_other(self, response):
        if not hasattr(response, 'text'):
            return
        yield {'url': response.url, 'html': response.text}
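The gist title promises a parsel HtmlParser benchmark, but the benchmark harness itself is not shown above. Below is a minimal sketch of how the collected pages could be timed with parsel; the file name bench_parsel.py and the choice of query are assumptions, not the author's actual harness:

# bench_parsel.py -- a minimal sketch of the benchmark implied by the gist
# title; file/function names here are assumptions, not the author's code.
import json
import time

from parsel import Selector

def bench(pages_path='pages.json'):
    # ``scrapy ... -o pages.json`` writes a JSON array of
    # {"url": ..., "html": ...} items, so json.load() reads it directly.
    with open(pages_path) as f:
        pages = json.load(f)

    start = time.time()
    n_links = 0
    for page in pages:
        sel = Selector(text=page['html'])       # parse the document
        n_links += len(sel.xpath('//a/@href'))  # a representative query
    elapsed = time.time() - start
    print('parsed %d pages (%d links) in %.2fs, %.1f pages/s' % (
        len(pages), n_links, elapsed, len(pages) / elapsed))

if __name__ == '__main__':
    bench()

Timing Selector construction plus one XPath query per page exercises both the HTML parsing step and tree traversal, which is roughly what an HtmlParser comparison needs; swapping parser backends and re-running would then give comparable numbers.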