Last active
April 27, 2018 13:02
-
-
Save iAnanich/99d9b22f3d5e0288906c353663b1c3ff to your computer and use it in GitHub Desktop.
Scrapy httpbin.org/anything spider for benchmarking purposes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# put it with settings.py and spiders folder | |
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
def update_query(url: str, query_dict: dict) -> str:
    """Return *url* with its query string merged with *query_dict*.

    Existing query parameters are kept; keys present in *query_dict*
    override them. Values are re-encoded with ``urlencode``.

    :param url: the URL to update.
    :param query_dict: parameters to add or override.
    :return: the rebuilt URL.
    """
    QUERY_INDEX = 4  # index of the query component in urlparse()'s 6-tuple
    components = list(urlparse(url))
    # parse_qsl correctly handles percent-encoding, '=' inside values, and
    # bare keys, unlike a naive split on '&' and '='.  keep_blank_values
    # preserves entries like 'a=' as {'a': ''}.
    merged = dict(parse_qsl(components[QUERY_INDEX], keep_blank_values=True))
    merged.update(query_dict)
    components[QUERY_INDEX] = urlencode(merged)
    return urlunparse(components)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from ..common import update_query | |
class HttpbinSpider(scrapy.Spider):
    """Benchmark spider against httpbin.org/anything.

    Fans out BREADTH_FACTOR first-level requests; each response then fans
    out DEEP_FACTOR second-level requests.  Items record the download
    latency of each hop so total round-trip time can be analysed.
    """

    name = 'httpbin'
    allowed_domains = ['httpbin.org']

    BASE_URL = 'http://httpbin.org/anything'
    DEEP_FACTOR = 16
    BREADTH_FACTOR = 16

    def start_requests(self):
        # One top-level request per breadth slot; the slot index travels
        # both in the request meta and in the URL query string.
        for index in range(self.BREADTH_FACTOR):
            meta = {'breadth': index}
            yield scrapy.Request(
                update_query(self.BASE_URL, meta),
                meta=meta,
                callback=self.parse_breadth,
            )

    def parse_breadth(self, response):
        breadth = response.meta['breadth']
        latency = response.meta['download_latency']
        # Emit the first-hop measurement, then fan out the deep requests.
        yield {'breadth': breadth, 'breadth_latency': latency}
        for level in range(self.DEEP_FACTOR):
            meta = {
                'breadth': breadth,
                'breadth_latency': latency,
                'deep': level,
            }
            yield scrapy.Request(
                update_query(response.url, meta),
                meta=meta,
                callback=self.parse_deep,
            )

    def parse_deep(self, response):
        first_hop = response.meta['breadth_latency']
        yield {
            'breadth': response.meta['breadth'],
            'deep': response.meta['deep'],
            'total_latency': response.meta['download_latency'] + first_hop,
            'breadth_latency': first_hop,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.