-
-
Save IAlwaysBeCoding/438464eec197f546ad1b707dba1f7551 to your computer and use it in GitHub Desktop.
Scrapy downloader middleware class for handling IPv6-only hosts.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*- | |
from __future__ import absolute_import, division, print_function | |
import socket | |
from urlparse import urlparse | |
class DnsResolverMiddleware(object):
    """
    Downloader middleware for host-name resolution.

    It resolves a domain name to an IPv4/IPv6 address and rewrites the
    request URL to use the raw address (originally designed for
    IPv6-only hosts).

    Algorithm:
        1) Replace the domain name with its resolved address in the
           request URL, keeping the original name in a ``Host`` header
           so the remote server can still route the request.
        2) Form a new request and let the Scrapy engine perform it.
        3) On the way back, replace the address with the domain name in
           the response (and request) URL and return a new response.

    Name<->address pairs discovered so far are remembered in
    ``dns_dict``, which maps both directions
    (domain -> [addresses] and address -> [domains]).

    It works without any real DNS caching, so it may be quite slow.

    TODO:
        * Add caching with proper TTL handling [?].
    """

    # Lazily-created bidirectional map: domain -> [addresses] and
    # address -> [domains].  Initialized on first use per instance.
    dns_dict = None

    def process_request(self, request, spider):
        """
        Replace the domain name with its address in the request URL.

        Also rewrites the ``Referer`` header (which may still carry an
        address from a previously resolved request) back to the domain
        form, and sets a ``Host`` header with the original name.

        :param request: the request being processed;
        :param spider: unused; kept for the middleware interface;
        :return: None (already resolved) or a new request with the
            address-based URL.
        """
        meta = getattr(request, 'meta', None) or dict()
        if not self.dns_dict:
            self.dns_dict = dict()
        # Guard against re-processing our own replacement request
        # (which would loop forever).
        if meta.get('resolved_request'):
            return None
        domain_url = request.url
        # Reuse domain/address if a previous pass stored them in meta.
        domain = meta.get('domain', '')
        address = meta.get('address', '')
        # BUGFIX: the original tested ``not address or not address``,
        # so a missing domain alone never triggered resolution.
        if not domain or not address:
            domain = self.parse_host(domain_url)
            address = self.compute_address(domain)
        # Register the pair in both directions for later reverse lookup.
        self.dns_dict.setdefault(domain, [])
        self.dns_dict.setdefault(address, [])
        if address not in self.dns_dict[domain]:
            self.dns_dict[domain].append(address)
        if domain not in self.dns_dict[address]:
            self.dns_dict[address].append(domain)
        # URL with the address substituted for the domain.
        address_url = self.convert_to_address(domain_url)
        headers = request.headers or dict()
        # BUGFIX: read the Referer from the sanitized local ``headers``,
        # not from ``request.headers`` (which may be falsy/None).
        address_referer = headers.get('Referer', '')
        # Referer rewritten back to the domain form.
        domain_referer = self.convert_to_domain(address_referer)
        # ``Host`` carries the original name so that name-based virtual
        # hosting on the requested server keeps working.
        new_headers = dict(headers, Host=domain)
        if domain_referer:
            new_headers['Referer'] = domain_referer
        new_meta = dict(
            meta,
            # Mark this request as processed to break the loop.
            resolved_request=True,
            # Auxiliary data: avoids recomputation and helps debugging.
            dns_dict=self.dns_dict,
            domain=domain,
            address=address,
            url=dict(
                domain=domain_url,
                address=address_url,
            ),
            referer=dict(
                domain=domain_referer,
                address=address_referer,
            ),
        )
        return request.replace(
            url=address_url,
            headers=new_headers,
            meta=new_meta,
        )

    def process_response(self, request, response, spider):
        """
        Replace the address with the domain name in the response URL.

        1) Gets the known name/address mapping via ``dns_dict``.
        2) Rewrites the request URL address -> domain.
        3) Rewrites the response URL address -> domain.
        4) Returns the new response.

        :param request: the request that originated the response;
        :param response: the response being processed;
        :param spider: unused; kept for the middleware interface;
        :return: a response with domain-based URLs.
        """
        # Meta travels on the request, not the response.
        meta = getattr(request, 'meta', None) or dict()
        if meta.get('resolved_response'):
            # BUGFIX: a downloader middleware's ``process_response``
            # must return a Response (or Request), never None.
            return response
        # URL with the domain substituted for the address.
        new_url = self.convert_to_domain(response.url)
        new_meta = dict(
            meta,
            # Mark this response as processed.
            resolved_response=True,
        )
        new_request = request.replace(
            url=new_url,
            meta=new_meta,
        )
        # BUGFIX: the original discarded the result of
        # ``response.replace`` (responses are immutable) and returned
        # the unmodified response, so the rewrite never took effect.
        return response.replace(
            url=new_url,
            request=new_request,
        )

    def convert_to_address(self, url):
        """
        Replace the host's domain name with every known address in *url*.

        This function is a stub for caching.

        :param str url: original url;
        :return: the url with the domain replaced (unchanged when the
            domain is unknown).
        :rtype: str
        """
        host = self.parse_host(url)
        # ``or {}`` guards against use before ``dns_dict`` is created.
        for address in (self.dns_dict or {}).get(host, []):
            url = url.replace(host, address)
        return url

    def convert_to_domain(self, url):
        """
        Replace the host's address with every known domain name in *url*.

        This function is a stub for caching.

        :param str url: original url;
        :return: the url with the address replaced (unchanged when the
            address is unknown).
        :rtype: str
        """
        host = self.parse_host(url)
        # ``or {}`` guards against use before ``dns_dict`` is created.
        for domain in (self.dns_dict or {}).get(host, []):
            url = url.replace(host, domain)
        return url

    def parse_host(self, url):
        """
        Return the host part of *url*.

        NOTE: ``urlparse(...).netloc`` also includes an explicit port
        and credentials when present in the url.

        :param str url: source url with an IP address or a domain;
        :return: host of the given url.
        :rtype: str
        """
        return urlparse(url).netloc

    def compute_address(self, host_name):
        """
        Return an http-compatible IP address of the given host.

        This function is a stub for caching.

        :param str host_name: symbolic name of the host;
        :return: string with the IP address.
        :rtype: str
        """
        return self._compute_address(host_name)

    def _compute_address(self, host_name):
        """
        Return an http-compatible IP address of the given host.

        For IPv6 the address is wrapped in square brackets so it can be
        embedded in a url.  Falls back to the host name itself when no
        INET/INET6 record is found.

        :param str host_name: symbolic name of the host;
        :return: string with the IP address.
        :rtype: str
        """
        for family, _socktype, _proto, _canonname, sockaddr in \
                socket.getaddrinfo(host_name, 0):
            if family == socket.AF_INET6:
                # IPv6 sockaddr is (address, port, flow_info, scope_id).
                return '[{address}]'.format(address=sockaddr[0])
            elif family == socket.AF_INET:
                # IPv4 sockaddr is (address, port).
                return sockaddr[0]
        return host_name
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment