Created
October 19, 2018 12:06
-
-
Save miodeqqq/97ef6f8b4846dcefa07bfd16dc552790 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import logging
from random import randint
from time import sleep

from django.db.models import F
from requests import RequestException
from requests import get
from user_agent import generate_user_agent

from proxy.models import Proxy
from proxy.utils import get_proxies
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class GetRequestException(Exception):
    """
    Custom exception raised when a GET request yields an unacceptable result.

    It carries no extra state beyond the message passed to ``Exception``.
    """
class GetRequestResponse(object):
    """
    Implements generic logic for making all requests.

    An instance keeps the proxy settings returned by ``get_proxies()`` and a
    headers dict holding a generated ``User-Agent``. Both are replaced after
    every failed attempt made by ``get_request_response``.
    """

    def __init__(self):
        self.proxies, self.proxy_obj = get_proxies()
        self.headers = {
            'User-Agent': generate_user_agent(),
        }

    def get_next_proxies(self):
        """
        Replaces the current proxies with a fresh result of ``get_proxies()``.
        """
        self.proxies, self.proxy_obj = get_proxies()

    def get_next_headers(self):
        """
        Replaces the ``User-Agent`` header with a newly generated one.
        """
        self.headers['User-Agent'] = generate_user_agent()

    def _increment_proxy_counters(self, *field_names):
        """
        Adds 1 to each named ``Proxy`` column of the proxy currently in use.

        The increment is written as ``F(name) + 1`` so the arithmetic is done
        by the database inside the UPDATE statement. The call is best-effort:
        it never raises towards the caller.
        """
        try:
            Proxy.objects.filter(pk=self.proxy_obj.pk).update(
                **{name: F(name) + 1 for name in field_names}
            )
        except (Proxy.DoesNotExist, AttributeError):
            # AttributeError covers a proxy_obj without ``pk``; presumably
            # get_proxies() yields None when no proxy is in use -- TODO confirm.
            pass

    def get_request_response(self, url, stream=None, verify=None, params=None,
                             timeout=600, max_retries=50):
        """
        General method to make a GET request with the given URL.

        The request is repeated with new proxies and a new User-Agent until an
        accepted status code is received or ``max_retries`` attempts were made.

        :param url: address to fetch; ``None`` short-circuits to ``None``.
        :param stream: forwarded to ``requests.get``.
        :param verify: forwarded to ``requests.get``.
        :param params: forwarded to ``requests.get``.
        :param timeout: forwarded to ``requests.get`` (seconds).
        :param max_retries: maximum number of attempts.
        :return: dict with the keys ``response``, ``status_code`` and
            ``proxies`` (the ``'http'`` entry of the proxies used, or ``None``
            when no proxies are set); ``None`` when there is no URL or every
            attempt failed.
        """
        # Nothing can be fetched: answer right away instead of rotating proxies
        # and sleeping through every retry only to return None at the end.
        if url is None or max_retries < 1:
            return None

        from urllib3 import disable_warnings
        from urllib3.contrib import pyopenssl
        from urllib3.exceptions import InsecureRequestWarning

        disable_warnings(InsecureRequestWarning)
        pyopenssl.inject_into_urllib3()

        accepted_status_codes = (200, 403, 404, 502, 503)

        for attempt in range(1, max_retries + 1):
            try:
                response = get(
                    url=url,
                    verify=verify,
                    params=params,
                    stream=stream,
                    headers=self.headers,
                    proxies=self.proxies if self.proxies else None,
                    timeout=timeout
                )

                self._increment_proxy_counters(
                    'proxy_usage', 'proxy_service_usage_requests')

                logger.info('[get_request_response] URL --> %s', response.url)
                logger.info('[get_request_response] Status Code --> %s', response.status_code)

                if b'access denied' in response.content.strip().lower():
                    logger.info('[get_request_response] Access Denied ! Pubmed blocked us!')

                if response.status_code in (403, 407, 408):
                    self._increment_proxy_counters('captcha_count')

                if response.status_code not in accepted_status_codes:
                    raise GetRequestException(
                        'Received HTTP status code {} outside {} - sad! Repeating!'.format(
                            response.status_code, accepted_status_codes))

                return {
                    'response': response,
                    'status_code': response.status_code,
                    # proxies may be empty/None (see the request above), so the
                    # lookup must not assume a dict.
                    'proxies': self.proxies.get('http') if self.proxies else None
                }

            except ConnectionError as c:
                # Built-in ConnectionError; the one defined by requests derives
                # from RequestException and lands in the handler below.
                logger.info('[get_request_response] Connection Error --> %s', c)

            except (RequestException, GetRequestException) as e:
                logger.info('[get_request_response] Error --> %s', e)

            # Shared by both handlers so that every failure counts towards the
            # retry limit and is followed by a change of identity.
            self.get_next_proxies()
            self.get_next_headers()

            # Back off before the next attempt; pointless after the last one.
            if attempt < max_retries:
                sleep(randint(5, 15))

        logger.info('[get_request_response] Giving up on %s after %s attempts', url, max_retries)
        return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment