Skip to content

Instantly share code, notes, and snippets.

@miodeqqq
Created October 19, 2018 12:06
Show Gist options
  • Save miodeqqq/97ef6f8b4846dcefa07bfd16dc552790 to your computer and use it in GitHub Desktop.
Save miodeqqq/97ef6f8b4846dcefa07bfd16dc552790 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import logging
from random import randint
from time import sleep
from django.db.models import F
from requests import RequestException
from requests import get
from user_agent import generate_user_agent
from proxy.models import Proxy
from proxy.utils import get_proxies
logger = logging.getLogger(__name__)
class GetRequestException(Exception):
    """
    Raised when a GET response carries an HTTP status code that is not
    accepted, so that the request is repeated.
    """
class GetRequestResponse(object):
    """
    Implements generic logic for making GET requests through a proxy.

    An instance keeps the proxy settings (``proxies`` / ``proxy_obj``, both
    obtained from ``get_proxies()``) and the request headers used for the next
    request. Both are replaced after every failed attempt.
    """

    # Upper bound on the attempts made by ``get_request_response``.
    MAX_RETRIES = 50
    # Value passed to ``requests.get`` as ``timeout`` (seconds).
    REQUEST_TIMEOUT = 600
    # Inclusive bounds, in seconds, of the random pause between two attempts.
    RETRY_SLEEP_RANGE = (5, 15)
    # Status codes handed back to the caller; any other code triggers a retry.
    ACCEPTED_STATUS_CODES = (200, 403, 404, 502, 503)
    # Status codes that increment the current proxy's ``captcha_count``.
    BLOCKED_STATUS_CODES = (403, 407, 408)

    def __init__(self):
        self.proxies, self.proxy_obj = get_proxies()
        self.headers = {
            'User-Agent': generate_user_agent(),
        }

    def get_next_proxies(self):
        """
        Replaces the current proxies with the next ones from ``get_proxies()``.
        """
        self.proxies, self.proxy_obj = get_proxies()

    def get_next_headers(self):
        """
        Replaces the current User-Agent header with a newly generated one.
        """
        self.headers['User-Agent'] = generate_user_agent()

    def _increment_proxy_counters(self, *field_names):
        """
        Adds one to each named counter column of the current proxy's row.

        Best effort: usage statistics must never break a request, so the
        update is skipped when it cannot be performed.
        """
        try:
            Proxy.objects.filter(pk=self.proxy_obj.pk).update(
                **{name: F(name) + 1 for name in field_names}
            )
        except (Proxy.DoesNotExist, AttributeError):
            # AttributeError covers a missing proxy object
            # (``self.proxy_obj.pk`` fails when ``proxy_obj`` is None).
            pass

    def _fetch_once(self, url, stream, verify, params):
        """
        Performs a single GET request and returns the result dictionary.

        Raises ``GetRequestException`` when the status code is not one of
        ``ACCEPTED_STATUS_CODES``; exceptions raised by ``requests`` propagate.
        """
        response = get(
            url=url,
            verify=verify,
            params=params,
            stream=stream,
            headers=self.headers,
            proxies=self.proxies or None,
            timeout=self.REQUEST_TIMEOUT,
        )

        self._increment_proxy_counters('proxy_usage', 'proxy_service_usage_requests')

        logger.info('[get_request_response] URL --> %s', response.url)
        logger.info('[get_request_response] Status Code --> %s', response.status_code)

        # NOTE: reading ``response.content`` downloads the whole body, even
        # when ``stream`` is set.
        if b'access denied' in response.content.strip().lower():
            logger.info('[get_request_response] Access Denied ! Pubmed blocked us!')

        if response.status_code in self.BLOCKED_STATUS_CODES:
            self._increment_proxy_counters('captcha_count')

        if response.status_code not in self.ACCEPTED_STATUS_CODES:
            raise GetRequestException(
                'Received HTTP status code {} - expected one of {}. Repeating!'.format(
                    response.status_code, self.ACCEPTED_STATUS_CODES
                )
            )

        return {
            'response': response,
            'status_code': response.status_code,
            # ``proxies`` may be empty/None when no proxy is in use.
            'proxies': self.proxies.get('http') if self.proxies else None,
        }

    def get_request_response(self, url, stream=None, verify=None, params=None):
        """
        General method to make a GET request with given URL as argument.

        The request is attempted up to ``MAX_RETRIES`` times. After a failed
        attempt the proxies and the User-Agent header are replaced and the
        method pauses for a random number of seconds before trying again.

        :param url: URL to fetch; when ``None`` no request is made.
        :param stream: passed to ``requests.get`` as ``stream``.
        :param verify: passed to ``requests.get`` as ``verify``.
        :param params: passed to ``requests.get`` as ``params``.
        :return: dictionary with the keys ``response``, ``status_code`` and
            ``proxies`` (the ``http`` proxy in use, or ``None``); ``None``
            when ``url`` is ``None`` or every attempt failed.
        """
        if url is None:
            # Nothing to fetch - return right away instead of looping.
            return None

        from urllib3 import disable_warnings
        from urllib3.contrib import pyopenssl
        from urllib3.exceptions import InsecureRequestWarning

        disable_warnings(InsecureRequestWarning)
        pyopenssl.inject_into_urllib3()

        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                return self._fetch_once(url, stream, verify, params)
            except ConnectionError as c:
                logger.info('[get_request_response] Connection Error --> %s', c)
            except (RequestException, GetRequestException) as e:
                logger.info('[get_request_response] Error --> %s', e)

            # Every failure consumes an attempt, so the loop is always bounded.
            self.get_next_proxies()
            self.get_next_headers()

            if attempt < self.MAX_RETRIES:
                # No point in pausing when there is no further attempt.
                sleep(randint(*self.RETRY_SLEEP_RANGE))

        logger.warning(
            '[get_request_response] Giving up on %s after %s attempts',
            url, self.MAX_RETRIES
        )
        return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment