Skip to content

Instantly share code, notes, and snippets.

@sakydev
Last active March 1, 2023 12:34
Show Gist options
  • Save sakydev/d03865dca7adfd75c0aa3693f695e848 to your computer and use it in GitHub Desktop.
Save sakydev/d03865dca7adfd75c0aa3693f695e848 to your computer and use it in GitHub Desktop.
py+scrapy: simple proxy rotation with python
import random, logging
from datetime import datetime, timedelta, date
class ProxiesRotation(object):
    """Hand out proxies from a file-backed pool, then rotate through them.

    The pool is read from ``<directory>/proxies.txt`` and a deny-list from
    ``<directory>/forbidden_proxies.txt`` (one proxy per line).

    Selection works in two phases:

    1. While unused, non-forbidden proxies remain, a random one of them is
       handed out and recorded in ``usedProxies`` with ``hits`` set to 1.
    2. Once none remain, ``allused`` is set and the least recently selected
       proxy is reused. Proxies whose ``hits`` reached ``maxProxyHits`` or
       that are forbidden are dropped from the rotation instead.

    ``usedProxies`` maps a selection timestamp to
    ``{'proxy': str, 'hits': int}``; ``currentKey`` is the key of the entry
    for ``currentProxy``. ``hits`` only grows when the caller invokes
    ``incrementProxyHits()``.
    """

    def __init__(self, directory):
        """Create an empty rotation; ``directory`` holds the proxy list files."""
        super().__init__()
        self.proxies = []
        self.forbidden_proxies = []
        self.directory = directory
        self.usedProxies = {}
        self.currentProxy = ''
        # Key into usedProxies of the current proxy; None until one is selected.
        self.currentKey = None
        self.maxProxyHits = 20
        self.allused = False
        self.logger = logging.getLogger()
        self.totalUniqueProxies = 0

    @staticmethod
    def _readLines(path):
        """Return the non-blank lines of ``path`` with surrounding whitespace removed."""
        with open(path) as handle:
            stripped = (line.strip() for line in handle)
            return [line for line in stripped if line]

    @staticmethod
    def _outOfProxies(message):
        """Build the exception that signals the pool is exhausted.

        Uses Scrapy's ``CloseSpider`` when Scrapy is importable, so a running
        spider shuts down cleanly; falls back to ``RuntimeError`` otherwise.
        """
        try:
            from scrapy.exceptions import CloseSpider
        except ImportError:
            return RuntimeError(message)
        return CloseSpider(message)

    def _remember(self, proxy, hits):
        """Record ``proxy`` as the current one under a fresh, unique timestamp key."""
        key = datetime.now().timestamp()
        # Two selections within the same clock tick must not overwrite each
        # other, and rotation order relies on newer entries having larger keys.
        while key in self.usedProxies:
            key += 1e-6
        self.currentKey = key
        self.currentProxy = proxy
        self.usedProxies[key] = {'proxy': proxy, 'hits': hits}

    def getProxies(self):
        """Load, shuffle, store and return the proxies from ``proxies.txt``."""
        proxies = self._readLines(self.directory + '/proxies.txt')
        random.shuffle(proxies)
        self.proxies = proxies
        return proxies

    def getForbiddenProxies(self):
        """Load, store and return the deny-list from ``forbidden_proxies.txt``."""
        proxies = self._readLines(self.directory + '/forbidden_proxies.txt')
        self.forbidden_proxies = proxies
        return proxies

    def getProxy(self):
        """Select the next proxy and return it (also prints it with its hit count)."""
        self.getNewProxy()
        return self.getCurrentProxy()

    def getCurrentProxyHits(self):
        """Return the hit count of the current proxy, or 0 if none is selected."""
        entry = self.usedProxies.get(self.currentKey)
        return entry['hits'] if entry else 0

    def getCurrentProxy(self):
        """Print the current proxy with its hit count and return the proxy."""
        print(f"<<<<< Proxy {self.currentProxy} [{self.getCurrentProxyHits()}] >>>>>")
        return self.currentProxy

    def incrementProxyHits(self):
        """Add one hit to the current proxy.

        Raises:
            KeyError: if no proxy has been selected yet.
        """
        self.usedProxies[self.currentKey]['hits'] += 1

    def getNewProxy(self):
        """Select the next proxy, store it as current, and return it.

        Raises:
            CloseSpider (``RuntimeError`` when Scrapy is not installed): when
            the rotation holds fewer than two proxies and no fresh one is
            available.
        """
        forbidden = set(self.forbidden_proxies)
        gracelimit = len(self.proxies) - 1

        if not self.allused:
            # usedProxies is keyed by timestamp, so membership has to be
            # tested against the stored proxy strings, not the dict keys.
            inUse = {entry['proxy'] for entry in self.usedProxies.values()}
            fresh = [
                proxy for proxy in self.proxies
                if proxy not in inUse and proxy not in forbidden
            ]
            if fresh and len(self.usedProxies) <= gracelimit:
                self.totalUniqueProxies += 1
                self._remember(random.choice(fresh), 1)
                return self.currentProxy
            # Nothing fresh is left: switch to reuse mode for good.
            self.allused = True

        while True:
            totalused = len(self.usedProxies)
            if totalused < 2:
                self.logger.critical("No more ips to be used")
                raise self._outOfProxies("No more ips to be used")
            self.logger.debug("All used, %s > %s", totalused, gracelimit)

            # Smallest timestamp == least recently selected proxy.
            oldestKey = min(self.usedProxies)
            oldest = self.usedProxies.pop(oldestKey)
            if oldest['hits'] >= self.maxProxyHits or oldest['proxy'] in forbidden:
                # Worn out or banned: leave it out of the rotation, try the next.
                continue

            self._remember(oldest['proxy'], oldest['hits'])
            self.logger.info(
                "XXXXXXXX ____ Selected %s with %s _______ XXXXXXXX",
                oldest['proxy'], oldest['hits'],
            )
            return self.currentProxy

    def getRandomProxy(self):
        """Return a random proxy from the loaded pool (may be used or forbidden)."""
        return random.choice(self.proxies)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment