Last active
March 1, 2023 12:34
-
-
Save sakydev/d03865dca7adfd75c0aa3693f695e848 to your computer and use it in GitHub Desktop.
py+scrapy: simple proxy rotation with python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import random
from datetime import date, datetime, timedelta
try:
    # The rotation signals an exhausted pool by raising Scrapy's CloseSpider.
    # The name was previously used without being imported (NameError).
    from scrapy.exceptions import CloseSpider
except ImportError:  # Scrapy is not installed; keep the class usable anyway.
    class CloseSpider(Exception):
        """Fallback raised when no usable proxy is left and Scrapy is absent."""


class ProxiesRotation(object):
    """Hand out proxies from a text file, rotating and capping their usage.

    Selection happens in two phases:

    1. Fresh phase: each call to ``getNewProxy`` picks a random proxy that has
       not been handed out yet and is not forbidden.
    2. Recycle phase (``allused`` is True): the least recently selected proxy
       is reused, unless it reached ``maxProxyHits`` or is forbidden, in which
       case it is dropped from the pool.

    ``usedProxies`` maps a selection timestamp to ``{'proxy': str, 'hits': int}``;
    the smallest key is therefore the least recently selected proxy.
    """

    def __init__(self, directory):
        """Create an empty rotation.

        Args:
            directory: Folder containing ``proxies.txt`` and
                ``forbidden_proxies.txt`` (one proxy per line).
        """
        super().__init__()
        self.proxies = []
        self.forbidden_proxies = []
        self.directory = directory
        self.usedProxies = {}
        self.currentProxy = ''
        # Key into usedProxies of the current proxy; None until one is selected.
        self.currentKey = None
        self.maxProxyHits = 20
        self.allused = False
        self.logger = logging.getLogger()
        self.totalUniqueProxies = 0

    def _readProxyFile(self, filename):
        """Return the non-blank, whitespace-stripped lines of *filename*."""
        with open(self.directory + '/' + filename, encoding='utf-8') as handle:
            stripped = (line.strip() for line in handle)
            return [line for line in stripped if line]

    def getProxies(self):
        """Load ``proxies.txt``, shuffle it, store it and return the list."""
        proxies = self._readProxyFile('proxies.txt')
        random.shuffle(proxies)
        self.proxies = proxies
        return proxies

    def getForbiddenProxies(self):
        """Load ``forbidden_proxies.txt``, store it and return the list."""
        proxies = self._readProxyFile('forbidden_proxies.txt')
        self.forbidden_proxies = proxies
        return proxies

    def getProxy(self):
        """Select the next proxy and return it.

        Raises:
            CloseSpider: When no usable proxy is left.
        """
        self.getNewProxy()
        return self.getCurrentProxy()

    def getCurrentProxyHits(self):
        """Return the hit count of the current proxy (0 if none is selected)."""
        entry = self.usedProxies.get(self.currentKey)
        return entry['hits'] if entry else 0

    def getCurrentProxy(self):
        """Print and return the currently selected proxy."""
        print(f"<<<<< Proxy {self.currentProxy} [{self.getCurrentProxyHits()}] >>>>>")
        return self.currentProxy

    def incrementProxyHits(self):
        """Count one more use of the current proxy; no-op if none is selected."""
        entry = self.usedProxies.get(self.currentKey)
        if entry is not None:
            entry['hits'] += 1

    def _nextKey(self):
        """Return a timestamp key strictly greater than every existing key.

        Two selections within the same clock tick (or a clock stepping
        backwards) would otherwise collide and silently overwrite or delete a
        pool entry, and would break the "smallest key is oldest" ordering.
        """
        key = datetime.now().timestamp()
        if self.usedProxies:
            newest = max(self.usedProxies)
            if key <= newest:
                key = newest + 1e-6
        return key

    def _unusedProxies(self):
        """Return proxies that were never handed out and are not forbidden."""
        taken = {entry['proxy'] for entry in self.usedProxies.values()}
        blocked = set(self.forbidden_proxies)
        return [p for p in self.proxies if p not in taken and p not in blocked]

    def getNewProxy(self):
        """Select the next proxy, store it as current and return it.

        Returns:
            The selected proxy string (also available as ``currentProxy``).

        Raises:
            CloseSpider: When fewer than two proxies remain in the pool during
                the recycle phase, i.e. there is nothing left to rotate to.
        """
        while True:
            totalused = len(self.usedProxies)
            gracelimit = len(self.proxies) - 1

            if not self.allused and totalused <= gracelimit:
                candidates = self._unusedProxies()
                if candidates:
                    proxy = self.getRandomProxy(candidates)
                    self.totalUniqueProxies += 1
                    key = self._nextKey()
                    self.currentKey = key
                    self.currentProxy = proxy
                    self.usedProxies[key] = {'proxy': proxy, 'hits': 1}
                    return self.currentProxy
                # Everything not yet handed out is forbidden (or a duplicate):
                # fall through to recycling instead of retrying forever.
                self.allused = True
                continue

            self.allused = True
            if totalused < 2:
                self.logger.critical("No more ips to be used")
                raise CloseSpider("No more ips to be used")
            self.logger.debug("All used, %s > %s", totalused, gracelimit)

            # Take the least recently selected proxy out of the pool first, so
            # re-inserting it under a new key can never clash with its old one.
            oldestKey = min(self.usedProxies)
            current = self.usedProxies.pop(oldestKey)
            if (current['hits'] >= self.maxProxyHits
                    or current['proxy'] in self.forbidden_proxies):
                continue  # exhausted or forbidden: stays removed

            key = self._nextKey()
            self.currentKey = key
            self.usedProxies[key] = {'proxy': current['proxy'], 'hits': current['hits']}
            self.currentProxy = current['proxy']
            self.logger.info(
                "XXXXXXXX ____ Selected %s with %s _______ XXXXXXXX",
                current['proxy'], current['hits'],
            )
            return self.currentProxy

    def getRandomProxy(self, candidates=None):
        """Return a random proxy from *candidates* (default: all loaded proxies)."""
        return random.choice(self.proxies if candidates is None else candidates)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment