Python ProxyGrabber using proxybroker v0.3.2
#!/usr/bin/python3.6
import asyncio
import logging
from os import path, remove, stat
from time import perf_counter

import aiohttp
from proxybroker import Broker


class ProxyGrabber:
    def __init__(self, timeout=3, filename="proxy.txt", site_to_test="http://example.org/"):
        self.site_to_test = site_to_test
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.filename = filename

    async def grab(self) -> str:
        result = await self.loader()
        if result is not None:
            return f"http://{result}"
        else:
            # no usable proxy in the file, so race both discovery methods
            done, _ = await asyncio.wait(
                [self.broker_find(), self.pub_find()],
                timeout=30,
                return_when=asyncio.FIRST_COMPLETED,
            )
            result = [task.result() for task in done if task.result() is not None]
            if result:
                return f"http://{result[-1]}"
            else:
                raise OSError("No proxy found")

    async def broker_find(self):
        proxies_num = 10
        try:
            proxies = asyncio.Queue()
            broker = Broker(queue=proxies)
            # finds 10 (== proxies_num) HTTPS proxies
            await broker.find(types=["HTTPS"], limit=proxies_num)
            proxy_temp = []
            for _ in range(proxies_num):
                # pull proxies off the queue as soon as they arrive
                proxy_temp.append(await proxies.get())
        except Exception as err:  # something might go wrong
            logging.error(f"Can't find a proxy with ProxyBroker: {type(err)}: {err}")
            return None
        else:
            # ProxyBroker returns lots of info, but we only need the host:port part
            proxy_temp = [str(i)[1:-1].split()[4] for i in proxy_temp]
            logging.debug(f"Found proxies with ProxyBroker: {proxy_temp}")
            return await self.saver(proxy_temp)

    async def pub_find(self):
        try:
            async with aiohttp.request(
                "GET",
                "http://pubproxy.com/api/proxy?limit=5&https=true&"
                "last_check=60&format=txt",
                timeout=self.timeout,
            ) as resp:
                assert resp.status == 200
                proxy_temp = await resp.text()
        except AssertionError:
            logging.warning("Probably, the pubproxy request limit has been reached")
            return None
        else:
            proxy_temp = proxy_temp.split("\n")
            logging.debug(f"Found 5 proxies: {proxy_temp}")
            return await self.saver(proxy_temp)

    async def check(self, proxy: str, session: aiohttp.ClientSession):
        # simply tests access to the site via the proxy
        site = self.site_to_test
        proxy = "http://" + proxy
        ping = perf_counter()
        try:
            async with session.get(site, proxy=proxy, timeout=self.timeout) as resp:
                assert resp.status == 200
        except asyncio.TimeoutError:
            logging.debug(f"Too slow proxy: {proxy}")
            return None
        except aiohttp.ClientHttpProxyError:
            logging.debug(f"Bad proxy: {proxy}")
            return None
        except Exception as err:
            logging.debug(f"This proxy ({proxy}) doesn't work, exception: {type(err)}:{err}")
            return None
        else:
            ping = perf_counter() - ping
            logging.debug(f"This one seems to be good! Proxy: {proxy} Ping: {ping}")
            return {"proxy": proxy, "ping": ping}

    async def saver(self, proxies_to_check: list):
        # it's better to use one session for all requests
        async with aiohttp.ClientSession() as session:
            # check all the proxies concurrently
            checked_proxies, _ = await asyncio.wait(
                [self.check(i, session) for i in proxies_to_check])
        # check() returns None for a bad proxy; keep only the good results
        proxies_to_save = [task.result() for task in checked_proxies if task.result()]
        if proxies_to_save:  # for the case when all the proxies are bad
            # sort proxies to find the fastest...
            proxies_to_save = sorted(proxies_to_save, key=lambda m: m["ping"])
            # ...and after that we don't need the ping value and the http:// prefix anymore
            proxies_to_save = [i.get("proxy")[7:] for i in proxies_to_save]
            with open(self.filename, "a+") as f:  # append the proxies to the file
                f.seek(0)  # "a+" opens at the end of the file, rewind before reading
                already_saved = {line.rstrip("\n") for line in f}
                # double-write protection: skip proxies already in the file
                new_proxies = [p for p in proxies_to_save if p not in already_saved]
                for proxy in new_proxies:
                    f.write(proxy + "\n")
            logging.info(f"Saved proxies to the file: {new_proxies}")
            return proxies_to_save[0]  # the fastest proxy
        else:
            return None

    async def loader(self):
        # almost the same as saver(), but it doesn't append new proxies to the file
        if not path.exists(self.filename):  # firstly, check that we have a file
            logging.warning("We don't have a proxy file!")
            return None
        elif stat(self.filename).st_size == 0:  # secondly, that it is not empty
            logging.warning("The proxy file is empty!")
            return None
        with open(self.filename, "r") as f:  # read the file and close it
            read_proxies = f.readlines()
        logging.debug("Everything is ok, opened the file with proxies...")
        read_proxies = [i.rstrip("\n") for i in read_proxies]  # delete the trailing \n
        # it's better to use one session for all requests
        async with aiohttp.ClientSession() as session:
            # check all the proxies concurrently
            checked_proxies = await asyncio.gather(
                *[self.check(i, session) for i in read_proxies])
        checked_proxies = [i for i in checked_proxies if i]  # drop all the None's
        if checked_proxies:  # for the case when all the proxies are bad
            # sort proxies to find the fastest
            checked_proxies = sorted(checked_proxies, key=lambda m: m["ping"])
            logging.info(f'Found fastest proxy {checked_proxies[0]["proxy"]} '
                         f'with ping {checked_proxies[0]["ping"]} sec')
            # after that we don't need the ping value and the http:// prefix anymore
            checked_proxies = [i.get("proxy")[7:] for i in checked_proxies]
            # open the file as "writable" to drop all the old content first
            with open(self.filename, "w") as f:
                for i in checked_proxies:
                    f.write(i + "\n")
            logging.debug(f"Wrote this list of proxies to the file: {checked_proxies}")
            return checked_proxies[0]
        else:
            logging.warning("No working proxy in the file!")
            remove(self.filename)
            return None


async def check_site(site: str, timeout=aiohttp.ClientTimeout(total=5)) -> bool:
    try:
        async with aiohttp.request("GET", site, timeout=timeout) as resp:
            assert resp.status == 200
            logging.debug(f"Internet seems to be connected. "
                          f"Response from {site}: {resp.status}")
    except Exception as err:
        logging.warning(f"Site {site} does not work: {type(err)}:{err}")
        return False
    else:
        return True
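

# --- Usage sketch (not part of the original gist) ---------------------------
# A minimal example of how the class above could be driven. The `main` coroutine
# name and the example URL are illustrative assumptions; it assumes proxybroker
# v0.3.2 and aiohttp are installed and runs on the python3.6 shebang above.
async def main():
    logging.basicConfig(level=logging.DEBUG)
    grabber = ProxyGrabber(timeout=3, filename="proxy.txt")
    if not await check_site("http://example.org/"):
        return  # no connectivity, so there is no point in hunting for proxies
    proxy = await grabber.grab()  # e.g. "http://1.2.3.4:8080"
    async with aiohttp.ClientSession() as session:
        async with session.get("http://example.org/", proxy=proxy) as resp:
            print(f"Request through {proxy} returned {resp.status}")


if __name__ == "__main__":
    # python3.6-compatible entry point (asyncio.run() only appeared in 3.7)
    asyncio.get_event_loop().run_until_complete(main())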