Last active
October 23, 2019 15:36
-
-
Save NicolasBizzozzero/e26fb23ce92daa642f0729648b51ce1e to your computer and use it in GitHub Desktop.
Example of using Python's requests library with round-robin (RR) proxy cycling.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
import requests | |
from lxml.html import fromstring | |
def main():
    """Fetch ``URL`` forever through a round-robin pool of proxies.

    Each pass of the outer loop asks ``get_proxies`` for a fresh batch of
    ``BATCH_SIZE_PROXIES`` proxies and cycles through it for
    ``BATCH_SIZE_PROXIES * CYCLE_TIME`` requests, printing the body of every
    successful response. A proxy whose request fails is remembered and
    skipped for the rest of the run, including in later batches.

    The function never returns normally; stop it with Ctrl-C.

    Raises:
        RuntimeError: if ``get_proxies`` returns no proxies, since there is
            nothing to cycle through.
    """
    URL = 'https://example.org'
    BATCH_SIZE_PROXIES = 10
    CYCLE_TIME = 3
    TIMEOUT = 3

    # A set gives O(1) membership tests and ignores repeated failures.
    useless_proxies = set()
    while True:
        proxies = get_proxies(n_proxies=BATCH_SIZE_PROXIES)
        if not proxies:
            # itertools.cycle() over an empty iterable makes next() raise a
            # bare StopIteration; fail with an explicit message instead.
            raise RuntimeError("No proxies could be retrieved.")

        proxy_pool = itertools.cycle(proxies)
        for _ in range(BATCH_SIZE_PROXIES * CYCLE_TIME):
            proxy = next(proxy_pool)
            if proxy in useless_proxies:
                continue
            try:
                response = requests.get(
                    URL,
                    proxies={"http": proxy, "https": proxy},
                    timeout=TIMEOUT,
                )
            except requests.exceptions.RequestException:
                # Dead proxies fail in several ways (ProxyError, timeouts,
                # connection resets, TLS errors); all of them share this
                # base class, so treat any of them as "proxy is unusable".
                useless_proxies.add(proxy)
            else:
                print(response.text)
def get_proxies(n_proxies: int = 10, timeout: float = 10.0) -> set:
    """Retrieve a maximum of `n_proxies` from the 'free-proxy-list.net' website.

    Only table rows whose seventh cell contains the text "yes" are kept
    (presumably the site's HTTPS column -- verify against the page layout).

    Args:
        n_proxies: Upper bound on the number of distinct proxies returned.
            A value of zero or less yields an empty set.
        timeout: Seconds to wait for the proxy-list page before giving up.

    Returns:
        A set of ``"ip:port"`` strings; possibly fewer than `n_proxies`
        (or empty) when the page lists fewer matching rows.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            downloaded or the server answers with an HTTP error status.
    """
    URL = 'https://free-proxy-list.net/'
    # Without a timeout, requests may block forever on an unresponsive host.
    response = requests.get(URL, timeout=timeout)
    response.raise_for_status()
    parser = fromstring(response.text)

    proxies = set()
    for line in parser.xpath('//tbody/tr'):
        # Count distinct proxies actually collected, so a duplicated row
        # does not use up part of the quota.
        if len(proxies) >= n_proxies:
            break
        if not line.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        # Grabbing IP and corresponding port
        ip = line.xpath('.//td[1]/text()')
        port = line.xpath('.//td[2]/text()')
        if not ip or not port:
            # Row with an empty cell: skip it rather than fail on [0].
            continue
        proxies.add(":".join([ip[0], port[0]]))
    return proxies
# Start the proxy-cycling loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment