Skip to content

Instantly share code, notes, and snippets.

@palewire
Last active July 12, 2025 07:35
Show Gist options
  • Save palewire/0dded073b8f9aa9202ca2f364e664568 to your computer and use it in GitHub Desktop.
Rotating proxy scraper example
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@JakeMofa
Copy link

Rotating proxy scraper example
By Ben Welsh

An example of how to scrape a list of available proxies and use them to make web requests. Helpful when scraping sites that employ measures to restrict access.

import requests
import itertools
from bs4 import BeautifulSoup
Get the proxy list from free-proxy-list.net
def get_proxies(limit=75, timeout=30):
    """Fetch a set of working proxy addresses from free-proxy-list.net.

    Args:
        limit: Maximum number of table rows to inspect (default 75,
            matching the original behavior).
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        A set of "ip:port" strings for proxies the site marks as working.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
    """
    # Fetch the page with the list. A timeout is essential here: without
    # one, a stalled connection would hang the scraper indefinitely.
    r = requests.get('https://free-proxy-list.net/', timeout=timeout)

    # Set it up in BeautifulSoup for parsing
    soup = BeautifulSoup(r.text, "html.parser")

    # Use a set so duplicate ip:port pairs are collapsed automatically
    proxies = set()

    # Loop through the first `limit` rows in the table we want to scrape
    for row in soup.find("tbody").find_all('tr')[:limit]:

        # If it is listed as a working proxy ...
        # (crude but effective: the "working" flag renders as the text
        # "yes" somewhere in the row's markup)
        if 'yes' in str(row):
            # ... parse out the IP and port from the first two cells
            cell_list = row.find_all("td")
            ip = cell_list[0].string
            port = cell_list[1].string

            # Guard against malformed rows: .string is None when a cell
            # has nested markup, which would otherwise yield "None:None"
            if ip and port:
                proxies.add("{}:{}".format(ip, port))

    # Return the set of proxies
    return proxies

proxy_list = get_proxies()
proxy_list
{'103.204.210.112:8080',
'103.240.109.171:53281',
'103.42.253.218:8080',
'103.57.71.109:53281',
'110.77.188.103:62225',
'110.77.239.83:42619',
'111.67.71.238:53281',
'114.134.187.162:53281',
'121.166.157.33:8080',
'121.52.157.23:8080',
'138.186.21.86:53281',
'138.204.142.139:31773',
'139.5.153.86:53281',
'145.249.105.25:8118',
'145.255.28.218:53281',
'160.119.153.206:13093',
'170.84.51.74:53281',
'177.206.131.128:53281',
'177.67.217.14:53281',
'178.176.28.164:8080',
'179.191.87.158:53281',
'181.112.145.222:53281',
'181.112.34.222:53281',
'181.112.46.250:53281',
'181.192.30.222:53281',
'182.253.130.174:53281',
'182.253.37.116:3128',
'186.46.90.50:53281',
'188.126.63.203:41258',
'189.43.88.18:53281',
'190.128.158.54:53281',
'192.141.118.255:53281',
'193.107.247.98:53281',
'200.58.214.114:8080',
'201.166.181.8:53281',
'202.142.164.22:53281',
'213.192.75.138:53281',
'27.255.40.63:8080',
'31.41.89.73:41258',
'36.83.72.178:80',
'37.60.215.133:53281',
'38.123.68.72:8080',
'5.228.166.234:53281',
'5.9.70.215:808',
'62.213.14.166:8080',
'77.85.169.2:8080',
'78.156.49.26:41258',
'78.189.65.220:8080',
'80.254.102.220:3128',
'81.163.50.192:41258',
'81.30.216.147:41258',
'81.95.139.186:53281',
'85.117.77.75:53281',
'89.110.59.227:8080',
'89.255.71.162:53281',
'89.43.38.32:8080',
'91.224.63.218:8080',
'91.230.252.163:3128',
'92.247.93.142:8080',
'95.47.83.56:44331'}
Convert it into a pool that will randomly return items forever
proxy_pool = itertools.cycle(proxy_list)
next(proxy_pool)
'89.43.38.32:8080'
next(proxy_pool)
'138.186.21.86:53281'
next(proxy_pool)
'91.230.252.163:3128'
Create a similar pool of user agents
# A pool of browser user-agent strings to rotate through, so successive
# requests do not all advertise the same client.
useragent_list = [
# Chrome on Windows/Linux
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# Internet Explorer (MSIE / Trident) — note: these are NOT Firefox
# strings despite what earlier copies of this snippet claimed
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]
# An endless iterator that yields the user agents round-robin forever
useragent_pool = itertools.cycle(useragent_list)
next(useragent_pool)
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
next(useragent_pool)
<itertools.cycle at 0x7f8acc3d01b8>
next(useragent_pool)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment