Last active
June 1, 2021 23:51
-
-
Save palewire/0dded073b8f9aa9202ca2f364e664568 to your computer and use it in GitHub Desktop.
Rotating proxy scraper example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Rotating proxy scraper example\n", | |
"\n", | |
"By [Ben Welsh](http://palewi.re/who-is-ben-welsh/)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"An example of how to scrape a list of available proxies and use them to make web requests. Helpful when scraping sites that employ measures to restrict access." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import itertools\n", | |
"from bs4 import BeautifulSoup" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Get proxy list from free-proxy-list.net" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_proxies():\n", | |
" \"\"\"\n", | |
" Fetch a list of proxy addresses from the web.\n", | |
" \"\"\"\n", | |
" # Fetch the page with the list\n", | |
" r = requests.get('https://free-proxy-list.net/')\n", | |
"\n", | |
" # Set it up in BeautifulSoup for parsing\n", | |
" soup = BeautifulSoup(r.text, \"html.parser\")\n", | |
"\n", | |
" # Initialize a blank list to use later\n", | |
" proxies = set()\n", | |
"\n", | |
" # Loop through all the rows in the table we want to scrape\n", | |
" for row in soup.find(\"tbody\").find_all('tr')[:75]:\n", | |
"\n", | |
" # If it is listed as a working proxy ...\n", | |
" if 'yes' in str(row):\n", | |
" # ... parse out the IP\n", | |
" cell_list = row.find_all(\"td\")\n", | |
" ip = cell_list[0].string\n", | |
" port = cell_list[1].string\n", | |
"\n", | |
" # Add it to our list\n", | |
" proxies.add(\"{}:{}\".format(ip, port))\n", | |
"\n", | |
" # Return the list\n", | |
" return proxies" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"proxy_list = get_proxies()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'103.204.210.112:8080',\n", | |
" '103.240.109.171:53281',\n", | |
" '103.42.253.218:8080',\n", | |
" '103.57.71.109:53281',\n", | |
" '110.77.188.103:62225',\n", | |
" '110.77.239.83:42619',\n", | |
" '111.67.71.238:53281',\n", | |
" '114.134.187.162:53281',\n", | |
" '121.166.157.33:8080',\n", | |
" '121.52.157.23:8080',\n", | |
" '138.186.21.86:53281',\n", | |
" '138.204.142.139:31773',\n", | |
" '139.5.153.86:53281',\n", | |
" '145.249.105.25:8118',\n", | |
" '145.255.28.218:53281',\n", | |
" '160.119.153.206:13093',\n", | |
" '170.84.51.74:53281',\n", | |
" '177.206.131.128:53281',\n", | |
" '177.67.217.14:53281',\n", | |
" '178.176.28.164:8080',\n", | |
" '179.191.87.158:53281',\n", | |
" '181.112.145.222:53281',\n", | |
" '181.112.34.222:53281',\n", | |
" '181.112.46.250:53281',\n", | |
" '181.192.30.222:53281',\n", | |
" '182.253.130.174:53281',\n", | |
" '182.253.37.116:3128',\n", | |
" '186.46.90.50:53281',\n", | |
" '188.126.63.203:41258',\n", | |
" '189.43.88.18:53281',\n", | |
" '190.128.158.54:53281',\n", | |
" '192.141.118.255:53281',\n", | |
" '193.107.247.98:53281',\n", | |
" '200.58.214.114:8080',\n", | |
" '201.166.181.8:53281',\n", | |
" '202.142.164.22:53281',\n", | |
" '213.192.75.138:53281',\n", | |
" '27.255.40.63:8080',\n", | |
" '31.41.89.73:41258',\n", | |
" '36.83.72.178:80',\n", | |
" '37.60.215.133:53281',\n", | |
" '38.123.68.72:8080',\n", | |
" '5.228.166.234:53281',\n", | |
" '5.9.70.215:808',\n", | |
" '62.213.14.166:8080',\n", | |
" '77.85.169.2:8080',\n", | |
" '78.156.49.26:41258',\n", | |
" '78.189.65.220:8080',\n", | |
" '80.254.102.220:3128',\n", | |
" '81.163.50.192:41258',\n", | |
" '81.30.216.147:41258',\n", | |
" '81.95.139.186:53281',\n", | |
" '85.117.77.75:53281',\n", | |
" '89.110.59.227:8080',\n", | |
" '89.255.71.162:53281',\n", | |
" '89.43.38.32:8080',\n", | |
" '91.224.63.218:8080',\n", | |
" '91.230.252.163:3128',\n", | |
" '92.247.93.142:8080',\n", | |
" '95.47.83.56:44331'}" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"proxy_list" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Convert it into a pool that will randomly return items forever" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"proxy_pool = itertools.cycle(proxy_list)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'89.43.38.32:8080'" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"next(proxy_pool)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'138.186.21.86:53281'" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"next(proxy_pool)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'91.230.252.163:3128'" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"next(proxy_pool)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Create a similar pool of user agents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"useragent_list = [\n", | |
" # Chrome\n", | |
" 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',\n", | |
" 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',\n", | |
" 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',\n", | |
" 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',\n", | |
" 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',\n", | |
" 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',\n", | |
" 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',\n", | |
" 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',\n", | |
" 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',\n", | |
" 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',\n", | |
" # Firefox\n", | |
" 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',\n", | |
" 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',\n", | |
" 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',\n", | |
" 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',\n", | |
" 'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',\n", | |
" 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',\n", | |
" 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',\n", | |
" 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',\n", | |
" 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',\n", | |
" 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',\n", | |
" 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',\n", | |
" 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',\n", | |
" 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"useragent_pool = itertools.cycle(useragent_list)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"next(useragent_pool)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<itertools.cycle at 0x7f8acc3d01b8>" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"next(useragent_pool)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"next(useragent_pool)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Request a URL using a random proxy and a random user agent" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def get_url(url):\n", | |
" \"\"\"\n", | |
" Returns the response from a URL, retries if it fails.\n", | |
" \"\"\"\n", | |
" # Get the proxy\n", | |
" proxy = next(proxy_pool)\n", | |
" \n", | |
" # Get the user agent\n", | |
" useragent = next(useragent_pool)\n", | |
" \n", | |
" # Log\n", | |
" print(\"Making a GET request for {} with proxy {} and user agent {}\".format(url, proxy, useragent))\n", | |
" \n", | |
" # Go get it\n", | |
" return requests.get(url, proxies={\"http\": proxy, \"https\": proxy}, headers={'User-Agent': useragent})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Making a GET request for http://cal-access.sos.ca.gov/Campaign/Measures/list.aspx?session=2015 with proxy 91.224.63.218:8080 and user agent Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36\n" | |
] | |
} | |
], | |
"source": [ | |
"r = get_url(\"http://cal-access.sos.ca.gov/Campaign/Measures/list.aspx?session=2015\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"403" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"r.status_code" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u'<html style=\"height:100%\"><head><META NAME=\"ROBOTS\" CONTENT=\"NOINDEX, NOFOLLOW\"><meta name=\"format-detection\" content=\"telephone=no\"><meta name=\"viewport\" content=\"initial-scale=1.0\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\"><script type=\"text/javascript\" src=\"/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3\"></script></head><body style=\"margin:0px;height:100%\"><iframe src=\"/_Incapsula_Resource?CWUDNSAI=1&xinfo=10-85779505-0%200NNN%20RT%281532808702515%207%29%20q%280%20-1%20-1%200%29%20r%280%20-1%29%20B16%284%2c312%2c0%29%20U18&incident_id=539032060125820800-325878285682329706&edet=16&cinfo=04000000\" frameborder=0 width=\"100%\" height=\"100%\" marginheight=\"0px\" marginwidth=\"0px\">Request unsuccessful. Incapsula incident ID: 539032060125820800-325878285682329706</iframe></body></html>'" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"r.text" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment