Get a list of proxy User-Agent strings (useful for User-Agent rotation during scraping)
from bs4 import BeautifulSoup
import random
import requests

USER_AGENT_SCRAPER_BASE_URL = 'http://www.useragentstring.com/pages/useragentstring.php?name='
POPULAR_BROWSERS = ['Chrome', 'Firefox', 'Mozilla', 'Safari', 'Opera', 'Opera Mini', 'Edge', 'Internet Explorer']


def get_user_agent_strings_for_this_browser(browser):
    """
    Get the latest User-Agent strings for the given browser.
    :param browser: name of the browser as a string
    :return: list of User-Agent strings for the given browser
    """
    url = USER_AGENT_SCRAPER_BASE_URL + browser
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    user_agent_links = soup.find('div', {'id': 'liste'}).find_all('a')[:20]
    return [str(user_agent.text) for user_agent in user_agent_links]


def get_user_agents():
    """
    Gather a list of active User-Agent strings for some popular browsers
    from http://www.useragentstring.com.
    :return: list of User-Agent strings
    """
    user_agents = []
    for browser in POPULAR_BROWSERS:
        user_agents.extend(get_user_agent_strings_for_this_browser(browser))
    return user_agents[3:]  # Remove the first 3 Google header texts from Chrome's user agents


proxy_user_agents = get_user_agents()

# Randomly select a User-Agent from the collected User-Agent strings
random_user_agent = random.choice(proxy_user_agents)
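
As a minimal sketch of how the rotated User-Agent could then be used during scraping, it can be passed in the request headers; the target URL below is only a placeholder, not part of the original gist:

headers = {'User-Agent': random_user_agent}
# 'https://example.com' is a placeholder; substitute the site being scraped
page = requests.get('https://example.com', headers=headers)
print(page.status_code)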