Skip to content

Instantly share code, notes, and snippets.

@6aditya8
Created August 27, 2019 15:55
Show Gist options
  • Save 6aditya8/c8ff33d6fc0c11de839bd9facf175cb6 to your computer and use it in GitHub Desktop.
Save 6aditya8/c8ff33d6fc0c11de839bd9facf175cb6 to your computer and use it in GitHub Desktop.
Get a list of User-Agent strings (useful for rotating the User-Agent header during scraping)
from bs4 import BeautifulSoup
import random
import requests
# Per-browser listing page on useragentstring.com; the browser name is
# appended directly as the value of the `name` query parameter.
USER_AGENT_SCRAPER_BASE_URL = 'http://www.useragentstring.com/pages/useragentstring.php?name='
# Browsers whose User-Agent strings get scraped by get_user_agents().
POPULAR_BROWSERS = ['Chrome', 'Firefox', 'Mozilla', 'Safari', 'Opera', 'Opera Mini', 'Edge', 'Internet Explorer']
def get_user_agent_strings_for_this_browser(browser, limit=20, timeout=10):
    """
    Get the latest User-Agent strings of the given Browser.

    :param browser: string of given Browser (e.g. 'Chrome')
    :param limit: maximum number of User-Agent strings to return (default 20,
                  matching the original hard-coded cap)
    :param timeout: seconds to wait for the HTTP response (default 10)
    :return: list of User-Agent strings of the given Browser; empty list if
             the expected listing element is missing from the page
    :raises requests.HTTPError: if the server responds with a 4xx/5xx status
    :raises requests.Timeout: if no response arrives within `timeout` seconds
    """
    url = USER_AGENT_SCRAPER_BASE_URL + browser
    # A timeout bounds the request so a stalled server cannot hang the scraper.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # The site lists one <a> per User-Agent inside <div id="liste">.
    listing = soup.find('div', {'id': 'liste'})
    if listing is None:
        # Layout changed or the browser name produced no listing; the original
        # would crash with AttributeError here — return empty instead.
        return []
    return [link.text for link in listing.findAll('a')[:limit]]
def get_user_agents():
    """
    Gather a list of some active User-Agent strings from
    http://www.useragentstring.com of some of the Popular Browsers.

    :return: list of User-Agent strings
    """
    collected = [
        agent
        for name in POPULAR_BROWSERS
        for agent in get_user_agent_strings_for_this_browser(name)
    ]
    # Drop the first 3 entries: Google header texts from Chrome's listing.
    return collected[3:]
# NOTE(review): populated at import time — this issues one HTTP request per
# entry in POPULAR_BROWSERS as a module-level side effect.
proxy_user_agents = get_user_agents()
# Randomly select one User-Agent from the collected user-agent strings.
random_user_agent = random.choice(proxy_user_agents)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment