-
-
Save Lowell130/52bbaca1937fc9108c4dcb6731456a71 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import time | |
# Browser-like User-Agent header passed with the search request.
USER_AGENT = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}
def fetch_results(search_term, number_results, language_code, timeout=10):
    """Download the Google results page for a single query.

    Args:
        search_term: Query text; must be a ``str``.
        number_results: Number of results to request (sent as ``num``);
            must be an ``int``.
        language_code: Interface language sent as ``hl`` (e.g. ``"en"``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(search_term, html)`` tuple: the query exactly as given and the
        text of the response body.

    Raises:
        AssertionError: If ``search_term`` is not a string or
            ``number_results`` is not an integer.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Raise explicitly rather than with ``assert`` so the checks survive
    # ``python -O``; the exception type and messages are unchanged.
    if not isinstance(search_term, str):
        raise AssertionError('Search term must be a string')
    if not isinstance(number_results, int):
        raise AssertionError('Number of results must be an integer')

    # Let requests build the query string: spaces still become '+', and
    # characters such as '&', '#' or '+' inside the term are percent-encoded
    # instead of corrupting the URL.
    response = requests.get(
        'https://www.google.com/search',
        params={'q': search_term, 'num': number_results, 'hl': language_code},
        headers=USER_AGENT,
        timeout=timeout,  # without a timeout the call can block forever
    )
    response.raise_for_status()
    return search_term, response.text
def parse_results(html, keyword):
    """Extract ranked results from the HTML of a Google results page.

    Each ``<div class="g">`` is treated as one result block. A block is kept
    only if it contains an ``<a href=...>`` and an ``<h3 class="r">`` title,
    and its link is not the placeholder ``'#'``.

    Args:
        html: Markup of the results page.
        keyword: Search term, copied into every result row.

    Returns:
        A list of dicts with the keys ``'keyword'``, ``'rank'``, ``'title'``
        and ``'description'``, in page order. ``rank`` starts at 1 and only
        advances for blocks that are kept. ``description`` is ``None`` when
        the block has no ``<span class="st">`` snippet.
    """
    soup = BeautifulSoup(html, 'html.parser')
    found_results = []
    rank = 1
    for result in soup.find_all('div', attrs={'class': 'g'}):
        link = result.find('a', href=True)
        title = result.find('h3', attrs={'class': 'r'})
        if not (link and title):
            continue
        if link['href'] == '#':
            continue
        description = result.find('span', attrs={'class': 'st'})
        found_results.append({
            'keyword': keyword,
            'rank': rank,
            'title': title.get_text(),
            # Some result blocks carry no snippet; calling get_text() on the
            # missing tag would raise AttributeError and lose the whole page.
            'description': description.get_text() if description is not None else None,
        })
        rank += 1
    return found_results
def scrape_google(search_term, number_results, language_code):
    """Fetch and parse the Google results for one search term.

    Args:
        search_term: Query text.
        number_results: Number of results to request.
        language_code: Interface language code (e.g. ``"en"``).

    Returns:
        The list of result dicts produced by ``parse_results``.

    Raises:
        Exception: With a human-readable message when the arguments are
            rejected, the server returns an HTTP error status, or the request
            fails. The underlying error is attached as ``__cause__``.
    """
    try:
        keyword, html = fetch_results(search_term, number_results, language_code)
        return parse_results(html, keyword)
    except AssertionError as err:
        raise Exception("Incorrect arguments parsed to function") from err
    # HTTPError must be handled before its base class RequestException.
    except requests.HTTPError as err:
        raise Exception("You appear to have been blocked by Google") from err
    except requests.RequestException as err:
        raise Exception("Appears to be an issue with your connection") from err
if __name__ == '__main__':
    # Demo run: scrape a few sample queries and print every collected row.
    keywords = ['edmund martin', 'python', 'google scraping']
    data = []
    for index, keyword in enumerate(keywords):
        if index:
            # Throttle between consecutive requests only; there is nothing
            # to wait for before the first query or after the last one.
            time.sleep(10)
        try:
            data.extend(scrape_google(keyword, 100, "en"))
        except Exception as e:
            # Report the failure and carry on with the remaining keywords.
            print(e)
    print(data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment