Last active
August 20, 2020 15:53
-
-
Save EdmundMartin/0230e90cbd4e0790afdd22318fa0ad8b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'} | |
def fetch_results(search_term, number_results, language_code):
    """Fetch the raw HTML of a Google search results page.

    Args:
        search_term: Query string to search for.
        number_results: Number of results to request (``num`` URL parameter).
        language_code: Interface language for the results (``hl`` parameter).

    Returns:
        A ``(search_term, html_text)`` tuple.

    Raises:
        AssertionError: If the arguments have the wrong types.  Raised
            explicitly (not via ``assert``) so validation still runs when
            Python is started with ``-O``; callers catch ``AssertionError``.
        requests.HTTPError: On a non-2xx response (e.g. when blocked).
        requests.RequestException: On connection-level failures.
    """
    if not isinstance(search_term, str):
        raise AssertionError('Search term must be a string')
    if not isinstance(number_results, int):
        raise AssertionError('Number of results must be an integer')
    # quote_plus escapes spaces AND reserved URL characters (&, #, ?, ...);
    # the previous ' ' -> '+' replacement left those unescaped, corrupting
    # the query string for terms that contain them.
    escaped_search_term = urllib.parse.quote_plus(search_term)
    google_url = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
        escaped_search_term, number_results, language_code)
    response = requests.get(google_url, headers=USER_AGENT)
    response.raise_for_status()
    return search_term, response.text
def parse_results(html, keyword):
    """Parse a Google results page into a list of result dicts.

    Each dict has the keys ``keyword``, ``rank``, ``title``,
    ``description`` (``None`` when the snippet is missing) and ``link``.
    Results without both a link and a title, or with a ``'#'`` href, are
    skipped and do not consume a rank.
    """
    soup = BeautifulSoup(html, 'html.parser')
    parsed = []
    rank = 1
    # NOTE(review): these CSS classes match an older Google layout; if the
    # markup has changed since, this returns an empty list.
    for block in soup.find_all('div', attrs={'class': 'g'}):
        anchor = block.find('a', href=True)
        heading = block.find('h3', attrs={'class': 'r'})
        snippet = block.find('span', attrs={'class': 'st'})
        if not (anchor and heading):
            continue
        url = anchor['href']
        if url == '#':
            continue
        parsed.append({
            'keyword': keyword,
            'rank': rank,
            'title': heading.get_text(),
            'description': snippet.get_text() if snippet else snippet,
            'link': url,
        })
        rank += 1
    return parsed
def scrape_google(search_term, number_results, language_code):
    """Scrape Google for *search_term* and return parsed result dicts.

    Args:
        search_term: Query string to search for.
        number_results: Number of results to request.
        language_code: Interface language code (e.g. ``"en"``).

    Returns:
        A list of result dicts as produced by :func:`parse_results`.

    Raises:
        Exception: With a human-readable message for bad arguments,
            an HTTP error (typically a Google block), or a connection
            failure.  The original exception is chained as ``__cause__``.
    """
    # Keep the try body to the one call that raises these exceptions;
    # chain each re-raise so the original traceback is preserved.
    try:
        keyword, html = fetch_results(search_term, number_results, language_code)
    except AssertionError as err:
        raise Exception("Incorrect arguments passed to function") from err
    except requests.HTTPError as err:
        raise Exception("You appear to have been blocked by Google") from err
    except requests.RequestException as err:
        raise Exception("Appears to be an issue with your connection") from err
    return parse_results(html, keyword)
if __name__ == '__main__':
    keywords = ['edmund martin', 'python', 'google scraping']
    data = []
    for keyword in keywords:
        try:
            data.extend(scrape_google(keyword, 100, "en"))
        except Exception as exc:
            # Report the failure for this keyword but keep scraping the rest.
            print(exc)
        finally:
            # Pause between queries so requests are spaced out.
            time.sleep(10)
    print(data)
I am very new to this, programming and all, but I like the idea of this project and I am trying to expand on it. Anyway, when I ran this the first time, it worked. However, running it multiple times gave me the response that I've been blocked by Google. Is that because I need to use Google's API to scrape search results? Thanks for your help!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@EdmundMartin thanks for this. I am still getting an empty result, `[]`. I see you mentioned that Google has updated its selectors. What have they been updated to?