Created
August 28, 2022 14:49
-
-
Save gamingflexer/a87b2ffab48ae2bc1c9d80d2cd70ff4f to your computer and use it in GitHub Desktop.
Google Fact Check | Web Scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import urllib | |
import pandas as pd | |
from requests_html import HTML | |
from requests_html import HTMLSession | |
def get_source(url):
    """Return the HTTP response for the provided URL.

    Args:
        url (str): URL of the page to scrape.

    Returns:
        requests_html.HTMLResponse or None: the response object, or None
        when the request fails (the error is printed). Callers must check
        for None before dereferencing ``response.html``.
    """
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        # Best-effort: report the failure and make the None return explicit
        # instead of falling off the end of the function silently.
        print(e)
        return None
def scrape_google(query):
    """Return all non-Google result links from a Google search for *query*.

    Args:
        query (str): search terms (URL-encoded before the request).

    Returns:
        list[str]: absolute URLs found on the results page, excluding
        Google's own domains (cache, policies, support, maps, ...).
        Returns an empty list when the page could not be fetched.
    """
    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)
    if response is None:
        # get_source prints the error and returns None on network failure;
        # without this guard the next line raised AttributeError.
        return []

    google_domains = ('https://www.google.',
                      'https://google.',
                      'https://webcache.googleusercontent.',
                      'http://webcache.googleusercontent.',
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.')
    # str.startswith accepts a tuple of prefixes; a comprehension filter
    # replaces the remove-while-iterating-a-copy pattern.
    return [url for url in response.html.absolute_links
            if not url.startswith(google_domains)]
def get_results(query):
    """Fetch the Google results page for *query* and return the raw response."""
    encoded = urllib.parse.quote_plus(query)
    return get_source("https://www.google.co.uk/search?q=" + encoded)
def parse_results(response):
    """Extract structured search results from a Google SERP response.

    Args:
        response: response object (as returned by get_source) whose
            ``.html`` attribute supports CSS-selector ``find``.

    Returns:
        list[dict]: one ``{'title', 'link', 'text'}`` dict per complete
        result. Results missing any of the three sub-elements are skipped
        instead of raising AttributeError — Google's markup is not always
        uniform, and ``find(..., first=True)`` returns None on a miss.
    """
    css_identifier_result = ".tF2Cxc"
    css_identifier_title = "h3"
    css_identifier_link = ".yuRUbf a"
    css_identifier_text = ".VwiC3b"

    output = []
    for result in response.html.find(css_identifier_result):
        title = result.find(css_identifier_title, first=True)
        link = result.find(css_identifier_link, first=True)
        text = result.find(css_identifier_text, first=True)
        if title is None or link is None or text is None:
            # Partial result block: skip rather than crash on .text/.attrs.
            continue
        output.append({
            'title': title.text,
            'link': link.attrs['href'],
            'text': text.text,
        })
    return output
def google_search(query):
    """Run *query* through Google and return the parsed result dicts."""
    return parse_results(get_results(query))
# results = google_search("web scraping") | |
# results | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment