Last active
July 31, 2020 08:05
-
-
Save yeiichi/401115f869fcfc8b5102f2c8fa36eea1 to your computer and use it in GitHub Desktop.
Download a Google Results page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from urllib.parse import quote_plus, urlunsplit | |
import requests | |
import re | |
PROJECT_ROOT_PATH = '.' | |
class GoogleResultsPage:
    '''Build a Google search URL for a query and fetch the results page.

    Args:
        query: Search text; URL-encoded with ``quote_plus``.
        rslts_num: Requested results per page; converted with ``str``,
            URL-encoded and sent as the ``num`` query parameter.
    '''

    # Browser-like User-Agent sent with every request. Adjacent string
    # literals are used instead of backslash continuations inside a single
    # literal, so that neither source indentation nor missing separators
    # end up in the header value.
    USER_AGENT = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.105 Safari/537.36'
    )

    # Timeout passed to requests.get as a (connect, read) pair, in seconds.
    TIMEOUT = (3.05, 27)

    def __init__(self, query, rslts_num):
        self.__qry = query
        self.__num = rslts_num
        # Encode both values: an unencoded `num` such as "5&safe=off" would
        # otherwise inject an extra parameter into the query string.
        query_string = ('q=' + quote_plus(self.__qry)
                        + '&num=' + quote_plus(str(self.__num)))
        self.__sstr = urlunsplit(
            ('https', 'www.google.com', '/search', query_string, ''))

    def page_fetcher(self):
        '''Fetch the results page and return the response body as text.

        The HTTP status is not checked: whatever body the server sends is
        returned. Exceptions raised by ``requests.get`` propagate to the
        caller.
        '''
        response = requests.get(
            self.__sstr,
            headers={'user-agent': self.USER_AGENT},
            timeout=self.TIMEOUT,
        )
        return response.text
################################ | |
# Output to a file. | |
def html_to_file(html_text, query=None):
    '''Write HTML text to "<sanitized query>.html" under PROJECT_ROOT_PATH.

    Args:
        html_text: HTML document text to save.
        query: Search text the file name is derived from. When omitted,
            the module-level ``query`` assigned by the script entry point
            is used, so the one-argument call keeps working.

    Raises:
        ValueError: if ``query`` is omitted and no module-level ``query``
            has been set.
    '''
    if query is None:
        # Backward-compatible fallback to the global the script defines.
        query = globals().get('query')
        if query is None:
            raise ValueError(
                'html_to_file() needs a query to build the file name')
    # Replace characters that are unsafe in file names (including the
    # backslash, which the previous pattern did not match) with "_".
    output_file_name = re.sub(r'[\\/.:;*?"<>| ]', r'_', query) + '.html'
    output_file_path = os.path.join(PROJECT_ROOT_PATH, output_file_name)
    # Explicit UTF-8 so non-ASCII page content does not depend on the
    # platform's default encoding.
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(html_text)
    print('Done! ', end='')
    print('File path:', output_file_path)
if __name__ == '__main__':
    # Interactive entry point: ask for the search text and page size,
    # download the results page, then save it as an HTML file.
    # `query` stays a module-level name because html_to_file() reads it
    # to build the output file name.
    query = input('Query? >> ')
    rslts_num = input('Results per page (upto 100)? >> ')
    results_page = GoogleResultsPage(query, rslts_num)
    html_text = results_page.page_fetcher()
    html_to_file(html_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment