Last active
July 3, 2020 00:32
-
-
Save yeiichi/7322bcfd82c80a505d011e8815d53508 to your computer and use it in GitHub Desktop.
Fetch page data using User-Agent information.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Version 1.0.1 | |
# 2020-07-03 | |
import requests | |
import random | |
from bs4 import BeautifulSoup | |
# User agent definition:
# You can check your own User-Agent string at ifconfig.me.
#
# Each value is built with implicit string concatenation instead of a
# backslash continuation inside the literal, so the header value can never
# pick up stray whitespace from the indentation of a continuation line.
UA_LIST = {
    'SAFARI': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.1 Safari/605.1.15'
    ),
    'FIREFOX': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:76.0) '
        'Gecko/20100101 Firefox/76.0'
    ),
    'CHROME': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.138 Safari/537.36'
    ),
}

# The browser identity is picked once, when the module is loaded. The choice
# is drawn from the dict's own keys so that adding or removing an entry in
# UA_LIST cannot leave the candidate list out of sync.
my_ua = random.choice(list(UA_LIST))

# Request headers carrying the User-Agent string of the chosen browser.
my_headers = {'user-agent': UA_LIST[my_ua]}
class HtmlFetcher:
    """Fetch a web page with a browser-like User-Agent and parse it.

    Attributes:
        url: Address of the page to fetch.
        encoding: Text encoding used to decode the response body. An empty
            value means "do not override"; the encoding detected by
            ``requests`` is used instead.
        user_agent: Name of the browser identity (the module-level ``my_ua``)
            whose User-Agent string is sent with the request.
    """

    def __init__(self, target_url, encoding):
        self.url = target_url
        self.encoding = encoding
        self.user_agent = my_ua

    def soup(self):
        """Fetch ``self.url`` and return the page parsed with the lxml parser.

        Returns:
            A ``BeautifulSoup`` object built from the response text, or
            ``None`` when the request fails (the error is printed).
        """
        try:
            # timeout is a (connect, read) pair in seconds.
            response = requests.get(self.url, headers=my_headers,
                                    timeout=(3.05, 27))
        except requests.RequestException as exc:
            # Without this early return the code below would touch an
            # unbound ``response`` and fail with UnboundLocalError, hiding
            # the real cause of the failure.
            print(f'Error: {exc}')
            return None

        # Only override the detected encoding when the caller supplied one;
        # an empty string is not a valid codec name.
        if self.encoding:
            response.encoding = self.encoding

        return BeautifulSoup(response.text, 'lxml')
if __name__ == '__main__':
    # Ask for the page address and the encoding to decode it with.
    page_url = input('URL? >> ')
    page_encoding = input('Encoding? >> ')
    fetcher = HtmlFetcher(page_url, page_encoding)

    # Gather the label/value pairs first, then hand them to a single print()
    # call, which joins them with single spaces.
    report = (
        '\nURL: ', fetcher.url,
        '\nUA : ', fetcher.user_agent,
        '\nEncoding used: ', fetcher.encoding,
        '\nSoup:\n', fetcher.soup(),
    )
    print(*report)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment