This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look up the single element that stores the listing prices; BeautifulSoup's
# find() returns None when no element with that id exists on the page.
price = page_content.find(id='listings_prices')
# check if the element with such id exists or not
if price is None:
    # NOTIFY! LOG IT, COUNT IT — a missing element usually means the
    # site's markup changed and the scraper needs updating.
    pass
else:
    # do something with the extracted price element
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# library to generate user agent strings (third-party: `user_agent` package)
from user_agent import generate_user_agent

# Generate a random desktop user agent (mac or linux) so the request looks
# like it comes from a regular browser instead of a script.
headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
# Hard-coded alternative if you prefer a fixed user agent:
# headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.63 Safari/537.36'}
page_response = requests.get(page_link, timeout=5, headers=headers)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# timeout is set to 5 seconds so a stalled server cannot hang the crawler;
# requests raises requests.Timeout when the limit is exceeded.
page_response = requests.get(page_link, timeout=5, headers=headers)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import multiprocessing as multi | |
def chunks(n, page_list):
    """Split *page_list* into *n* roughly equal chunks.

    Parameters
    ----------
    n : int
        Number of chunks to produce (typically the worker/CPU count).
    page_list : sequence
        Items (e.g. page URLs) to distribute across the chunks.

    Returns
    -------
    list of numpy.ndarray
        ``n`` sub-arrays whose concatenation equals ``page_list``;
        sizes differ by at most one element.
    """
    return np.array_split(page_list, n)
# One worker per available CPU core; `workers` collects the Process handles.
cpus = multi.cpu_count()
workers = []
page_list = ['www.website.com/page1.html', 'www.website.com/page2.html' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try:
    # Only the network call (and the status handling) can raise here;
    # the 5-second timeout triggers requests.Timeout on a stalled server.
    page_response = requests.get(page_link, timeout=5)
    if page_response.status_code == 200:
        # extract the data from page_response here
        pass
    else:
        # Non-200 status: surface it so the page can be retried later.
        print(page_response.status_code)
        # notify, try again
except requests.Timeout as e:
    print("It is time to timeout")
    print(str(e))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup
import requests

page_link = 'https://www.website_to_crawl.com'
# fetch the content from url (5-second timeout so the request cannot hang)
page_response = requests.get(page_link, timeout=5)
# parse html with the stdlib-backed "html.parser" backend
page_content = BeautifulSoup(page_response.content, "html.parser")
# extract all html elements where price is stored
prices = page_content.find_all(class_='main_price')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Route traffic through per-scheme proxies; the addresses here are
# placeholders — substitute real proxy host:port values before use.
proxies = {'http': 'http://10.10.0.0:0000',
           'https': 'http://120.10.0.0:0000'}
page_response = requests.get(page_link, proxies=proxies, timeout=5)