This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look up the single element that stores the listing prices; BeautifulSoup's
# find() returns None when no element with that id exists on the page.
price = page_content.find(id='listings_prices')
# check if the element with such id exists or not
if price is None:
    # NOTIFY! LOG IT, COUNT IT — a missing element usually means the
    # site's markup changed and the scraper needs updating.
    pass
else:
    # do something with the extracted price element
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# library to generate user agent strings (third-party: `user_agent` package)
from user_agent import generate_user_agent

# Generate a random desktop user agent (mac or linux) so the request looks
# like it comes from a regular browser instead of a script.
headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
# Hard-coded alternative if you prefer a fixed user agent:
# headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.63 Safari/537.36'}
page_response = requests.get(page_link, timeout=5, headers=headers)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# timeout is set to 5 seconds so a stalled server cannot hang the crawler;
# requests raises requests.Timeout when the limit is exceeded.
page_response = requests.get(page_link, timeout=5, headers=headers)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import multiprocessing as multi | |
def chunks(n, page_list):
    """Split *page_list* into *n* roughly equal chunks.

    Parameters
    ----------
    n : int
        Number of chunks to produce (typically the worker/CPU count).
    page_list : sequence
        Items (e.g. page URLs) to distribute across the chunks.

    Returns
    -------
    list of numpy.ndarray
        ``n`` sub-arrays whose concatenation equals ``page_list``;
        sizes differ by at most one element.
    """
    return np.array_split(page_list, n)
# One worker per available CPU core; `workers` collects the Process handles.
cpus = multi.cpu_count()
workers = []
page_list = ['www.website.com/page1.html', 'www.website.com/page2.html' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try:
    # Only the network call (and the status handling) can raise here;
    # the 5-second timeout triggers requests.Timeout on a stalled server.
    page_response = requests.get(page_link, timeout=5)
    if page_response.status_code == 200:
        # extract the data from page_response here
        pass
    else:
        # Non-200 status: surface it so the page can be retried later.
        print(page_response.status_code)
        # notify, try again
except requests.Timeout as e:
    print("It is time to timeout")
    print(str(e))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup
import requests

page_link = 'https://www.website_to_crawl.com'
# fetch the content from url (5-second timeout so the request cannot hang)
page_response = requests.get(page_link, timeout=5)
# parse html with the stdlib-backed "html.parser" backend
page_content = BeautifulSoup(page_response.content, "html.parser")
# extract all html elements where price is stored
prices = page_content.find_all(class_='main_price')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Route traffic through per-scheme proxies; the addresses here are
# placeholders — substitute real proxy host:port values before use.
proxies = {'http': 'http://10.10.0.0:0000',
           'https': 'http://120.10.0.0:0000'}
page_response = requests.get(page_link, proxies=proxies, timeout=5)