brianraila · October 18, 2018 02:26
diff --git a/crawler2.py b/crawler2.py
 # used cars

 import requests
 import csv
 from bs4 import BeautifulSoup


 # Detail page of a single listing; formatted with the ID and DL codes taken
 # from a listing link.
 USED_CARS_URL = 'https://www.sgcarmart.com/used_cars/info.php?ID={}&DL={}'


 # Search-result page; formatted with the result offset (BRSR) and the
 # vehicle-type code (VEH). Each parameter needs its own '&' separator,
 # otherwise the VEH part is fused into the BRSR value.
 url = 'https://www.sgcarmart.com/used_cars/listing.php?BRSR={}&VEH={}&RPG=100&AVL=2'


 def get_links(page_url):
 	"""Download *page_url* and return every ``<a>`` element found in it.

 	The page is parsed with the ``lxml`` parser and a progress line is
 	printed once the download has completed.
 	"""
 	page_html = requests.get(page_url).text
 	print('got page')
 	return BeautifulSoup(page_html, 'lxml').find_all('a')


 def get_codes(links):
 	"""Extract ``[ID, DL]`` pairs from anchors that point at ``info.php``.

 	Args:
 		links: Iterable of objects exposing ``get('href')``, such as the
 			anchors returned by ``get_links``.

 	Returns:
 		A list of ``[ID, DL]`` string pairs in input order. ``ID`` is the
 		value of the first query parameter and ``DL`` the value of the
 		second; ``DL`` falls back to ``'8000'`` when the second parameter
 		is missing or carries no ``=``. Links without an href, without
 		``info.php`` in it, or without a readable ID are skipped.
 	"""
 	codes = []

 	for link in links:
 		# Anchors without an href yield None, which would make the
 		# substring test below raise TypeError.
 		href = link.get('href') or ''
 		if 'info.php' not in href:
 			continue

 		pieces = href.split('?')
 		if len(pieces) < 2:
 			# No query string, hence no ID to read.
 			continue
 		params = pieces[1].split('&')

 		try:
 			listing_id = params[0].split('=')[1]
 		except IndexError:
 			# First parameter has no '=': nothing usable to fetch later.
 			continue

 		try:
 			dl = params[1].split('=')[1]
 		except IndexError:
 			dl = '8000'

 		codes.append([listing_id, dl])

 	return codes



 def _read_field(soup, label, default='', as_text=False, drop=None):
 	"""Return the first child of the ``<td>`` that follows the text *label*.

 	Args:
 		soup: Parsed page to search.
 		label: Exact text of the node that precedes the wanted cell.
 		default: Value returned when the label or its cell cannot be read.
 		as_text: When true, return the ``.text`` of the first child instead
 			of the child itself.
 		drop: Optional substring removed from the value before returning.
 	"""
 	try:
 		value = soup.find(string=label).find_next('td').contents[0]
 		if as_text:
 			value = value.text
 		if drop is not None:
 			# str.replace returns a new string, so the result is rebound.
 			value = value.replace(drop, '')
 	except (AttributeError, IndexError, TypeError):
 		# A missing label, missing cell, empty cell or non-text child all
 		# surface as one of these somewhere along the lookup chain.
 		return default
 	return value


 def fetch_and_save(codes, vehicle_type):
 	"""Fetch listing detail pages and append one CSV row per listing.

 	Only every other entry of *codes* (indices 0, 2, 4, ...) is fetched.
 	NOTE(review): presumably each listing is linked twice on a result page
 	and this skips the duplicate -- confirm against the live markup.

 	Args:
 		codes: Sequence of ``[ID, DL]`` pairs as produced by ``get_codes``.
 		vehicle_type: Category label written to the second CSV column.

 	Returns:
 		None. Rows are appended to ``task2.csv`` in the column order
 		vehicle, vehicle_type, posted, owner_type, price, dep, reg_date,
 		man_f, mil, road_tax, coe, omv, arf. Listings that cannot be
 		downloaded or parsed are reported on stdout and skipped.
 	"""
 	filler = '\r\n\t\t\t\t\t\t\t\t'

 	for index, code in enumerate(codes):
 		if index % 2:
 			continue

 		listing_url = USED_CARS_URL.format(code[0], code[1])
 		# A DL code of 1000 marks a direct-owner listing; anything else is
 		# treated as a dealer listing.
 		owner_type = "Dealer"
 		if int(code[1]) == 1000:
 			owner_type = "Direct Owner Sale"

 		try:
 			# The HTML is parsed exactly as received: deleting \r, \t and \n
 			# from raw markup could fuse a tag name with an attribute that
 			# is written on the following line.
 			response = requests.get(listing_url, timeout=1).text
 			soup = BeautifulSoup(response, 'lxml')
 			vehicle = soup.find("a", {"class": "link_redbanner"}).text

 			price = _read_field(soup, "Price", as_text=True)
 			dep = _read_field(soup, "Depreciation", as_text=True)
 			reg_date = _read_field(soup, "Reg Date", drop=filler)
 			man_f = _read_field(soup, "Manufactured", drop=filler)
 			mil = _read_field(soup, "Mileage")
 			road_tax = _read_field(soup, "Road Tax", default='-')
 			coe = _read_field(soup, "COE")
 			arf = _read_field(soup, "ARF")
 			omv = _read_field(soup, "OMV")

 			# Keep the part before the first '|', then what follows its
 			# first ':'.
 			posted = str(soup.find("div", {"id": "usedcar_postdate"}).text)
 			posted = posted.split('|')[0].split(':')[1]

 			row = [vehicle, vehicle_type, posted, owner_type,
 				price, dep, reg_date, man_f, mil, road_tax, coe, omv, arf]

 			print(row)
 			# newline='' lets the csv module control line endings itself.
 			with open('task2.csv', 'a', newline='') as csv_file:
 				csv.writer(csv_file).writerow(row)
 				print('Saved {}'.format(vehicle))

 		except requests.exceptions.Timeout:
 			print("Taking too long...skipped")
 		except Exception as exc:
 			# Best effort: one bad listing must not abort the whole crawl,
 			# but the real reason is reported instead of a generic timeout.
 			print("Skipped {}: {!r}".format(listing_url, exc))

 # Flat run of (VEH code, category label, limit) triples. The limit is divided
 # by 100 to get the number of result pages requested for the category
 # (presumably an approximate listing count -- confirm against the site).
 categories = [
 	7, 'Station Wagon', 200,
 	13, 'Mid Sized Sedan', 1900,
 	12, 'Luxury Sedan', 2600,
 	11, 'Hatchback', 1400,
 	10, 'MPV', 1500,
 	9, 'SUV', 1900,
 	8, 'Sports car', 1500,
 	5, 'Truck', 600,
 	4, 'Van', 600,
 	14, 'Bus/Mini Bus', 100,
 	3, 'Others', 0,
 ]

 for veh_code, category, limit in zip(categories[0::3], categories[1::3], categories[2::3]):
 	# Collect links per category. Carrying the list over between categories
 	# would re-fetch earlier listings and save them under the wrong label.
 	all_links = []
 	for k in range(limit // 100):
 		cursor = k * 100  # offset passed as BRSR for result page k
 		page_links = get_links(url.format(cursor, veh_code))
 		print("Page {} links .page {} fetched".format(category, k))
 		all_links.extend(page_links)

 	all_codes = get_codes(all_links)

 	fetch_and_save(all_codes, category)
	# used cars

	import requests
	import csv
	from bs4 import BeautifulSoup


	# Detail page of a single listing; formatted with the ID and DL codes taken
	# from a listing link.
	USED_CARS_URL = 'https://www.sgcarmart.com/used_cars/info.php?ID={}&DL={}'


	# Search-result page; formatted with the result offset (BRSR) and the
	# vehicle-type code (VEH). Each parameter needs its own '&' separator,
	# otherwise the VEH part is fused into the BRSR value.
	url = 'https://www.sgcarmart.com/used_cars/listing.php?BRSR={}&VEH={}&RPG=100&AVL=2'


	def get_links(page_url):
		"""Download *page_url* and return every ``<a>`` element found in it.

		The page is parsed with the ``lxml`` parser and a progress line is
		printed once the download has completed.
		"""
		r = requests.get(page_url)
		response = r.text
		print('got page')
		soup = BeautifulSoup(response, 'lxml')

		links = soup.find_all('a')
		return links


	def get_codes(links):
		"""Extract ``[ID, DL]`` pairs from anchors that point at ``info.php``.

		Args:
			links: Iterable of objects exposing ``get('href')``, such as the
				anchors returned by ``get_links``.

		Returns:
			A list of ``[ID, DL]`` string pairs in input order. ``ID`` is the
			value of the first query parameter and ``DL`` the value of the
			second; ``DL`` falls back to ``'8000'`` when the second parameter
			is missing or carries no ``=``. Links without an href, without
			``info.php`` in it, or without a readable ID are skipped.
		"""
		codes = []

		for link in links:
			# Anchors without an href yield None, which would make the
			# substring test below raise TypeError.
			href = link.get('href') or ''
			if 'info.php' not in href:
				continue

			pieces = href.split('?')
			if len(pieces) < 2:
				# No query string, hence no ID to read.
				continue
			params = pieces[1].split('&')

			try:
				listing_id = params[0].split('=')[1]
			except IndexError:
				# First parameter has no '=': nothing usable to fetch later.
				continue

			try:
				dl = params[1].split('=')[1]
			except IndexError:
				dl = '8000'

			codes.append([listing_id, dl])

		return codes



	def _read_field(soup, label, default='', as_text=False, drop=None):
		"""Return the first child of the ``<td>`` that follows the text *label*.

		Args:
			soup: Parsed page to search.
			label: Exact text of the node that precedes the wanted cell.
			default: Value returned when the label or its cell cannot be read.
			as_text: When true, return the ``.text`` of the first child instead
				of the child itself.
			drop: Optional substring removed from the value before returning.
		"""
		try:
			value = soup.find(string=label).find_next('td').contents[0]
			if as_text:
				value = value.text
			if drop is not None:
				# str.replace returns a new string, so the result is rebound.
				value = value.replace(drop, '')
		except (AttributeError, IndexError, TypeError):
			# A missing label, missing cell, empty cell or non-text child all
			# surface as one of these somewhere along the lookup chain.
			return default
		return value


	def fetch_and_save(codes, vehicle_type):
		"""Fetch listing detail pages and append one CSV row per listing.

		Only every other entry of *codes* (indices 0, 2, 4, ...) is fetched.
		NOTE(review): presumably each listing is linked twice on a result page
		and this skips the duplicate -- confirm against the live markup.

		Args:
			codes: Sequence of ``[ID, DL]`` pairs as produced by ``get_codes``.
			vehicle_type: Category label written to the second CSV column.

		Returns:
			None. Rows are appended to ``task2.csv`` in the column order
			vehicle, vehicle_type, posted, owner_type, price, dep, reg_date,
			man_f, mil, road_tax, coe, omv, arf. Listings that cannot be
			downloaded or parsed are reported on stdout and skipped.
		"""
		filler = '\r\n\t\t\t\t\t\t\t\t'

		for index, code in enumerate(codes):
			if index % 2:
				continue

			listing_url = USED_CARS_URL.format(code[0], code[1])
			# A DL code of 1000 marks a direct-owner listing; anything else is
			# treated as a dealer listing.
			owner_type = "Dealer"
			if int(code[1]) == 1000:
				owner_type = "Direct Owner Sale"

			try:
				# The HTML is parsed exactly as received: deleting \r, \t and
				# \n from raw markup could fuse a tag name with an attribute
				# that is written on the following line.
				response = requests.get(listing_url, timeout=1).text
				soup = BeautifulSoup(response, 'lxml')
				vehicle = soup.find("a", {"class": "link_redbanner"}).text

				price = _read_field(soup, "Price", as_text=True)
				dep = _read_field(soup, "Depreciation", as_text=True)
				reg_date = _read_field(soup, "Reg Date", drop=filler)
				man_f = _read_field(soup, "Manufactured", drop=filler)
				mil = _read_field(soup, "Mileage")
				road_tax = _read_field(soup, "Road Tax", default='-')
				coe = _read_field(soup, "COE")
				arf = _read_field(soup, "ARF")
				omv = _read_field(soup, "OMV")

				# Split on a plain '|': keep the part before it, then what
				# follows that part's first ':'.
				posted = str(soup.find("div", {"id": "usedcar_postdate"}).text)
				posted = posted.split('|')[0].split(':')[1]

				row = [vehicle, vehicle_type, posted, owner_type,
					price, dep, reg_date, man_f, mil, road_tax, coe, omv, arf]

				print(row)
				# newline='' lets the csv module control line endings itself.
				with open('task2.csv', 'a', newline='') as csv_file:
					csv.writer(csv_file).writerow(row)
					print('Saved {}'.format(vehicle))

			except requests.exceptions.Timeout:
				print("Taking too long...skipped")
			except Exception as exc:
				# Best effort: one bad listing must not abort the whole crawl,
				# but the real reason is reported instead of a generic timeout.
				print("Skipped {}: {!r}".format(listing_url, exc))

	# Flat run of (VEH code, category label, limit) triples. The limit is divided
	# by 100 to get the number of result pages requested for the category
	# (presumably an approximate listing count -- confirm against the site).
	categories = [
		7, 'Station Wagon', 200,
		13, 'Mid Sized Sedan', 1900,
		12, 'Luxury Sedan', 2600,
		11, 'Hatchback', 1400,
		10, 'MPV', 1500,
		9, 'SUV', 1900,
		8, 'Sports car', 1500,
		5, 'Truck', 600,
		4, 'Van', 600,
		14, 'Bus/Mini Bus', 100,
		3, 'Others', 0,
	]

	for veh_code, category, limit in zip(categories[0::3], categories[1::3], categories[2::3]):
		# Collect links per category. Carrying the list over between categories
		# would re-fetch earlier listings and save them under the wrong label.
		all_links = []
		for k in range(limit // 100):
			cursor = k * 100  # offset passed as BRSR for result page k
			page_links = get_links(url.format(cursor, veh_code))
			print("Page {} links .page {} fetched".format(category, k))
			all_links.extend(page_links)

		all_codes = get_codes(all_links)

		fetch_and_save(all_codes, category)