Created March 20, 2017 14:06
-
-
Save hbro/deb705eb7c85e5bc30ffe3a0f3d8cbed to your computer and use it in GitHub Desktop.
Scraper for www.toyotacertified.be (used Toyota cars search engine).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.support.ui import Select | |
import csv | |
import re | |
import os.path | |
import time | |
import datetime | |
import smtplib | |
from email.mime.text import MIMEText | |
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
print("Configuring...")

# Location of the CSV file that acts as a persistent cache of cars we have
# already seen and reported.
carsfile = "cars.csv"

# E-mail alert settings: recipients and sender address.
mailto = ["[email protected]", "[email protected]"]
mailfrom = "[email protected]"

# Pre-compiled regex for extracting digit groups (mileage, price) from text.
reDigits = re.compile(r'\d+')

# The models we want alerts for.
models = ["Auris", "Auris Hybride"]

# Column layout of the CSV cache file.
headers = [
    "url",
    "subject",
    "mileage",
    "fuelType",
    "buildYear",
    "carrosserieType",
    "color",
    "price",
    "dealerInfo",
    "found",
]

# Open the SMTP connection that will carry the alert mails.
print("Connecting to mailserver...")
smtpserver = smtplib.SMTP('localhost')
# ---------------------------------------------------------------------------
# CSV cache of previously-seen cars
# ---------------------------------------------------------------------------
# Load the URLs of cars already reported so we only alert on new ones.
# A set gives O(1) membership tests in the scraping loop (the original list
# made every lookup an O(n) scan).
print("Reading list of known cars in memory...")
carUrls = set()
if os.path.isfile(carsfile):
    with open(carsfile, "r", newline="\n", encoding="utf-8") as csvfile:
        csvreader = csv.DictReader(csvfile, fieldnames=headers, delimiter=",", quotechar='"')
        carUrls = {car["url"] for car in csvreader}

# Keep a handle open in append mode; each newly-found car is written as it is
# discovered. Closed explicitly at the end of the script.
print("Opening csv file for writing...")
csvfile = open(carsfile, "a", newline="\n", encoding="utf-8")
csvwriter = csv.DictWriter(csvfile, fieldnames=headers, quoting=csv.QUOTE_ALL)
# start browser
# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and removed
# in Selenium 4 — migrating to headless Firefox/Chrome is advised. The
# commented-out line below is the visible-browser alternative.
print("Starting browser...")
#browser = webdriver.Firefox()
browser = webdriver.PhantomJS()
# Poll up to 10 seconds for elements to appear before find_element_* raises.
browser.implicitly_wait(10)
# ---------------------------------------------------------------------------
# Search: one pass per model of interest, walking every result page
# ---------------------------------------------------------------------------
print("Initiating search...")

# Local set of already-known URLs. Kept up to date as cars are found so the
# same car can never be reported twice within one run (the original never
# updated the cache, so overlapping searches produced duplicate alerts).
knownUrls = set(carUrls)

for model in models:
    print("Looking for model {}...".format(model))
    # Go to the Toyota used-car advanced search page.
    browser.get("http://www.toyotacertified.be/advanced-search")

    # Fill in the search form with the desired criteria.
    print("Populating search fields...")
    Select(browser.find_element_by_id("ModelID")).select_by_visible_text(model)
    Select(browser.find_element_by_id("MileageTo")).select_by_value("100000")
    Select(browser.find_element_by_id("RegistrationYearFrom")).select_by_visible_text("2010")
    # Fuel-type checkboxes: values "1" and "4" are the wanted ones (mapping is
    # defined by the site's form — TODO confirm which fuels these are).
    for selectedFuel in browser.find_elements_by_name("selectedFuels"):
        if selectedFuel.get_attribute("value") in ["1", "4"]:
            selectedFuel.click()
    Select(browser.find_element_by_id("PriceTo")).select_by_value("15000")
    browser.find_element_by_id("PostalCode").send_keys("2350")
    Select(browser.find_element_by_id("MaxDistance")).select_by_value("50")

    # Submit the search.
    print("Submitting search...")
    browser.find_element_by_xpath("//div[@class='submitButton']/input").click()

    # Walk every page of results.
    hasNextPage = True
    while hasNextPage:
        cars = browser.find_elements_by_class_name("ulCarItem")
        print("Found {} cars!".format(len(cars)))
        for car in cars:
            carUrl = car.find_element_by_tag_name("a").get_attribute("href")
            if carUrl not in knownUrls:
                print("New car!")
                # Scrape the car's details from its result tile.
                carData = dict()
                carData["url"] = carUrl
                carData["subject"] = " ".join([car.find_element_by_class_name("liCarName").text, car.find_element_by_xpath("li[@class='liCarType']/div[@class='description']").text])
                carData["mileage"] = "".join(reDigits.findall(car.find_element_by_class_name("liMileage").text))
                carData["fuelType"] = car.find_element_by_class_name("liFuelType").text
                carData["buildYear"] = car.find_element_by_xpath("li[@class='liConstructionYear']/label[@class='year']").text
                carData["carrosserieType"] = car.find_element_by_class_name("liCarrosserieType").text
                carData["color"] = car.find_element_by_class_name("liColor").text
                carData["price"] = "".join(reDigits.findall(car.find_element_by_class_name("liPrice").text))
                carData["dealerInfo"] = car.find_element_by_class_name("dealerInfo").get_attribute('textContent').strip()
                carData["found"] = datetime.datetime.now().isoformat()
                # Report to user. Capitalize only the field NAME: the original
                # ": ".join(i).capitalize() lowercased the whole line, which
                # mangled the URL and dealer info in the output and e-mail.
                print("\n".join([" {}: {}".format(k.capitalize(), v) for k, v in carData.items()]))
                # Persist immediately and flush, so a crash later in the run
                # does not lose rows that were already reported.
                print("Storing in csv file...")
                csvwriter.writerow(carData)
                csvfile.flush()
                # Remember the car so it is not reported again this run.
                knownUrls.add(carUrl)
                # Send the e-mail alert.
                print("Sending email...")
                email = MIMEText("\n".join(["{}: {}".format(k.capitalize(), v) for k, v in carData.items()]))
                email["Subject"] = carData["subject"]
                email["From"] = mailfrom
                email["To"] = ", ".join(mailto)
                smtpserver.sendmail(mailfrom, mailto, email.as_string())
        # Pagination: the "next" link is disabled on the last page.
        print("Looking to see if there is a next page...")
        nextPageElem = browser.find_element_by_class_name("nextPage")
        if nextPageElem.get_attribute("disabled"):
            print("No next page; Done searching for model {}".format(model))
            hasNextPage = False
        else:
            print("Browsing to next page of search results...")
            nextPageElem.click()
            time.sleep(2)
    # Workaround for a Toyota site bug: a new search only works from the first
    # page of results, so navigate back before starting the next model.
    print("Returning to first page of search results (Toyota search engine bug)...")
    for firstPageElem in browser.find_elements_by_xpath("//a[@pageindex='1']"):
        if firstPageElem.is_displayed():
            firstPageElem.click()
            break
    time.sleep(2)
print("Done searching; Closing open handles...")
# ---------------------------------------------------------------------------
# Teardown: release all open handles
# ---------------------------------------------------------------------------
csvfile.close()
# quit() performs a proper SMTP QUIT before closing the socket; the original
# close() dropped the connection without terminating the session.
smtpserver.quit()
# WebDriver.quit() ends the session and closes every window; the original's
# extra close() beforehand was redundant.
browser.quit()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.