Skip to content

Instantly share code, notes, and snippets.

@gregoriopellegrino
Last active March 23, 2019 09:00
Show Gist options
  • Save gregoriopellegrino/6f696e53e3edc422f019be22854d4726 to your computer and use it in GitHub Desktop.
Save gregoriopellegrino/6f696e53e3edc422f019be22854d4726 to your computer and use it in GitHub Desktop.
Scraping the web
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from pyvirtualdisplay import Display
import os
from bs4 import BeautifulSoup
import csv
# First result page of the autoscout24.it search; following pages are reached
# through the pagination link found on each page.
url_partenza = "https://www.autoscout24.it/lst/mini/cooper?sort=standard&desc=0&fuel=L&doorfrom=2&doorto=3&ustate=N%2CU&cy=I&atype=C"

# Visible text of the pagination link that leads to the next result page.
NEXT_LINK_TEXT = "Successivo"

# CSS selectors whose text fills the fixed leading columns of every CSV row,
# in output order.
SUMMARY_SELECTORS = (
    '.cldt-summary-makemodel',
    '.cldt-summary-version',
    '.cldt-price',
    '.cldf-summary-seller-company-name',
    '.cldf-summary-seller-contact-address',
)


def clean_text(text):
    """Return *text* stripped, with embedded newlines replaced by spaces."""
    return text.strip().replace("\n", " ")


def last_text(node, selector):
    """Return the cleaned text of the last element matching *selector*.

    *node* is anything exposing ``select(selector)`` that returns a list of
    elements with a ``text`` attribute (a BeautifulSoup tag in this script).
    Returns ``""`` when nothing matches, so a missing element never raises
    and the columns of the row stay aligned.
    """
    matches = node.select(selector)
    return clean_text(matches[-1].text) if matches else ""


def extract_row(annuncio):
    """Build one CSV row from a single listing element.

    The row starts with one column per entry of ``SUMMARY_SELECTORS`` and is
    followed by one column for each ``.cldt-summary-vehicle-data ul li`` item.
    """
    row = [last_text(annuncio, selector) for selector in SUMMARY_SELECTORS]
    row.extend(
        clean_text(caratteristica.text)
        for caratteristica in annuncio.select('.cldt-summary-vehicle-data ul li')
    )
    return row


def scrape(browser, start_url):
    """Walk the result pages starting at *start_url* and return all rows.

    Each page is loaded in *browser*, parsed, and every
    ``div.cldt-summary-full-item`` element is turned into a row. Paging stops
    when the page has no next link or the link carries no ``href``.
    """
    autos = []
    url = start_url
    while True:
        browser.get(url)
        print(browser.current_url)
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(browser.page_source, "html.parser")
        autos.extend(
            extract_row(annuncio)
            for annuncio in soup.select('div.cldt-summary-full-item')
        )
        # find_elements returns an empty list instead of raising when the link
        # is absent; the string locator works on both Selenium 3 and 4.
        # NOTE: with the implicit wait set, a page without the link takes the
        # full wait before the empty list comes back.
        links = browser.find_elements("partial link text", NEXT_LINK_TEXT)
        next_url = links[0].get_attribute("href") if links else None
        if not next_url:
            break
        url = next_url
    return autos


def write_rows(rows, path):
    """Write *rows* to *path* as CSV, replacing any existing file."""
    # newline="" lets the csv module control line endings (no blank rows on
    # Windows); an explicit encoding keeps non-ASCII text locale-independent.
    with open(path, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)


def main():
    """Scrape the listings and save them as ``auto.csv`` next to this script.

    The virtual display and the browser are always shut down, including on
    Ctrl-C or any scraping error; the CSV is written only after a complete run.
    """
    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        browser = webdriver.Firefox()
        try:
            browser.implicitly_wait(30)
            autos = scrape(browser, url_partenza)
        finally:
            browser.quit()
    finally:
        display.stop()

    output_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'auto.csv')
    write_rows(autos, output_path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment