Created
August 3, 2021 17:31
-
-
Save lobstrio/ee81093ce2289a9bc75b23cd5d98bcbd to your computer and use it in GitHub Desktop.
Collect all data from a Search URL on Google Maps 👋
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Copyright(C) 2021 lobstr | |
from selenium import webdriver | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.common.by import By | |
from selenium.common.exceptions import NoSuchElementException | |
import time | |
import csv | |
from lxml import html | |
class CrawlerGoogleMapsSelenium():
    """Selenium-driven crawler that collects place details from a Google Maps search page.

    The XPath selectors and label prefixes used here target the
    French-language Google Maps interface ("J'accepte", "Adresse: ",
    "Numéro de téléphone: ", "Site Web: ").
    """

    # Column order shared by the result dicts and the CSV export.
    FIELDNAMES = (
        'nom',
        'categorie',
        'reviews',
        'score',
        'prix',
        'adresse',
        'telephone',
        'website',
    )

    def __init__(self, driver_path="/Users/sashabouloudnine/Downloads/chromedriver"):
        """Start a Chrome session.

        Args:
            driver_path: Filesystem path of the chromedriver executable. The
                default is the path the crawler was originally written with.
        """
        # NOTE(review): recent Selenium releases expect the driver path to be
        # supplied through a Service object -- confirm against the installed
        # Selenium version.
        self.driver = webdriver.Chrome(driver_path)

    @staticmethod
    def _remove_prefix(value, prefix):
        """Return *value* with every occurrence of *prefix* removed."""
        return value.replace(prefix, '')

    @staticmethod
    def _normalize_score(value):
        """Return *value* with non-breaking spaces turned into plain spaces."""
        return value.replace('\xa0', ' ')

    def _find_text(self, xpath):
        """Return the text of the first element matching *xpath*, or '' if none."""
        try:
            return self.driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return ''

    def _find_attribute(self, xpath, attribute='aria-label'):
        """Return *attribute* of the first element matching *xpath*.

        Returns '' when no element matches or the attribute is not set, so
        callers can always apply string methods to the result.
        """
        try:
            value = self.driver.find_element(By.XPATH, xpath).get_attribute(attribute)
        except NoSuchElementException:
            return ''
        return value or ''

    def accept_cookies(self):
        """Click the "J'accepte" consent button when it is present.

        Does nothing if the button is not on the page, so the crawl can
        proceed when no consent banner is shown.
        """
        try:
            accept_button = self.driver.find_element(
                By.XPATH, '//span[contains(text(), "J\'accepte")]'
            )
        except NoSuchElementException:
            return
        self.driver.execute_script("arguments[0].scrollIntoView();", accept_button)
        time.sleep(2)
        accept_button.click()
        time.sleep(2)

    def iter_etabs(self, starting_url):
        """Yield one result dict per place linked from the search page.

        Args:
            starting_url: Google Maps search URL to open.

        Yields:
            The dict returned by ``get_etab`` for each result link.

        Raises:
            ValueError: If *starting_url* is empty (raised on first iteration,
                since this is a generator).
        """
        if not starting_url:
            raise ValueError('starting_url must be a non-empty URL')
        self.driver.get(starting_url)
        time.sleep(2)
        self.accept_cookies()
        WebDriverWait(self.driver, 20).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//div[contains(@jsaction, 'mouseover:pane')]")
            )
        )
        time.sleep(2)
        links = self.driver.find_elements(
            By.XPATH, "//div[contains(@jsaction, 'mouseover:pane')]/a"
        )
        # Collect every href before navigating: get_etab() loads a new page,
        # after which the link elements found above can no longer be used.
        hrefs = [link.get_attribute('href') for link in links]
        for href in hrefs:
            if not href:
                # A link without an href cannot be visited; skip it.
                continue
            result_dict = self.get_etab(href)
            print('\t'.join(str(v) for v in result_dict.values()))
            yield result_dict

    def get_etab(self, url):
        """Open a place page and extract its details.

        Args:
            url: URL of the place page.

        Returns:
            A dict keyed by ``FIELDNAMES``; a field that cannot be found on
            the page is the empty string.

        Raises:
            ValueError: If *url* is empty.
        """
        if not url:
            raise ValueError('url must be a non-empty URL')
        print(url)
        self.driver.get(url)
        WebDriverWait(self.driver, 20).until(
            EC.visibility_of_element_located((By.XPATH, "//span[@jstcache=127]"))
        )
        time.sleep(5)

        categorie = self._find_text('//button[@jsaction="pane.rating.category"]')
        reviews = self._find_text('//button[@jsaction="pane.rating.moreReviews"]')
        prix = self._find_text('//span[contains(@aria-label, "Prix:")]')

        # The remaining fields are read from aria-label attributes.
        nom = self._find_attribute('//div[@role="main" and @aria-label]')
        score = self._normalize_score(
            self._find_attribute('//ol[@class="section-star-array"]')
        )
        adresse = self._remove_prefix(
            self._find_attribute('//button[@data-item-id="address"]'),
            'Adresse: ',
        )
        telephone = self._remove_prefix(
            self._find_attribute('//button[contains(@aria-label, "Numéro de téléphone:")]'),
            'Numéro de téléphone: ',
        )
        website = self._remove_prefix(
            self._find_attribute('//button[contains(@aria-label, "Site Web:")]'),
            'Site Web: ',
        )

        return {
            'nom': nom,
            'categorie': categorie,
            'reviews': reviews,
            'score': score,
            'prix': prix,
            'adresse': adresse,
            'telephone': telephone,
            'website': website,
        }

    def main(self, url, output_path='googlemaps_20210803.csv'):
        """Crawl the search page at *url* and write the results as tab-separated CSV.

        Args:
            url: Google Maps search URL to crawl.
            output_path: Destination file; defaults to the original
                hard-coded file name.
        """
        # Finish the whole crawl first so the output file is only opened
        # (and truncated) once every result has been collected.
        etabs = list(self.iter_etabs(url))
        # newline='' is what the csv module expects for files it writes;
        # an explicit encoding keeps accented characters portable.
        with open(output_path, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, delimiter='\t', fieldnames=self.FIELDNAMES)
            writer.writeheader()
            writer.writerows(etabs)
if __name__ == '__main__':
    # Search results page to crawl: "restaurant marseille".
    starting_url = 'https://www.google.com/maps/search/restaurant+marseille/@43.2850096,5.3752173,14z'
    google_maps_crawler = CrawlerGoogleMapsSelenium()
    try:
        google_maps_crawler.main(starting_url)
    finally:
        # Always shut the browser session down, even when the crawl fails,
        # so no browser/driver process is left running.
        google_maps_crawler.driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment