-
-
Save albertogalan/7b9856ccb939d36263bd47ca27b5e33a to your computer and use it in GitHub Desktop.
Scraping LinkedIn profile information with Selenium and Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python package | |
import csv | |
import time | |
import random | |
import sys | |
import os | |
# selenium package | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.common.by import By | |
from selenium.common.exceptions import NoSuchElementException | |
from selenium.common.exceptions import TimeoutException | |
from selenium.common.exceptions import UnexpectedAlertPresentException | |
from selenium.common.exceptions import WebDriverException | |
from selenium.common.exceptions import StaleElementReferenceException | |
# Credentials and search term (placeholders -- replace with real values before running).
email_address = "[email protected]"  # LinkedIn login e-mail
password = "your_password"  # LinkedIn password
search_item = "Chief Happiness Officer"  # keyword typed into the LinkedIn search box
# Pause helper.
def pause(min_seconds=3, max_seconds=5):
    """Sleep for a random whole number of seconds in [min_seconds, max_seconds].

    Randomised delays make the scraper's traffic look less mechanical.
    The defaults keep the original 3-5 second behaviour; callers may pass
    smaller bounds (e.g. in tests). Returns None, like the original.
    """
    time.sleep(random.randint(min_seconds, max_seconds))
# Browser set-up.
options = webdriver.ChromeOptions()
options.add_argument("--kiosk")  # full-screen kiosk window
# pageLoadStrategy "none": driver.get() returns immediately; the script
# relies on the explicit WebDriverWait calls below instead of page-load events.
capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
driver = webdriver.Chrome(desired_capabilities=capa, chrome_options=options)
wait = WebDriverWait(driver, 30)  # shared 30 s timeout for all explicit waits
pause()
print "Driver 1 ouvert"
# Starting URL.
linkedin_url = "https://www.linkedin.com/"
# Go to LinkedIn's home page (step 1: main page).
driver.get(linkedin_url)
try:
    # Wait until the login submit button exists before touching the form.
    wait.until(EC.presence_of_element_located(
        (By.ID, "login-submit"))
    )
except (TimeoutException):
    sys.exit("Error message - loading page")
pause()
print "Connecte a Linkedin - 1"
# Sign in with the credentials defined above.
driver.find_element_by_id("login-email").send_keys(email_address)
pause()
driver.find_element_by_id("login-password").send_keys(password)
pause()
driver.find_element_by_id("login-submit").click()
pause()
# The top-nav search box becoming clickable signals the login succeeded.
wait.until(EC.element_to_be_clickable(
    (By.ID, "nav-typeahead-wormhole"))
)
pause()
print "Profil connecte a Linkedin - 1"
# Run the search: type the keyword into the top-nav search box and hit Enter.
driver.find_element_by_css_selector("div.nav-search-typeahead input").send_keys(search_item)
pause()
driver.find_element_by_css_selector("div.nav-search-typeahead input").send_keys(Keys.ENTER)
# Wait until at least one result name and the results counter are rendered.
wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "span.name.actor-name"))
)
wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, "h3.search-results__total")
))
pause()
print "Fin de la recherche avec mot cle : " + search_item
# scroll down smoothly | |
scheight = .1 | |
while scheight < 1.0: | |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight*%s);" % scheight) | |
scheight += .1 | |
pause() | |
wait.until(EC.element_to_be_clickable( | |
(By.CSS_SELECTOR, 'div.next-text') | |
)) | |
try: | |
print driver.find_element_by_css_selector('div.next-text').text | |
except(NoSuchElementException): | |
print 'No div.next-text' | |
finally: | |
pass | |
# Scrape each visible search-result card into one CSV row.
# NOTE(review): delimiter ' ' and quotechar '$' are unusual for a CSV but are
# kept as the original output format.
with open('linkedin_items.csv', 'w') as csvfile:
    cwriter = csv.writer(csvfile, delimiter=' ',
                         quotechar='$', quoting=csv.QUOTE_MINIMAL)
    # One wrapper <div> per result card on the current page.
    profils = driver.find_elements_by_css_selector("div.search-result__wrapper")
    for profil in profils:
        csv_row = []
        # --- name ---
        try:
            name_location = profil.find_element_by_css_selector(
                "span.name.actor-name")
            # Bring the card into view: LinkedIn lazy-renders offscreen cards.
            driver.execute_script(
                "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center', inline: 'nearest'});",
                name_location)
            name = name_location.text.encode('utf-8')
        except (NoSuchElementException, WebDriverException, StaleElementReferenceException):
            name = ''
        csv_row.append(name)
        # BUG FIX: the original tested `name is not 'None'`, an identity
        # comparison against a string literal that is effectively always
        # true, so the padding branch below was dead code. Test truthiness
        # instead: scrape the details only when a name was actually found.
        if name:
            # --- profile URL ---
            try:
                url_profil = profil.find_element_by_css_selector(
                    "a.search-result__result-link").get_attribute('href')
            except (NoSuchElementException, WebDriverException, StaleElementReferenceException):
                url_profil = 'None'
            csv_row.append(url_profil)
            # --- current position ---
            try:
                position = profil.find_element_by_css_selector(
                    "p.subline-level-1").text.encode('utf-8')
            except (NoSuchElementException, WebDriverException, StaleElementReferenceException):
                position = 'None'
            csv_row.append(position)
            # --- location ---
            try:
                localisation = profil.find_element_by_css_selector(
                    "p.subline-level-2").text.encode('utf-8')
            except (NoSuchElementException, WebDriverException, StaleElementReferenceException):
                localisation = 'None'
            csv_row.append(localisation)
        else:
            # No name found: pad the row so every row has four columns.
            csv_row.extend(['', '', ''])
        # --- write the row ---
        try:
            cwriter.writerow(csv_row)
            print("-- SUCCESS : %s" % name)
        except (NoSuchElementException, WebDriverException, StaleElementReferenceException, UnicodeEncodeError):
            print("Error message - csv")
# Done: close the browser window.
print("Bravo !")
driver.close()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.