Skip to content

Instantly share code, notes, and snippets.

@riteshkawadkar
Created May 7, 2021 18:02
Show Gist options
  • Save riteshkawadkar/885408d0b13327d9bd617c1548140f43 to your computer and use it in GitHub Desktop.
Save riteshkawadkar/885408d0b13327d9bd617c1548140f43 to your computer and use it in GitHub Desktop.
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Chrome
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
from scrape_linkedin import ProfileScraper
import random #new import made
from selenium import webdriver #new import made
import pandas as pd
import json
import os
import re
import time
from webdriver_manager.chrome import ChromeDriverManager
import csv
import xlsxwriter
# --- Browser setup ------------------------------------------------------
options = Options()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--ignore-certificate-errors")
## ADD your Chrome profile path below.
## You can find it by visiting chrome://version/ ("Profile Path"); use the
## parent "User Data" directory, as in the example.
options.add_argument("--user-data-dir=C:\\Users\\rites\\AppData\\Local\\Google\\Chrome\\User Data")

##### Web scraper for an infinite-scrolling page #####
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
## ADD your LinkedIn activity URL here
driver.get("https://www.linkedin.com/in/riteshkawadkar/detail/recent-activity/")
time.sleep(2)  # Allow 2 seconds for the web page to open

scroll_pause_time = 1  # Pause after each scroll; increase on slow machines
screen_height = driver.execute_script("return window.screen.height;")  # screen height of the browser
i = 1
while True:
    # Scroll to the bottom so the page lazy-loads the next batch of posts.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    i += 1
    time.sleep(scroll_pause_time)
    # Re-read the document height after scrolling: it grows as new
    # content is loaded into the page.
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Stop once the height we would scroll to exceeds the total document
    # height, i.e. no new content appeared to extend the page.
    if screen_height * i > scroll_height:
        break
##### Extract post permalink URLs #####
# Each rendered post container carries a data-urn attribute
# (urn:li:activity:...), which maps directly to a /feed/update/ permalink.
posts = driver.find_elements_by_xpath("//*[@class='occludable-update ember-view']/div/div/div[contains(@class, 'feed-shared-actor--with-control-menu')]//parent::div[1]//parent::div[starts-with(@data-urn, 'urn')]")
print(len(posts))
# Build one permalink per post from its data-urn attribute.
posturls = [
    'https://www.linkedin.com/feed/update/' + post.get_attribute("data-urn")
    for post in posts
]
print(len(posturls))
# --- Write one row per post to an Excel workbook ------------------------
workbook = xlsxwriter.Workbook('write_data.xlsx')
worksheet = workbook.add_worksheet()
# Header row: one column per field. (The original wrote every header to
# the same cell (0, 1), so only the last header survived.)
headers = ('POST_URL', 'POST_AUTHOR', 'POST_DATE', 'POST_TEXT',
           'POST_REACTIONS_COUNT', 'POST_REACTIONS_COMMENT')
for col, header in enumerate(headers):
    worksheet.write(0, col, header)

row = 1
for post in posturls:
    driver.get(post)
    POST_URL = post
    # Defaults so the row is still written when a lookup below fails.
    # (The original left these names undefined on failure, which raised
    # NameError at the worksheet.write calls.)
    POST_AUTHOR = POST_DATE = POST_TEXT = ""
    POST_REACTIONS_COUNT = "0"
    POST_REACTIONS_COMMENT = "0"
    try:
        POST_AUTHOR = driver.find_element_by_xpath("//span[@class='feed-shared-actor__title']/span/span[1]").text
        POST_DATE = driver.find_element_by_xpath("//span[@class='feed-shared-actor__sub-description t-12 t-normal t-black--light']/span/span[1]").text
        POST_TEXT = driver.find_element_by_xpath("//div[@dir='ltr'][1]").text
    except Exception:
        print('POST_AUTHOR POST_DATE POST_TEXT Not found')
    try:
        POST_REACTIONS_COUNT = driver.find_element_by_xpath("//ul[starts-with(@class, 'social-details-social-counts') ]/li[1]/button/span").text
    except Exception:
        print('POST_REACTIONS_COUNT Not found')
    try:
        POST_REACTIONS_COMMENT = driver.find_element_by_xpath("//ul[starts-with(@class, 'social-details-social-counts') ]/li[2]/button/span").text
    except Exception:
        print('POST_REACTIONS_COMMENT Not found')
    # One worksheet row per post, columns matching the header row above.
    worksheet.write(row, 0, POST_URL)
    worksheet.write(row, 1, POST_AUTHOR)
    worksheet.write(row, 2, POST_DATE)
    worksheet.write(row, 3, POST_TEXT)
    worksheet.write(row, 4, POST_REACTIONS_COUNT)
    worksheet.write(row, 5, POST_REACTIONS_COMMENT)
    row += 1
workbook.close()
#print(soup)
#print(len(soup.find_all('div', class_='occludable-update ember-view')))
# Shut down the browser session now that scraping is complete.
driver.quit()
# Reference copy of the raw XPath expressions used above, kept for
# debugging in the browser devtools.
'''
POST_AUTHOR = //span[@class="feed-shared-actor__title"]/span/span
POST_DATE = //span[@class="feed-shared-actor__sub-description t-12 t-normal t-black--light"]/span/span
POST_TEXT = //div[@dir="ltr"]
POST_REACTIONS_COUNT = //ul[starts-with(@class, "social-details-social-counts") ]/li[1]/button/span
POST_REACTIONS_COMMENT = //ul[starts-with(@class, "social-details-social-counts") ]/li[2]/button/span
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment