Skip to content

Instantly share code, notes, and snippets.

@riteshkawadkar
Created May 7, 2021 18:02
Show Gist options
  • Save riteshkawadkar/885408d0b13327d9bd617c1548140f43 to your computer and use it in GitHub Desktop.
Save riteshkawadkar/885408d0b13327d9bd617c1548140f43 to your computer and use it in GitHub Desktop.
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Chrome
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
from scrape_linkedin import ProfileScraper
import random #new import made
from selenium import webdriver #new import made
import pandas as pd
import json
import os
import re
import time
from webdriver_manager.chrome import ChromeDriverManager
import csv
import xlsxwriter
# --- Browser setup ------------------------------------------------------
options = Options()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--ignore-certificate-errors")
## ADD your Chrome profile path below.
## You can find it by visiting chrome://version/ ("Profile Path"); use the
## parent "User Data" directory, as in the example.
options.add_argument("--user-data-dir=C:\\Users\\rites\\AppData\\Local\\Google\\Chrome\\User Data")

##### Web scraper for an infinite-scrolling page #####
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
## ADD your LinkedIn activity URL here
driver.get("https://www.linkedin.com/in/riteshkawadkar/detail/recent-activity/")
time.sleep(2)  # Allow 2 seconds for the web page to open

scroll_pause_time = 1  # Pause after each scroll; increase on slow machines
screen_height = driver.execute_script("return window.screen.height;")  # screen height of the browser
i = 1
while True:
    # Scroll to the bottom so the page lazy-loads the next batch of posts.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    i += 1
    time.sleep(scroll_pause_time)
    # Re-read the document height after scrolling: it grows as new
    # content is loaded into the page.
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Stop once the height we would scroll to exceeds the total document
    # height, i.e. no new content appeared to extend the page.
    if screen_height * i > scroll_height:
        break
##### Extract post permalink URLs #####
# Each rendered post container carries a data-urn attribute
# (urn:li:activity:...), which maps directly to a /feed/update/ permalink.
posts = driver.find_elements_by_xpath("//*[@class='occludable-update ember-view']/div/div/div[contains(@class, 'feed-shared-actor--with-control-menu')]//parent::div[1]//parent::div[starts-with(@data-urn, 'urn')]")
print(len(posts))
# Build one permalink per post from its data-urn attribute.
posturls = [
    'https://www.linkedin.com/feed/update/' + post.get_attribute("data-urn")
    for post in posts
]
print(len(posturls))
# --- Write one row per post to an Excel workbook ------------------------
workbook = xlsxwriter.Workbook('write_data.xlsx')
worksheet = workbook.add_worksheet()
# Header row: one column per field. (The original wrote every header to
# the same cell (0, 1), so only the last header survived.)
headers = ('POST_URL', 'POST_AUTHOR', 'POST_DATE', 'POST_TEXT',
           'POST_REACTIONS_COUNT', 'POST_REACTIONS_COMMENT')
for col, header in enumerate(headers):
    worksheet.write(0, col, header)

row = 1
for post in posturls:
    driver.get(post)
    POST_URL = post
    # Defaults so the row is still written when a lookup below fails.
    # (The original left these names undefined on failure, which raised
    # NameError at the worksheet.write calls.)
    POST_AUTHOR = POST_DATE = POST_TEXT = ""
    POST_REACTIONS_COUNT = "0"
    POST_REACTIONS_COMMENT = "0"
    try:
        POST_AUTHOR = driver.find_element_by_xpath("//span[@class='feed-shared-actor__title']/span/span[1]").text
        POST_DATE = driver.find_element_by_xpath("//span[@class='feed-shared-actor__sub-description t-12 t-normal t-black--light']/span/span[1]").text
        POST_TEXT = driver.find_element_by_xpath("//div[@dir='ltr'][1]").text
    except Exception:
        print('POST_AUTHOR POST_DATE POST_TEXT Not found')
    try:
        POST_REACTIONS_COUNT = driver.find_element_by_xpath("//ul[starts-with(@class, 'social-details-social-counts') ]/li[1]/button/span").text
    except Exception:
        print('POST_REACTIONS_COUNT Not found')
    try:
        POST_REACTIONS_COMMENT = driver.find_element_by_xpath("//ul[starts-with(@class, 'social-details-social-counts') ]/li[2]/button/span").text
    except Exception:
        print('POST_REACTIONS_COMMENT Not found')
    # One worksheet row per post, columns matching the header row above.
    worksheet.write(row, 0, POST_URL)
    worksheet.write(row, 1, POST_AUTHOR)
    worksheet.write(row, 2, POST_DATE)
    worksheet.write(row, 3, POST_TEXT)
    worksheet.write(row, 4, POST_REACTIONS_COUNT)
    worksheet.write(row, 5, POST_REACTIONS_COMMENT)
    row += 1
workbook.close()
#print(soup)
#print(len(soup.find_all('div', class_='occludable-update ember-view')))
# Shut down the browser session now that scraping is complete.
driver.quit()
# Reference copy of the raw XPath expressions used above, kept for
# debugging in the browser devtools.
'''
POST_AUTHOR = //span[@class="feed-shared-actor__title"]/span/span
POST_DATE = //span[@class="feed-shared-actor__sub-description t-12 t-normal t-black--light"]/span/span
POST_TEXT = //div[@dir="ltr"]
POST_REACTIONS_COUNT = //ul[starts-with(@class, "social-details-social-counts") ]/li[1]/button/span
POST_REACTIONS_COMMENT = //ul[starts-with(@class, "social-details-social-counts") ]/li[2]/button/span
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment