import time
import xlsxwriter
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
options = Options()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--ignore-certificate-errors")
## Add your Chrome profile path so the session reuses your LinkedIn login.
## You can find it by visiting chrome://version/
## Replace the path below with your own Profile Path.
options.add_argument("--user-data-dir=C:\\Users\\rites\\AppData\\Local\\Google\\Chrome\\User Data")
##### Web scraper for an infinite-scrolling page #####
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
## Add your LinkedIn activity URL here
driver.get("https://www.linkedin.com/in/riteshkawadkar/detail/recent-activity/")
time.sleep(2)  # Allow 2 seconds for the web page to open
scroll_pause_time = 1  # Set your own pause time; a slower machine may need more
screen_height = driver.execute_script("return window.screen.height;")  # screen height of the browser window
i = 1
while True:
    # Scroll to the bottom of the page, one jump per iteration
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Alternative: scroll one screen height at a time
    # driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
    i += 1
    time.sleep(scroll_pause_time)
    # Update the scroll height after each scroll; it grows as new posts load
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Break the loop once the next scroll position would pass the total scroll height
    if screen_height * i > scroll_height:
        break
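# A minimal reusable sketch of the same infinite-scroll pattern, with a
# max_rounds safety cap so it cannot loop forever on a very long feed.
# The function name and parameters are illustrative (not part of the
# original script); it is defined here but never called.
def scroll_feed_to_bottom(drv, pause=1.0, max_rounds=50):
    last_height = drv.execute_script("return document.body.scrollHeight;")
    for _ in range(max_rounds):
        drv.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = drv.execute_script("return document.body.scrollHeight;")
        if new_height == last_height:  # nothing new loaded; bottom reached
            break
        last_height = new_height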
##### Collect every post loaded on the page #####
# Selenium 3 API; see the Selenium 4 note at the end of this file
posts = driver.find_elements_by_xpath("//*[@class='occludable-update ember-view']/div/div/div[contains(@class, 'feed-shared-actor--with-control-menu')]//parent::div[1]//parent::div[starts-with(@data-urn, 'urn')]")
print(len(posts))
posturls = []
# Build a permalink for each post from its data-urn attribute
for post in posts:
    url_ = 'https://www.linkedin.com/feed/update/' + post.get_attribute("data-urn")
    posturls.append(url_)
print(len(posturls))
workbook = xlsxwriter.Workbook('write_data.xlsx')
worksheet = workbook.add_worksheet()
# Header row: one column per field
worksheet.write(0, 0, 'POST_URL')
worksheet.write(0, 1, 'POST_AUTHOR')
worksheet.write(0, 2, 'POST_DATE')
worksheet.write(0, 3, 'POST_TEXT')
worksheet.write(0, 4, 'POST_REACTIONS_COUNT')
worksheet.write(0, 5, 'POST_REACTIONS_COMMENT')
row = 1
for post in posturls:
    driver.get(post)
    POST_URL = post
    try:
        POST_AUTHOR = driver.find_element_by_xpath("//span[@class='feed-shared-actor__title']/span/span[1]").text
        POST_DATE = driver.find_element_by_xpath("//span[@class='feed-shared-actor__sub-description t-12 t-normal t-black--light']/span/span[1]").text
        POST_TEXT = driver.find_element_by_xpath("//div[@dir='ltr'][1]").text
    except:
        print('POST_AUTHOR POST_DATE POST_TEXT not found')
        POST_AUTHOR = POST_DATE = POST_TEXT = ""
    try:
        POST_REACTIONS_COUNT = driver.find_element_by_xpath("//ul[starts-with(@class, 'social-details-social-counts')]/li[1]/button/span").text
    except:
        print('POST_REACTIONS_COUNT not found')
        POST_REACTIONS_COUNT = "0"
    try:
        POST_REACTIONS_COMMENT = driver.find_element_by_xpath("//ul[starts-with(@class, 'social-details-social-counts')]/li[2]/button/span").text
    except:
        print('POST_REACTIONS_COMMENT not found')
        POST_REACTIONS_COMMENT = "0"
    #print(POST_URL + ';' + POST_AUTHOR + ';' + POST_DATE + ';' + POST_REACTIONS_COUNT + ';' + POST_REACTIONS_COMMENT)
    worksheet.write(row, 0, POST_URL)
    worksheet.write(row, 1, POST_AUTHOR)
    worksheet.write(row, 2, POST_DATE)
    worksheet.write(row, 3, POST_TEXT)
    worksheet.write(row, 4, POST_REACTIONS_COUNT)
    worksheet.write(row, 5, POST_REACTIONS_COMMENT)
    row += 1
workbook.close()
driver.quit()
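# Quick sanity check of the output file: a minimal sketch, assuming pandas
# and openpyxl are installed (neither is required by the scraper itself).
import pandas as pd
df = pd.read_excel('write_data.xlsx')
print(df.head())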
'''
XPath reference for the fields scraped above:
POST_AUTHOR            = //span[@class="feed-shared-actor__title"]/span/span
POST_DATE              = //span[@class="feed-shared-actor__sub-description t-12 t-normal t-black--light"]/span/span
POST_TEXT              = //div[@dir="ltr"]
POST_REACTIONS_COUNT   = //ul[starts-with(@class, "social-details-social-counts")]/li[1]/button/span
POST_REACTIONS_COMMENT = //ul[starts-with(@class, "social-details-social-counts")]/li[2]/button/span
'''
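# Note: the find_element(s)_by_xpath helpers used above are the Selenium 3
# API and were removed in Selenium 4. A minimal sketch of the Selenium 4
# equivalent, assuming selenium>=4; the function and its simplified XPath
# are illustrative only and never called in this script.
from selenium.webdriver.common.by import By

def find_posts_selenium4(drv):
    # Same idea as the post query above, via the Selenium 4 find_elements API
    return drv.find_elements(By.XPATH, "//div[starts-with(@data-urn, 'urn')]")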