Last active
April 28, 2021 15:03
-
-
Save riteshkawadkar/1f11a75175f15626e064988e0dc8679e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from selenium import webdriver | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver import Chrome | |
from selenium.common.exceptions import TimeoutException | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from webdriver_manager.chrome import ChromeDriverManager | |
# Chrome launch options shared by the scraper below.
options = Options()
# Flags commonly needed when Chrome runs in containers / restricted hosts.
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Do not abort navigation on TLS certificate problems.
options.add_argument("--ignore-certificate-errors")
# Reuse an existing Chrome profile directory (machine-specific path; adjust
# for your own environment). NOTE(review): presumably used so the session is
# already logged in to LinkedIn -- confirm.
options.add_argument("--user-data-dir=C:\\Users\\rites\\AppData\\Local\\Google\\Chrome\\User Data")
##### Web scraper for an infinite-scrolling page #####
# NOTE(review): passing the driver path positionally is the Selenium 3 style;
# newer Selenium releases expect a Service object instead -- confirm against
# the installed Selenium version.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
try:
    driver.get("https://www.linkedin.com/in/manish-kawadkar-37629435/detail/recent-activity/")
    time.sleep(2)  # Allow 2 seconds for the web page to open

    scroll_pause_time = 1  # Seconds to wait after each scroll for new content to load
    # Number of consecutive scrolls that may leave the page height unchanged
    # before we conclude that no more content is going to load.
    max_stalled_scrolls = 3

    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    stalled_scrolls = 0
    while stalled_scrolls < max_stalled_scrolls:
        # Jump to the current bottom of the page to trigger lazy loading.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)

        # The page grows when more items are loaded; when the height stops
        # changing we have (most likely) reached the end of the feed.
        new_height = driver.execute_script("return document.body.scrollHeight;")
        if new_height == scroll_height:
            stalled_scrolls += 1
        else:
            stalled_scrolls = 0
            scroll_height = new_height

    ##### Count the activity entries on the fully loaded page #####
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # NOTE(review): the class string is tied to LinkedIn's current markup and
    # may need updating if the page structure changes.
    print(len(soup.find_all('div', class_='occludable-update ember-view')))
finally:
    # Always close the browser, even if navigation or parsing fails.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment