Skip to content

Instantly share code, notes, and snippets.

@nherbaut
Created May 17, 2022 08:00
Show Gist options
  • Select an option

  • Save nherbaut/1067d84c858c90990f951c17562c6591 to your computer and use it in GitHub Desktop.

Select an option

Save nherbaut/1067d84c858c90990f951c17562c6591 to your computer and use it in GitHub Desktop.
top cs researchers scraping
#!/usr/bin/env python3
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import requests
import time
import dateparser
import datetime
import json
import os
import time
import undetected_chromedriver as uc
import csv
# Start a fresh output file that contains only the column-header row.
# The header is emitted through csv.writer, using the same delimiter and
# quoting as the data rows, so that it ends with a line terminator; written
# as a bare string it had none, and the first appended record ended up glued
# onto the end of the header line.
with open('researchers.csv', 'w', newline='') as csvfile:
    header_writer = csv.writer(csvfile, delimiter=';',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header_writer.writerow(["#word_rank", "regional_rank", "name",
                            "ranking", "citations", "publications"])
def get_researchers_data(brower):
    """Append one CSV row per researcher listed on the current page.

    Each ``.scientist-item`` element found through the driver is read and
    its fields are appended to ``researchers.csv`` as a ``;``-separated row
    in the order: word_rank, regional_rank, name, ranking, citations,
    publications.

    Args:
        brower: Selenium WebDriver currently showing a ranking page. The
            misspelled parameter name is kept so existing callers keep
            working.

    Raises:
        IndexError: if an item does not contain the expected number of
            ``span span``, ``h4`` or ``.ranking`` sub-elements.
    """
    # Query through the argument instead of the module-level ``browser``
    # global, so the function works with whichever driver it is handed.
    elems = brower.find_elements(by=By.CSS_SELECTOR, value=".scientist-item")
    if not elems:
        return

    # Open the output file once per page instead of once per researcher.
    with open('researchers.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=';',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for elem in elems:
            # Run each selector once per item and pick fields by position.
            spans = elem.find_elements(by=By.CSS_SELECTOR, value="span span")
            rankings = elem.find_elements(by=By.CSS_SELECTOR, value=".ranking")
            word_rank = spans[0].text
            regional_rank = spans[2].text
            name = elem.find_elements(by=By.CSS_SELECTOR, value="h4")[0].text
            ranking = rankings[0].text
            citations = rankings[2].text
            publications = rankings[4].text
            writer.writerow([word_rank, regional_rank, name,
                             ranking, citations, publications])
# Drive a Chrome session through every page of the ranking, exporting the
# researchers of each page as it is displayed.
browser = uc.Chrome()
try:
    browser.get('https://research.com/scientists-rankings/computer-science')
    get_researchers_data(browser)

    # The link immediately following the active pagination entry is used as
    # the "next page" control; the loop stops once no such link is found.
    elems = browser.find_elements(by=By.CSS_SELECTOR, value="#rankingPagination a.active +a")
    while elems:
        # find_element(by=..., value=...) replaces find_element_by_css_selector,
        # which no longer exists in Selenium 4.3+.
        body = browser.find_element(by=By.CSS_SELECTOR, value='body')
        # Scroll far down the page before clicking -- presumably to bring the
        # pagination bar into view so the click is not intercepted.
        for _ in range(50):
            body.send_keys(Keys.PAGE_DOWN)
        time.sleep(5)
        elems[0].click()
        get_researchers_data(browser)
        # Look the link up again after navigating; the previous element
        # reference belongs to the page that was just left.
        elems = browser.find_elements(by=By.CSS_SELECTOR, value="#rankingPagination a.active +a")
finally:
    # Always shut the browser down, even when scraping fails part-way.
    browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment