Skip to content

Instantly share code, notes, and snippets.

@vcode11
Created September 18, 2020 04:41
Show Gist options
  • Select an option

  • Save vcode11/1f3af7285f580101bc065a5d9d01e2b5 to your computer and use it in GitHub Desktop.

Select an option

Save vcode11/1f3af7285f580101bc065a5d9d01e2b5 to your computer and use it in GitHub Desktop.
import pickle
import time
import pandas as pd
from selenium import webdriver
import selenium.webdriver.support.ui as ui
def get_list(tags, sep: str = ';') -> str:
    '''Join the ``text`` of every element in *tags* into one string.

    Args:
        tags: Iterable of objects exposing a ``text`` attribute (the callers
            in this file pass lists of Selenium elements).
        sep: Delimiter placed between the texts; defaults to ``';'``.

    Returns:
        The texts joined by *sep*, or an empty string when *tags* is empty.
    '''
    return sep.join(tag.text for tag in tags)
# Column order of the CSV; 'project_count' is the value appended to each row
# once the collected links have been visited.
columns = [
    'name',
    'description',
    'technologies',
    'topics',
    'year',
    'project_count',
]
# Module-level accumulators filled by scrape_2020_data: one row per
# organization in `data`, and the link read for it in `links`.
data, links = [], []
# Single Firefox session shared by every page load in this script.
browser = webdriver.Firefox()
def scrape_2020_data(url, year=2020):
    '''Scrape one GSoC organization listing page into the module-level lists.

    Loads *url* in the shared ``browser``, clicks every organization card and
    reads the name, description, technology tags, topic tags and link shown
    for it, then opens each collected link and counts the project cards on
    that page.

    Args:
        url: Address of the organization listing page to load.
        year: Value stored in the ``year`` field of every row (default 2020).

    Side effects:
        Appends one ``[name, description, technologies, topics, year,
        project_count]`` row per organization to ``data`` and the link read
        for it to ``links``. Prints each row as it is read, and prints the
        exception whenever the project cards of a page could not be read
        (that organization then gets a count of 0).
    '''
    # Function-scope import keeps this block self-contained. The generic
    # find_element(s)(By.CSS_SELECTOR, ...) calls replace the
    # find_element(s)_by_css_selector shortcuts, which newer Selenium
    # releases no longer provide.
    from selenium.webdriver.common.by import By

    org_card = 'div.organization-card__name'
    project_card = 'div.project-card__right-header-content'

    browser.get(url)
    # Block (up to 15 s) until at least one organization card is present.
    ui.WebDriverWait(browser, 15).until(
        lambda driver: driver.find_elements(By.CSS_SELECTOR, org_card)
    )

    # Track only what THIS call gathers, so a repeated call neither re-visits
    # links from an earlier call nor appends a second count to older rows.
    new_rows = []
    new_links = []
    for org in browser.find_elements(By.CSS_SELECTOR, org_card):
        org.click()
        name = browser.find_element(
            By.CSS_SELECTOR, 'h2.organization-card__title').text
        technologies = get_list(browser.find_elements(
            By.CSS_SELECTOR, 'li.organization__tag--technology'))
        topics = get_list(browser.find_elements(
            By.CSS_SELECTOR, 'li.organization__tag--topic'))
        desc = browser.find_element(
            By.CSS_SELECTOR, 'p.organization-card__precis').text
        link = browser.find_element(
            By.CSS_SELECTOR, 'a.md-primary.md-button').get_attribute('href')
        row = [name, desc, technologies, topics, year]
        print(row)
        # The same row object is stored globally and locally, so the count
        # appended below is visible through `data` as well.
        data.append(row)
        links.append(link)
        new_rows.append(row)
        new_links.append(link)

    # Links are visited only after every card has been read, because
    # navigating away would leave the listing page the cards belong to.
    counts = []
    for link in new_links:
        browser.get(link)
        projects = []
        try:
            ui.WebDriverWait(browser, 15).until(
                lambda driver: driver.find_elements(
                    By.CSS_SELECTOR, project_card)
            )
            projects = browser.find_elements(By.CSS_SELECTOR, project_card)
        except Exception as e:
            # Best effort, as before: report the problem and count 0 projects.
            print(e)
        counts.append(len(projects))

    # Complete each new row with its project count (the last CSV column).
    for row, project_count in zip(new_rows, counts):
        row.append(project_count)
def save_data(filename):
    '''Write the collected rows to *filename* as CSV.

    Builds a DataFrame from the module-level ``data`` rows, labelled with
    ``columns``, and saves it without the index column.
    '''
    pd.DataFrame(data, columns=columns).to_csv(filename, index=False)
# Scrape the organization listing and write the CSV. The Firefox session is
# always shut down afterwards, even if scraping or saving fails, so no
# browser process is left running.
try:
    scrape_2020_data('https://summerofcode.withgoogle.com/organizations/?sp-page=5')
    save_data('gsoc2020.csv')
finally:
    browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment