@databyjp
Created August 26, 2020 12:10
import pandas as pd
# ===== START SCRAPING =====
import requests
from bs4 import BeautifulSoup
import re

def scrape_this(uri="/pages/forms/"):
    page = requests.get("https://scrapethissite.com" + uri)
    soup = BeautifulSoup(page.text, "html.parser")

    # ========== GET A SIMPLE STAT ==========
    div = soup.find(id="hockey")  # Find the right div
    table = div.find("table")

    # ========== SCRAPE AN ENTIRE TABLE ==========
    data_rows = table.find_all("tr", attrs={"class": "team"})  # Includes the header row!
    parsed_data = list()
    # Column keys come from the class attribute of each <td> in the first row
    stat_keys = [col.attrs["class"][0] for col in data_rows[0].find_all("td")]
    for row in data_rows:
        tmp_data = dict()
        for attr in stat_keys:
            attr_val = row.find(attrs={"class": attr}).text
            tmp_data[attr] = re.sub(r"^\s+|\s+$", "", attr_val)  # Trim surrounding whitespace
        parsed_data.append(tmp_data)
    data_df = pd.DataFrame(parsed_data)
    return data_df
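
# Example usage (a sketch; this assumes the site's pagination URIs look like
# "/pages/forms/?page_num=2"):
#   scrape_this("/pages/forms/?page_num=2")
# returns one DataFrame with one row per team listed on that page.

# ===== COLLECT PAGINATION LINKS =====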
page = requests.get("https://scrapethissite.com/pages/forms/")
soup = BeautifulSoup(page.text, "html.parser")
pagination = soup.find("ul", attrs={"class": "pagination"})
link_elms = pagination.find_all("li")
links = [link_elm.find("a").attrs["href"] for link_elm in link_elms]
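# set() de-duplicates the hrefs, since the pagination list can point at the
# same page more than once (e.g. via a "next" arrow)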
links = set(links)

# ===== SCRAPE EVERY PAGE AND COMBINE =====
temp_dfs = list()
for link in links:
    tmp_df = scrape_this(uri=link)
    temp_dfs.append(tmp_df)
hockey_team_df = pd.concat(temp_dfs, axis=0).reset_index(drop=True)
hockey_team_df.sort_values(["year", "name"], inplace=True)
hockey_team_df.to_csv("hockey_team_df.csv")
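
# ===== OPTIONAL: QUICK SANITY CHECK =====
# A minimal sketch, assuming the script above ran in the same directory:
# read the CSV back and eyeball the shape and first few rows. The column
# names ("year", "name", etc.) are the class attributes scraped above.
check_df = pd.read_csv("hockey_team_df.csv", index_col=0)
print(check_df.shape)
print(check_df.head())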