Created
August 26, 2020 12:10
-
-
Save databyjp/9e7e0755f7918ea7c7528fd09b06cbc3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| # ===== START SCRAPING ===== | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
def scrape_this(uri="/pages/forms/", timeout=30):
    """Scrape the hockey team table from one page of scrapethissite.com.

    Args:
        uri: Path (plus any query string) appended to
            ``https://scrapethissite.com``.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``pandas.DataFrame`` with one row per ``<tr class="team">`` and one
        column per cell, named after the first CSS class of each ``<td>`` in
        the first team row. Cell text has surrounding whitespace removed and
        is left as strings. An empty DataFrame is returned when the page has
        no team rows.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        AttributeError: If the page has no ``id="hockey"`` element, or that
            element contains no ``<table>``.
    """
    page = requests.get("https://scrapethissite.com" + uri, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page as
    # if it were the data table.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")

    # The stats table lives inside the element with id="hockey".
    table = soup.find(id="hockey").find("table")
    team_rows = table.find_all("tr", attrs={"class": "team"})
    if not team_rows:
        # Nothing to parse; indexing team_rows[0] below would raise IndexError.
        return pd.DataFrame()

    # Column names come from the first CSS class of each cell in the first
    # team row; every other row is looked up by those same class names.
    stat_keys = [cell.attrs["class"][0] for cell in team_rows[0].find_all("td")]

    parsed_data = []
    for row in team_rows:
        parsed_data.append(
            {key: row.find(attrs={"class": key}).text.strip() for key in stat_keys}
        )
    return pd.DataFrame(parsed_data)
# Fetch the first listing page only to discover the pagination links.
page = requests.get("https://scrapethissite.com/pages/forms/", timeout=30)
# Stop here on an error response rather than searching an error page for links.
page.raise_for_status()
soup = BeautifulSoup(page.text, "html.parser")
pagination = soup.find("ul", attrs={"class": "pagination"})
link_elms = pagination.find_all("li")
hrefs = [link_elm.find("a").attrs["href"] for link_elm in link_elms]
# Drop repeated hrefs while keeping page order. A set would also de-duplicate,
# but its iteration order for strings can change from one run to the next.
links = list(dict.fromkeys(hrefs))

# Scrape every listed page into its own DataFrame.
temp_dfs = [scrape_this(uri=link) for link in links]

# Sort first, then renumber, so the index written to the CSV follows the
# sorted order and is identical on every run.
hockey_team_df = (
    pd.concat(temp_dfs, axis=0)
    .sort_values(["year", "name"])
    .reset_index(drop=True)
)
hockey_team_df.to_csv("hockey_team_df.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment