@databyjp
Created August 26, 2020 12:10
import pandas as pd
# ===== START SCRAPING =====
import requests
from bs4 import BeautifulSoup
import re

def scrape_this(uri="/pages/forms/"):
    page = requests.get("https://scrapethissite.com" + uri)
    soup = BeautifulSoup(page.text, "html.parser")

    # ========== GET A SIMPLE STAT ==========
    div = soup.find(id="hockey")  # Find the right div
    table = div.find("table")

    # ========== SCRAPE AN ENTIRE TABLE ==========
    data_rows = table.find_all("tr", attrs={"class": "team"})  # Includes the header row!
    parsed_data = list()
    # Column keys come from the class attribute of each <td> in the first row
    stat_keys = [col.attrs["class"][0] for col in data_rows[0].find_all("td")]
    for row in data_rows:
        tmp_data = dict()
        for attr in stat_keys:
            attr_val = row.find(attrs={"class": attr}).text
            tmp_data[attr] = re.sub(r"^\s+|\s+$", "", attr_val)  # Trim surrounding whitespace
        parsed_data.append(tmp_data)
    data_df = pd.DataFrame(parsed_data)
    return data_df
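
# Example usage (a sketch; this assumes the site's pagination URIs look like
# "/pages/forms/?page_num=2"):
#   scrape_this("/pages/forms/?page_num=2")
# returns one DataFrame with one row per team listed on that page.

# ===== COLLECT PAGINATION LINKS =====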
page = requests.get("https://scrapethissite.com/pages/forms/")
soup = BeautifulSoup(page.text, "html.parser")
pagination = soup.find("ul", attrs={"class": "pagination"})
link_elms = pagination.find_all("li")
links = [link_elm.find("a").attrs["href"] for link_elm in link_elms]
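# set() de-duplicates the hrefs, since the pagination list can point at the
# same page more than once (e.g. via a "next" arrow)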
links = set(links)

# ===== SCRAPE EVERY PAGE AND COMBINE =====
temp_dfs = list()
for link in links:
    tmp_df = scrape_this(uri=link)
    temp_dfs.append(tmp_df)
hockey_team_df = pd.concat(temp_dfs, axis=0).reset_index(drop=True)
hockey_team_df.sort_values(["year", "name"], inplace=True)
hockey_team_df.to_csv("hockey_team_df.csv")
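
# ===== OPTIONAL: QUICK SANITY CHECK =====
# A minimal sketch, assuming the script above ran in the same directory:
# read the CSV back and eyeball the shape and first few rows. The column
# names ("year", "name", etc.) are the class attributes scraped above.
check_df = pd.read_csv("hockey_team_df.csv", index_col=0)
print(check_df.shape)
print(check_df.head())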