Last active
March 30, 2022 09:34
-
-
Save geoffreygarrett/0e59e3962cb1a8f801db7536b5fb0b6a to your computer and use it in GitHub Desktop.
Function to scrape all tables from a URL to a list of pandas DataFrame objects.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Function to scrape tables from a URL into pandas DataFrame objects.

The URL is supplied by the caller; the example run in the ``__main__`` block
uses the Wikipedia page comparing orbital launcher families.
"""
import io

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
def scrape_url_for_tables(url, html_class="wikitable sortable", timeout=30):
    """
    Scrape every matching HTML table on a page into pandas DataFrames.

    Parameters
    ----------
    url : str
        URL of the page to scrape (for example a Wikipedia article).
    html_class : str, optional
        Value of the ``class`` attribute identifying the tables to scrape.
        A multi-word value is matched by BeautifulSoup against the exact
        ``class`` string, so ``"wikitable sortable"`` will not match a table
        whose attribute lists the classes in another order or adds more.
    timeout : float, optional
        Seconds to wait for the server before giving up. ``requests`` applies
        no timeout on its own, so without it a stalled server blocks forever.

    Returns
    -------
    dfs : List[pandas.DataFrame]
        List of DataFrames containing the scraped data, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    ValueError
        If no table with the requested class is present on the page.
    """
    # Retrieve the page; fail loudly on HTTP errors rather than parsing an
    # error page as if it were the requested content.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Create a BeautifulSoup object
    soup = bs(response.text, "html.parser")

    # Find *all* matching tables (``find`` would only return the first one).
    tables = soup.find_all("table", {"class": html_class})
    if not tables:
        raise ValueError(
            f"No <table> with class {html_class!r} found at {url!r}"
        )

    # ``read_html`` returns a list per call; wrap the markup in StringIO
    # because passing literal HTML strings is deprecated in recent pandas.
    dfs = []
    for table in tables:
        dfs.extend(pd.read_html(io.StringIO(str(table))))
    return dfs
if __name__ == "__main__":
    # Example: print the first scraped table from the Wikipedia comparison
    # of orbital launcher families.
    URL = "https://en.wikipedia.org/wiki/Comparison_of_orbital_launcher_families"
    # Call the scraper defined in this module; the previously referenced
    # ``url_tables_to_dataframes`` does not exist and raised NameError.
    df = scrape_url_for_tables(URL)[0]
    print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment