Last active
January 31, 2024 11:55
-
-
Save a-v-ershov/1fb20cb2761f51b26e99a55043cf371f to your computer and use it in GitHub Desktop.
Parser to create Pandas Dataframe from HTML pages from crunchbase.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from datetime import datetime | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
from tqdm.notebook import tqdm | |
def parse_crunchbase_html(soup: BeautifulSoup) -> pd.DataFrame: | |
""" | |
Parse html page with search results from crunchbase.com | |
""" | |
# Get data for each company | |
companies_data = soup.find_all("grid-row", {"class": "ng-star-inserted"}) | |
# Get all column ids | |
grid_cells = companies_data[0].find_all("grid-cell", {"class": lambda x: x and x.startswith("column-id")}) | |
classes = [g["class"] for g in grid_cells] | |
classes = [item for sublist in classes for item in sublist] | |
column_ids = list(set([c for c in classes if c.startswith("column-id")])) | |
# Get column names | |
header = soup.find("grid-header") | |
column_ids_to_names = {} | |
for c in column_ids: | |
try: | |
column_ids_to_names[c] = header.find("grid-column-header", {"class": c}).text.strip() | |
except Exception: | |
continue | |
# Parse data for each company | |
name_column = column_ids_to_names["column-id-identifier"] | |
parsed_data = [] | |
for company_data in companies_data: | |
# Get company name | |
company_name = company_data.find("div", {"class": "identifier-label"}).text.strip() | |
# Get data for each column | |
parsed_company_data = {name_column: company_name} | |
for cid, cname in column_ids_to_names.items(): | |
parsed_company_data[cname] = company_data.find("grid-cell", {"class": cid}).text.strip() | |
parsed_data.append(parsed_company_data) | |
# Create dataframe | |
return pd.DataFrame(parsed_data) | |
def parse_crunchbase_html_files(folder_name: str) -> pd.DataFrame: | |
""" | |
Parse all html files in a folder | |
""" | |
abs_path = os.path.abspath(folder_name) | |
html_filenames = [os.path.join(abs_path, f) for f in os.listdir(folder_name) if f.endswith(".html")] | |
dfs = [] | |
for html_filename in tqdm(html_filenames): | |
with open(html_filename, "r") as f: | |
soup = BeautifulSoup(f, "html.parser") | |
df = parse_crunchbase_html(soup) | |
dfs.append(df) | |
return pd.concat(dfs) | |
def enrich_crunchbase_df(df: pd.DataFrame) -> pd.DataFrame: | |
""" | |
Drop duplicates, parse several columns and remove columns with empty values | |
""" | |
edf = df.copy(deep=True) | |
edf["Founded Date parsed"] = edf["Founded Date"].apply(_parse_date_column) | |
edf["Founded Year"] = edf["Founded Date parsed"].apply(lambda x: int(x.year) if x else None) | |
edf["Max revenue in millions"] = edf["Estimated Revenue Range"].apply(_get_max_revenue_in_milions) | |
edf["Max number of employees"] = edf["Number of Employees"].apply(_get_max_number_of_employees) | |
edf["Headquarter county"] = edf["Headquarters Location"].apply(lambda x: x.split(",")[-1].strip()) | |
edf = edf.drop(["Facebook", "Twitter", "LinkedIn"], axis=1) | |
edf = edf.drop_duplicates(subset=["Organization Name"], keep="first") | |
return edf | |
def prepare_df_for_excel_export(df: pd.DataFrame) -> pd.DataFrame: | |
""" | |
Fix illegal characters in a dataframe to be exported to excel | |
""" | |
return df.applymap( | |
lambda x: x.replace("\r", " ").replace("\n", " ").replace("\t", " ").replace("\v", " ").replace("\x03", " ") | |
if isinstance(x, str) | |
else x | |
) | |
def _parse_date_column(date_str: str): | |
""" | |
Parse a date string to a datetime object | |
""" | |
for date_format in ["%Y", "%b %d, %Y", "%d %b %Y", "%b %Y"]: | |
try: | |
return datetime.strptime(date_str, date_format).date() | |
except ValueError: | |
continue | |
return None | |
def _get_max_revenue_in_milions(revenue_range: str): | |
""" | |
Get a max number from a revenue range string in millions. | |
Example: '$10M to $50M' to to 50. '$500M to $1B' to 1000 $10b+ to 10000 | |
""" | |
parsed_revenue_range = revenue_range.replace("$", "").replace("M", "").replace("B", "000").replace("+", "") | |
parsed_revenue_range = parsed_revenue_range.split(" to ") | |
return int(parsed_revenue_range[-1]) | |
def _get_max_number_of_employees(employee_range: str) -> int | None: | |
""" | |
Translates an employee range string to the maximum number. | |
Example: '1-10' translates to 10. '10001+' translates to 10001. | |
""" | |
if employee_range == "—": | |
return None | |
else: | |
parsed_employee_range = employee_range.replace("+", "").split("-") | |
return int(parsed_employee_range[-1]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment