Last active
July 18, 2022 07:25
-
-
Save KobaKhit/c0efbe6c219c7cfc21bfa5ce2f1a3d01 to your computer and use it in GitHub Desktop.
Parse all html tables on a page and return them as a list of pandas dataframes. Modified from @srome
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/ | |
# http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/
class HTMLTableParser:
    """Parse every <table> on a web page into pandas DataFrames.

    Each table is keyed by its ``id`` attribute when it has one;
    otherwise a CSS selector path is generated so callers can still
    locate the table on the page.
    """

    @staticmethod
    def get_element(node):
        # Build one CSS-path segment for `node`: position among siblings
        # matters for selectors, so form `tag:nth-child(k)` when needed.
        # NOTE(review): counts *all* previous siblings, not only siblings
        # of the same tag type — preserved from the original; a strict
        # XPath-style index would filter by node type.
        position = len(list(node.previous_siblings)) + 1
        if position > 1:
            return '%s:nth-child(%s)' % (node.name, position)
        return node.name

    @classmethod
    def get_css_path(cls, node):
        """Return a ' > '-joined CSS selector path from <body> down to node."""
        path = [cls.get_element(node)]
        for parent in node.parents:
            if parent.name == 'body':
                break
            path.insert(0, cls.get_element(parent))
        return ' > '.join(path)

    def parse_url(self, url):
        """Fetch `url` and return a list of (identifier, DataFrame) pairs.

        The identifier is the table's ``id`` attribute when present,
        otherwise its generated CSS path.
        """
        response = requests.get(url)
        # Fail loudly on HTTP errors instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        # BUG FIX: `'id' in x` on a bs4 Tag tests the tag's *children*
        # (Tag.__contains__ checks .contents), not its attributes —
        # has_attr() is the correct attribute-presence check.
        get_id = lambda x: x['id'] if x.has_attr('id') else self.get_css_path(x)
        return [(get_id(table), self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Convert one bs4 <table> Tag into a pandas DataFrame.

        Column names come from the first row of <th> cells when present;
        numeric columns are cast to float, other columns are stripped of
        surrounding whitespace.

        Raises:
            ValueError: if the number of <th> titles does not match the
                number of <td> columns.
        """
        n_rows = 0
        n_columns = 0
        column_names = []

        # First pass: count data rows/columns and capture header titles.
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if td_tags:
                n_rows += 1
                if n_columns == 0:
                    # Column count is fixed by the first data row.
                    n_columns = len(td_tags)
            th_tags = row.find_all('th')
            if th_tags and not column_names:
                # Strip newlines/whitespace that HTML headers often carry.
                column_names = [th.get_text().strip() for th in th_tags]

        # Safeguard: header width must match the data width.
        if column_names and len(column_names) != n_columns:
            raise ValueError("Column titles do not match the number of columns")

        columns = column_names if column_names else range(n_columns)
        df = pd.DataFrame(columns=columns, index=range(n_rows))

        # Second pass: fill the frame cell by cell. Header-only rows
        # contain no <td> and therefore do not advance the row marker.
        row_marker = 0
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()
            if cells:
                row_marker += 1

        # Convert each column to float where possible; otherwise treat it
        # as text and strip surrounding whitespace.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                df[col] = df[col].str.strip()
        return df
def main():
    """Demo entry point: scrape the NBA standings page and print its tables."""
    parser = HTMLTableParser()
    scraped = parser.parse_url('http://www.espn.com/nba/standings')
    print(scraped)

if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@srome Hello — I modified your script a little to handle the case where a table does not have an `id` attribute, falling back to a CSS/XPath pointer instead. I also strip newline characters from the column names.