Created
March 5, 2019 09:57
-
-
Save revolutionisme/14a919efae6b4f08f1cc0757758978ed to your computer and use it in GitHub Desktop.
Read html table using pandas and write to csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
from bs4 import BeautifulSoup | |
import re | |
import sys | |
for root, dirs, files in os.walk('/toplevelfolderpath'): | |
print(root, dirs, files) | |
for file in files: | |
if "xls" in file: | |
try: | |
tables = pd.read_html(os.path.join(root, file)) | |
html_soup = BeautifulSoup(open(os.path.join(root, file)), 'html.parser') | |
text = html_soup.find(text=re.compile('Item To Search ')) | |
splitted_text = str(text).split(" - ") | |
location = splitted_text[1].split("->") | |
if len(location) == 2: | |
state = location[0] | |
district = location[1].replace(" \']", "") | |
elif len(location) == 1: | |
state = location[0] | |
district = "Not available" | |
else: | |
state = "Unnamed: 0" | |
district = "Unnamed: 0" | |
for i, table in enumerate(tables): | |
table.rename(columns={'Unnamed: 0': state, | |
'Unnamed: 1': district}, | |
inplace=True) | |
csv_file_name = file.replace(".xls", str(i)+".csv") | |
table.to_csv(os.path.join(root, csv_file_name), index=False) | |
except ValueError as ve: | |
print(f"File - {file} couldn't be processed because of {sys.exc_info()}") | |
except Exception as ex: | |
print(f"File - {file} couldn't be processed because of {sys.exc_info()}") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment