@revolutionisme
Created March 5, 2019 09:57
Read HTML tables using pandas and write them to CSV
import pandas as pd
import os
import re
from bs4 import BeautifulSoup

# Walk the directory tree and convert every HTML table saved with an .xls
# extension into one CSV file per table.
for root, dirs, files in os.walk('/toplevelfolderpath'):
    print(root, dirs, files)
    for file in files:
        if "xls" in file:
            try:
                # read_html returns a list of DataFrames, one per <table> in the file
                tables = pd.read_html(os.path.join(root, file))

                # Parse the same file again to pull the state/district out of the
                # "Item To Search" text that precedes the tables.
                with open(os.path.join(root, file)) as f:
                    html_soup = BeautifulSoup(f, 'html.parser')
                text = html_soup.find(text=re.compile('Item To Search '))
                split_text = str(text).split(" - ")
                location = split_text[1].split("->")

                if len(location) == 2:
                    state = location[0]
                    district = location[1].replace(" \']", "")
                elif len(location) == 1:
                    state = location[0]
                    district = "Not available"
                else:
                    state = "Unnamed: 0"
                    district = "Unnamed: 0"

                # Rename the unnamed columns and write each table to its own CSV.
                for i, table in enumerate(tables):
                    table.rename(columns={'Unnamed: 0': state,
                                          'Unnamed: 1': district},
                                 inplace=True)
                    csv_file_name = file.replace(".xls", str(i) + ".csv")
                    table.to_csv(os.path.join(root, csv_file_name), index=False)
            except ValueError as ve:
                print(f"File - {file} couldn't be processed: {ve}")
            except Exception as ex:
                print(f"File - {file} couldn't be processed: {ex}")