Skip to content

Instantly share code, notes, and snippets.

@filipeandre
Last active May 15, 2025 22:20
Show Gist options
  • Save filipeandre/748c602889309de302ce26dfbafa4de4 to your computer and use it in GitHub Desktop.
Save filipeandre/748c602889309de302ce26dfbafa4de4 to your computer and use it in GitHub Desktop.
Excel Table Data Extractor
import boto3
import tempfile
import os
import sys
from urllib.parse import urlparse
import openpyxl
def parse_s3_uri(s3_uri):
parsed = urlparse(s3_uri)
bucket = parsed.netloc
key = parsed.path.lstrip("/")
return bucket, key
def download_from_s3(s3_uri):
s3 = boto3.client("s3")
bucket, key = parse_s3_uri(s3_uri)
tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
s3.download_file(bucket, key, tmp_file.name)
return tmp_file.name
def get_cell_type_name(cell):
type_map = {
'n': 'number',
's': 'string',
'b': 'boolean',
'f': 'formula',
'e': 'error',
'd': 'date',
'inlineStr': 'richtext',
'str': 'cached_string'
}
return type_map.get(cell.data_type, 'unknown')
def extract_tables_with_types(sheet):
tables = []
visited_headers = set()
rows = list(sheet.iter_rows(min_row=1))
i = 0
while i < len(rows):
row = rows[i]
if any(cell.value is not None for cell in row):
# Treat as potential table header
header = tuple(
f"{cell.value} ({get_cell_type_name(cell)})"
for cell in row if cell.value is not None
)
if header and header not in visited_headers:
visited_headers.add(header)
table_data_types = []
# Check next rows until we hit an empty row
j = i + 1
while j < len(rows):
data_row = rows[j]
if all(cell.value is None for cell in data_row):
break # end of table
row_types = [
get_cell_type_name(cell) for cell in data_row[:len(row)]
]
table_data_types.append(row_types)
j += 1
tables.append((header, table_data_types))
i = j
continue
i += 1
return tables
def analyze_excel(file_path):
wb = openpyxl.load_workbook(file_path, data_only=True)
result = {}
for sheet_name in wb.sheetnames:
sheet = wb[sheet_name]
tables = extract_tables_with_types(sheet)
result[sheet_name] = tables
return result
def main():
if len(sys.argv) < 2:
print("Usage: python extract_excel_info.py s3://your-bucket/path/to/file.xlsx")
sys.exit(1)
s3_uri = sys.argv[1]
if not s3_uri.startswith("s3://"):
print("Error: Argument must be an S3 URI starting with s3://")
sys.exit(1)
local_file = download_from_s3(s3_uri)
try:
analysis = analyze_excel(local_file)
for sheet, tables in analysis.items():
print(f"\nSheet: {sheet}")
for idx, (headers, data_types) in enumerate(tables, 1):
print(f" Table {idx} headers: {headers}")
print(f" Data type rows (max 5 shown):")
for row in data_types[:5]:
print(f" {row}")
if len(data_types) > 5:
print(f" ... ({len(data_types)} rows total)")
finally:
os.remove(local_file)
if __name__ == "__main__":
main()
@filipeandre
Copy link
Author

curl -s https://gist.githubusercontent.com/filipeandre/748c602889309de302ce26dfbafa4de4/raw/385838832f65080bc22a82e72bd548ab638b4ba2/extract_excel_info.py | python3 - s3://your-bucket/path/to/file.xlsx

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment