Skip to content

Instantly share code, notes, and snippets.

@antlauzon
Created November 8, 2019 04:40
Show Gist options
  • Save antlauzon/e55e97cf8580e472cc9d8d94707c4c27 to your computer and use it in GitHub Desktop.
Save antlauzon/e55e97cf8580e472cc9d8d94707c4c27 to your computer and use it in GitHub Desktop.
optn star data for organ donation translation for athena/hive
import csv
import os
import re
import sys
from bs4 import BeautifulSoup
from pathlib import Path
DATA_DIR = sys.argv[1]
TYPE_MAP = {
'character': 'string',
'numeric': 'bigint'
}
DDL_TEMPLATE = """
CREATE EXTERNAL TABLE IF NOT EXISTS optn.{} (
{}
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
'separatorChar' = ',',
'quoteChar' = '\\"',
'escapeChar' = '\\\\'
)
STORED AS TEXTFILE
LOCATION 's3://optn-2019/{}/'
"""
DATE_RE = re.compile("(\d\d)/(\d\d)/(\d\d\d\d)")
for dat_file in Path(DATA_DIR).rglob('*.DAT'):
header_file = str(dat_file).replace('.DAT', '.htm.conv.htm')
header_data = open(header_file, 'rb').read()
header_data = header_data.decode('utf-8', 'ignore')
soup = BeautifulSoup(header_data, features="html.parser")
base_name = os.path.basename(dat_file)
ddl_path = str(dat_file).replace('.DAT', '.ddl')
csv_path = str(dat_file).replace('.DAT', '.csv')
table_name = base_name.replace('.DAT', '').lower().replace(' ', '_')
row = open
float_ind = []
with open(csv_path, 'w') as f:
for row_i, row in enumerate(open(dat_file, 'rb').readlines()):
row = row.decode('utf-8', 'ignore')
cells = row.split('\t')
formatted_cells = []
if row_i == 0:
float_ind = [False for _ in range(len(cells))]
for cell_i, cell in enumerate(cells):
cell_data = cell.strip()
date_match = DATE_RE.match(cell)
if date_match:
cell_data = "{}-{}-{}".format(date_match.group(3),
date_match.group(1),
date_match.group(2))
if cell_data != '.':
if '.' in cell_data:
float_ind[cell_i] = True
formatted_cells.append("\"{}\"".format(cell_data))
else:
formatted_cells.append("\"\"")
f.write("{}\n".format(','.join(formatted_cells)))
ddl_columns = []
for row_i, row in enumerate(soup.find_all('tr')):
if row_i == 0:
continue
is_date = False
is_float = False
column_name = ''
data_type = ''
for i, cell in enumerate(row.find_all('td')):
cell_text = cell.text.strip().lower()
if i == 0:
column_name = cell_text
if i == 1 and cell_text == 'mmddyy':
is_date = True
elif i == 3:
if is_date:
data_type = 'date'
elif cell_text == 'numeric' and float_ind[row_i-1]:
data_type = 'float'
else:
data_type = TYPE_MAP[cell_text]
ddl_columns.append("`{}` {},".format(column_name, data_type))
ddl = DDL_TEMPLATE.format(table_name,
'\n\t\t'.join(ddl_columns)[:-1],
table_name)
with open(ddl_path, 'w') as f:
f.write(ddl)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment