Created
November 8, 2019 04:40
-
-
Save antlauzon/e55e97cf8580e472cc9d8d94707c4c27 to your computer and use it in GitHub Desktop.
optn star data for organ donation translation for athena/hive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import os | |
import re | |
import sys | |
from bs4 import BeautifulSoup | |
from pathlib import Path | |
DATA_DIR = sys.argv[1] | |
TYPE_MAP = { | |
'character': 'string', | |
'numeric': 'bigint' | |
} | |
DDL_TEMPLATE = """ | |
CREATE EXTERNAL TABLE IF NOT EXISTS optn.{} ( | |
{} | |
) | |
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' | |
WITH SERDEPROPERTIES ( | |
'separatorChar' = ',', | |
'quoteChar' = '\\"', | |
'escapeChar' = '\\\\' | |
) | |
STORED AS TEXTFILE | |
LOCATION 's3://optn-2019/{}/' | |
""" | |
DATE_RE = re.compile("(\d\d)/(\d\d)/(\d\d\d\d)") | |
for dat_file in Path(DATA_DIR).rglob('*.DAT'): | |
header_file = str(dat_file).replace('.DAT', '.htm.conv.htm') | |
header_data = open(header_file, 'rb').read() | |
header_data = header_data.decode('utf-8', 'ignore') | |
soup = BeautifulSoup(header_data, features="html.parser") | |
base_name = os.path.basename(dat_file) | |
ddl_path = str(dat_file).replace('.DAT', '.ddl') | |
csv_path = str(dat_file).replace('.DAT', '.csv') | |
table_name = base_name.replace('.DAT', '').lower().replace(' ', '_') | |
row = open | |
float_ind = [] | |
with open(csv_path, 'w') as f: | |
for row_i, row in enumerate(open(dat_file, 'rb').readlines()): | |
row = row.decode('utf-8', 'ignore') | |
cells = row.split('\t') | |
formatted_cells = [] | |
if row_i == 0: | |
float_ind = [False for _ in range(len(cells))] | |
for cell_i, cell in enumerate(cells): | |
cell_data = cell.strip() | |
date_match = DATE_RE.match(cell) | |
if date_match: | |
cell_data = "{}-{}-{}".format(date_match.group(3), | |
date_match.group(1), | |
date_match.group(2)) | |
if cell_data != '.': | |
if '.' in cell_data: | |
float_ind[cell_i] = True | |
formatted_cells.append("\"{}\"".format(cell_data)) | |
else: | |
formatted_cells.append("\"\"") | |
f.write("{}\n".format(','.join(formatted_cells))) | |
ddl_columns = [] | |
for row_i, row in enumerate(soup.find_all('tr')): | |
if row_i == 0: | |
continue | |
is_date = False | |
is_float = False | |
column_name = '' | |
data_type = '' | |
for i, cell in enumerate(row.find_all('td')): | |
cell_text = cell.text.strip().lower() | |
if i == 0: | |
column_name = cell_text | |
if i == 1 and cell_text == 'mmddyy': | |
is_date = True | |
elif i == 3: | |
if is_date: | |
data_type = 'date' | |
elif cell_text == 'numeric' and float_ind[row_i-1]: | |
data_type = 'float' | |
else: | |
data_type = TYPE_MAP[cell_text] | |
ddl_columns.append("`{}` {},".format(column_name, data_type)) | |
ddl = DDL_TEMPLATE.format(table_name, | |
'\n\t\t'.join(ddl_columns)[:-1], | |
table_name) | |
with open(ddl_path, 'w') as f: | |
f.write(ddl) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment