Last active
September 24, 2020 12:09
-
-
Save santrancisco/d0aec75dccdfc534ce9bca145fafcd02 to your computer and use it in GitHub Desktop.
Use AWS glue result to flatten json input, generate redshift create table command and jsonpaths file for COPY job
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import argparse | |
import re | |
parser = argparse.ArgumentParser() | |
parser.add_argument("file").required | |
args=parser.parse_args() | |
typemap={ | |
"longstring":"VARCHAR(MAX)", | |
"string":"VARCHAR(MAX)", | |
"int":"INTEGER", | |
"boolean":"BOOLEAN", | |
"double":"REAL" | |
} | |
f=open(args.file,'r').read() | |
j=json.loads(f) | |
# print (json.dumps(j)) | |
col = None | |
typestr="type" | |
namestr="name" | |
if j.get("TableList") != None: ## This is when we bother to run `aws glue get-tables` | |
col=j["TableList"][0]["StorageDescriptor"]["Columns"] | |
typestr="Type" | |
namestr="Name" | |
else: ## This is when we deal with json copied straight from AWS Console WebUI network traffic (aka being lazy) | |
col=j["actionResponses"][0]["data"]["tableVersions"][0]["table"]["storageDescriptor"]["columns"] | |
mymap = {} | |
for i in col: | |
if i[typestr].startswith("struct<"): | |
s=re.sub("array<[^>]*>","longstring",i[typestr]) | |
s=re.sub("map<[^>]*>","longstring",s) | |
s=s.replace("struct<","{").replace(">","}") | |
s=re.sub(r'(?<={|,)([^:]*)(?=:)', r'"\1"', s) | |
s=re.sub(r'(?<=:)([a-zA-Z0-9]*)(?=}|,)', r'"\1"', s) | |
inner = json.loads(s) | |
mymap[i[namestr]] = inner | |
# print(s) | |
else: | |
t=i[typestr] | |
t=re.sub("array<[^>]*>","longstring",i[typestr]) | |
t=re.sub("map<[^>]*>","longstring",t) | |
mymap[i[namestr]]=t | |
crt = "CREATE TABLE IF NOT EXISTS segment.eventlogs (" | |
jsonpaths = {"jsonpaths":[]} | |
jpath=[] | |
def simplerecurse(d,prefix,jprefix): | |
global crt | |
global jpath | |
for i in d: | |
# print ("%s - %s"%(i,d[i])) | |
if isinstance(d[i],dict): | |
p = prefix+i+"_" | |
jp = jprefix+"['"+i+"']" | |
simplerecurse(d[i],p,jp) | |
else: | |
crt += "\n%s%s\t%s,"%(prefix,i.replace(" ","").replace("-","_"),typemap[d[i]]) | |
jpath.append(jprefix+"['"+i+"']") | |
simplerecurse(mymap,"","$") | |
jsonpaths["jsonpaths"] = jpath | |
crt=crt[:-1]+"\n)" | |
f=open("crt.txt","w") | |
f.write(crt) | |
f.close() | |
f=open("jsonpath.json","w") | |
f.write(json.dumps(jsonpaths, indent=4)) | |
f.close() | |
print (crt) | |
print (json.dumps(jsonpaths, indent=4)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a quick parser for glue result in aws. Glue is very good at identifying the data structure you deal with. After glue runs, you can run
aws glue get-tables --database-name {your db name} > glue.json
to get the tablelist and their data structure. This file contains column information for your data. This script run over it and createCREATE TABLE
command for AWS Redshift and an aligned jsonpath.json file that you can use to load data using COPY command straight from S3 to your Redshift.Eg: