Creates time based Glue partitions given time range.
Keep in mind that you don't need data to add partitions. So, you can create partitions for a whole year and add the data to S3 later.
| PERFIL_ELEITORADO,CONSULTA_CAND_2010,CONSULTA_CAND_2012,CONSULTA_CAND_2014,BEM_CANDIDATO,CONSULTA_LEGENDAS ,CONSULTA_VAGAS ,VOTACAO_CANDIDATO_MUN_ZONA_2012,VOTACAO_CANDIDATO_MUN_ZONA_2014,VOTACAO_PARTIDO_MUN_ZONA_2012,VOTACAO_PARTIDO_MUN_ZONA_2014,VOTO_SECAO ,DETALHE_VOTACAO_MUN_ZONA_2012,DETALHE_VOTACAO_MUN_ZONA_2014 | |
| PERIODO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO | |
| UF,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO | |
| MUNICIPIO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO | |
| COD_MUNICIPIO_TSE,NUM_TURNO ,NUM_TURNO ,NUM_TURNO ,DESCRICAO_ELEICAO,NUM_TURNO,DESCRICAO_ELEICAO,NUM_TURNO,NUM_TURNO,NUM_TURNO,NUM_TURNO,NUM_TURNO,NUM_TURNO,NUM_TURNO | |
| NR_ZONA,DESCRICAO_ELEI |
| "AC","AL","AP","AM","BA","CE","DF","ES","GO","MA","MT","MS","MG","PA","PB","PR","PE","PI","RJ","RN","RS","RO","RR","SC","SP","SE","TO" |
| import xmltodict | |
| import pandas as pd | |
| import requests | |
| xml = request.get('url').text | |
| df = pd.DataFrame(xmltodict.parse(xml)) | |
| df.rename(columns=lambda x: x.replace('@', ''), inplace=True) | |
| df.to_csv('data.csv') |
| def split_data_frame_list(df, | |
| target_column, | |
| output_type=float): | |
| ''' | |
| Accepts a column with multiple types and splits list variables to several rows. | |
| df: dataframe to split | |
| target_column: the column containing the values to split | |
| output_type: type of all outputs |
| def suffix(alist): | |
| if not len(alist): | |
| return [[]] | |
| else: | |
| return [alist] + suffix(alist[1:]) | |
| def preffix(alist): | |
| if not len(alist): |
| DROP TABLE IF EXISTS main; | |
| CREATE EXTERNAL TABLE main ( | |
| endTimeMillis BIGINT, | |
| startTimeMillis BIGINT, | |
| endTime STRING, | |
| startTime STRING, | |
| jams array<struct< | |
| uuid: STRING, | |
| pubMillis: BIGINT, |
| CREATE TABLE waze.polygons_geo | |
| WITH ( | |
| external_location = 's3://...', | |
| format = 'Parquet') AS | |
| WITH dataset AS ( | |
| SELECT | |
| polygons | |
| FROM waze.polygons) | |
| SELECT | |
| pol.polygon, |
Creates time based Glue partitions given time range.
Keep in mind that you don't need data to add partitions. So, you can create partitions for a whole year and add the data to S3 later.
| SELECT | |
| '{"type":"LineString", "coordinates":' || | |
| '[' || array_join(transform(line, loc -> '[' || CAST(loc.x AS VARCHAR) || ',' || CAST(loc.y AS VARCHAR) || ']'), ',') || ']}' | |
| FROM test.test |
| rm -r python | |
| rm lambda_layer | |
| mkdir python | |
| printf "[install]\nprefix=" > ~/.pydistutils.cfg | |
| pip3.7 install $1 -t python/ # insert any pip available module, repeat if necessary | |
| printf "" > ~/.pydistutils.cfg | |
| zip -r lambda_layer.zip ./python | |
| aws s3 cp lambda_layer.zip s3://config-lambda/layers/$1/ | |
| aws lambda publish-layer-version \ | |
| --layer-name $1 \ |