networksetup -setairportpower en0 off
networksetup -setairportpower en0 on
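If en0 is not the Wi-Fi device on a given machine, list the hardware ports to find the right interface name:

networksetup -listallhardwareports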
copy schema.table
(field1, field2, field3, field4, field5)
from 's3://path/to/s3/folder/'
iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
format as csv;
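Two COPY options come up often enough to note; the variant below is a sketch assuming the input files carry a header row and are gzip-compressed (ignoreheader and gzip are standard Redshift COPY parameters):

copy schema.table
from 's3://path/to/s3/folder/'
iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
format as csv
ignoreheader 1
gzip;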
CREATE TABLE table_name (
    column1 datatype,
    column2 datatype,
    column3 datatype,
    ....
);
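For instance, a minimal events table (the names and types here are illustrative, not from the original):

CREATE TABLE events (
    event_id   BIGINT,
    event_type VARCHAR(64),
    created_at TIMESTAMP
);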
unload ('select * from schema.table')
to 's3://path/to/s3/folder/'
iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
allowoverwrite
format as csv;
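UNLOAD writes several part files in parallel by default. To force a single file with a header row, the standard header and parallel off options can be added, at the cost of throughput on large tables:

unload ('select * from schema.table')
to 's3://path/to/s3/folder/'
iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
allowoverwrite
header
parallel off
format as csv;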
# Job to load data from the platform events db to parquet
# Based on Ky's script
#
# Parameters:
# --MONTHS: number of months of data to overwrite. If the value is "ALL", load all data.
import os
import sys
import math
from datetime import datetime
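The body of the script is not shown here. As one way to read --MONTHS, a minimal sketch using plain argparse (the parse_months helper is an assumption, not part of the original job):

import argparse
import sys

def parse_months(argv):
    # --MONTHS is either a positive integer or the literal string "ALL"
    parser = argparse.ArgumentParser()
    parser.add_argument('--MONTHS', default='ALL')
    args, _ = parser.parse_known_args(argv)
    return None if args.MONTHS == 'ALL' else int(args.MONTHS)

months = parse_months(sys.argv[1:])  # None means "load all data"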
# Register the DataFrame as a temp view so it can be filtered with plain SQL
fact_df.createOrReplaceTempView('fact_df')
fact_df = spark.sql('select * from fact_df where id > 42')
fact_df = fact_df.alias('fact_df')
from pyspark.sql.functions import rand, row_number
from pyspark.sql.window import Window

# Generating continuous ids on random rows
fact_df = fact_df.withColumn('id', row_number().over(Window.orderBy(rand()))).alias('fact_df')
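A window with no partitionBy pulls the entire DataFrame onto a single partition, so this gets slow on large data. If the ids only need to be unique rather than gapless, the standard monotonically_increasing_id function avoids that shuffle:

from pyspark.sql.functions import monotonically_increasing_id

# Unique but non-contiguous ids; stays fully parallel
fact_df = fact_df.withColumn('id', monotonically_increasing_id())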
from pyspark.sql.functions import coalesce, col

# Take the first non-null value, checking the fallback columns in order
fact_df = fact_df.withColumn('columnname', coalesce(col('columnname'), col('columnname_if_null1'), col('columnname_if_null2'), col('columnname_if_null3'))).alias('fact_df')
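A constant can close out the chain when every column might be null (lit is the standard pyspark.sql.functions helper for literals; the 'unknown' default is just an example):

from pyspark.sql.functions import coalesce, col, lit

# Fall back to a constant when all the columns are null
fact_df = fact_df.withColumn('columnname', coalesce(col('columnname'), lit('unknown')))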
from pyspark.sql.functions import regexp_replace, col

# Strip commas from a string column (note: col() wraps only the column name)
fact_df = fact_df.withColumn('column_name', regexp_replace(col('column_name'), ',', ''))
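The pattern argument is a regular expression, so several characters can be stripped in one pass with a character class:

from pyspark.sql.functions import regexp_replace, col

# Remove commas, dollar signs, and percent signs in one call
fact_df = fact_df.withColumn('column_name', regexp_replace(col('column_name'), '[,$%]', ''))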
from pyspark.sql.functions import from_utc_timestamp, col

# Example 1 (recommended): region-based IDs handle daylight saving correctly
fact_df = fact_df.withColumn('datecolumn', from_utc_timestamp(col('datecolumn'), "America/Los_Angeles"))
# Example 2 (not recommended): three-letter IDs like "CST" are deprecated and ambiguous
# (US Central, China Standard, and Cuba Standard all abbreviate to CST)
fact_df = fact_df.withColumn('datecolumn', from_utc_timestamp(col('datecolumn'), "CST"))
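Relatedly, implicit timestamp conversions follow Spark's session time zone (the spark.sql.session.timeZone setting), which is worth pinning to a region-based ID as well:

spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")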