# Toggle Wi-Fi on macOS (en0 is usually the built-in Wi-Fi interface)
networksetup -setairportpower en0 off
networksetup -setairportpower en0 on
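If en0 is not the Wi-Fi device on a given machine, networksetup -listallhardwareports prints every hardware port with its device name.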
copy schema.table
(field1, field2, field3, field4, field5)
from 's3://path/to/s3/folder/'
iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
format as csv;
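A minimal sketch of issuing the COPY from Python, assuming psycopg2 is installed; every connection parameter below is a placeholder, not a real endpoint.

import psycopg2

conn = psycopg2.connect(
    host='my-cluster.example.us-west-2.redshift.amazonaws.com',  # placeholder
    port=5439,
    dbname='mydb',
    user='myuser',
    password='mypassword',
)
with conn, conn.cursor() as cur:
    # psycopg2 commits automatically on clean exit from the connection block
    cur.execute("""
        copy schema.table
        (field1, field2, field3, field4, field5)
        from 's3://path/to/s3/folder/'
        iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
        format as csv;
    """)
conn.close()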
CREATE TABLE table_name (
    column1 datatype,
    column2 datatype,
    column3 datatype,
    ....
);
unload ('select * from schema.table')
to 's3://path/to/s3/folder/'
iam_role 'arn:aws:iam::<aws-account-id>:role/<role-name>'
allowoverwrite
format as csv;
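By default UNLOAD writes its output in parallel, producing one or more files per cluster slice under the given S3 prefix; add parallel off if a single output file is required. allowoverwrite lets a rerun replace files from a previous unload instead of failing.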
# Job to load data from the platform events db to parquet
# Based on Ky's script
#
# Parameters:
# --MONTHS: number of months of data to overwrite. If the value is "ALL", load all data.
import os
import sys
import math
from datetime import datetime
# Register a DataFrame as a temp view so it can be queried with SQL
fact_df.createOrReplaceTempView('fact_df')
fact_df = spark.sql('select * from fact_df where id > 42')
fact_df = fact_df.alias('fact_df')
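A self-contained version of the same round trip, assuming a local SparkSession; the toy data and column names are illustrative only.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()
fact_df = spark.createDataFrame([(41, 'a'), (43, 'b')], ['id', 'val'])
fact_df.createOrReplaceTempView('fact_df')
fact_df = spark.sql('select * from fact_df where id > 42')
fact_df.show()  # only the id 43 row survives the filter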
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Generating continuous ids on random rows
fact_df = fact_df.withColumn('id', row_number().over(Window.orderBy(rand()))).alias('fact_df')
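A Window with an orderBy but no partitionBy pulls every row into a single partition, which can be slow on large tables. When ids only need to be unique rather than continuous, monotonically_increasing_id avoids that shuffle; this is an alternative to the snippet above, not part of it.

from pyspark.sql.functions import monotonically_increasing_id

# Unique but non-contiguous ids, assigned per partition without a shuffle
fact_df = fact_df.withColumn('id', monotonically_increasing_id())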
from pyspark.sql.functions import *
# Fill nulls in columnname from the first non-null fallback column
fact_df = fact_df.withColumn('columnname', coalesce(col('columnname'), col('columnname_if_null1'), col('columnname_if_null2'), col('columnname_if_null3'))).alias('fact_df')
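A toy illustration of coalesce picking the first non-null value per row, assuming a local SparkSession; the data and column names are made up.

from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col

spark = SparkSession.builder.master('local[*]').getOrCreate()
df = spark.createDataFrame([(None, 'fallback'), ('primary', 'unused')], ['a', 'b'])
df.withColumn('a', coalesce(col('a'), col('b'))).show()
# row 1 gets 'fallback'; row 2 keeps 'primary'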
from pyspark.sql.functions import *
# Strip commas from a string column
fact_df = fact_df.withColumn('column_name', regexp_replace(col('column_name'), ',', ''))
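The pattern argument to regexp_replace is a Java regular expression, so a plain comma needs no escaping, but metacharacters such as '.', '$', or '(' must be escaped when they are meant literally.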
from pyspark.sql.functions import *
# Example 1 (recommended)
fact_df = fact_df.withColumn('datecolumn', from_utc_timestamp(col('datecolumn'), "America/Los_Angeles"))
# Example 2 (not recommended)
fact_df = fact_df.withColumn('datecolumn', from_utc_timestamp(col('datecolumn'), "CST"))
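Region-based zone IDs such as "America/Los_Angeles" carry full daylight-saving rules; three-letter abbreviations such as "CST" are ambiguous (Central Standard Time vs. China Standard Time) and are resolved through a deprecated Java compatibility mapping, which is why Example 2 is discouraged.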