Skip to content

Instantly share code, notes, and snippets.

View 64lines's full-sized avatar

Julian Alexander Murillo 64lines

  • Universidad Politécnica de Cartagena
  • Cartagena, Murcia - Spain
View GitHub Profile
from pyspark.sql.functions import *
# Replace nulls in 'columnname' with the first non-null value among the
# fallback columns, then re-apply the 'fact_df' alias.
# NOTE(review): the same fallback column is listed three times; presumably
# these are placeholders for distinct fallback columns - confirm.
fact_df = (
    fact_df
    .withColumn(
        'columnname',
        coalesce(
            col('columnname'),
            col('columnname_if_null'),
            col('columnname_if_null'),
            col('columnname_if_null'),
        ),
    )
    .alias('fact_df')
)
from pyspark.sql.functions import *
# Remove every comma from the values of 'column_name'.
# The pattern (',') and the replacement ('') are arguments of regexp_replace,
# not of col(); col() takes only the column name.
fact_df = fact_df.withColumn(
    'column_name',
    regexp_replace(col('column_name'), ',', ''),
)
from pyspark.sql.functions import *
# Example 1 (recommended): interpret 'datecolumn' as UTC and shift it to a
# region-based time zone ID.
fact_df = fact_df.withColumn(
    'datecolumn',
    from_utc_timestamp(col('datecolumn'), "America/Los_Angeles"),
)

# Example 2 (not recommended): same conversion using a short zone abbreviation.
fact_df = fact_df.withColumn(
    'datecolumn',
    from_utc_timestamp(col('datecolumn'), "CST"),
)
@64lines
64lines / lower.py
Last active February 19, 2019 20:35
from pyspark.sql.functions import *
# Lower-case the values of 'columname' in place, then re-apply the 'fact_df' alias.
fact_df = (
    fact_df
    .withColumn('columname', lower(col('columname')))
    .alias('fact_df')
)
from pyspark.sql.functions import *
# Keep every existing fact_df column and add a copy of 'columnname' exposed
# under the name 'columnalias'.
fact_df = fact_df.select(
    ['fact_df.*', col('columnname').alias('columnalias')]
).alias('fact_df')
from pyspark.sql.functions import *
# Derive 'month', 'day' (day of month) and 'year' columns from 'datecolumn',
# re-applying the 'fact_df' alias after each added column.
fact_df = (
    fact_df
    .withColumn('month', month('datecolumn')).alias('fact_df')
    .withColumn('day', dayofmonth('datecolumn')).alias('fact_df')
    .withColumn('year', year('datecolumn')).alias('fact_df')
)
from pyspark.sql.functions import *
# Overwrite 'columname' with "<columname> <another_columname>"
# (the two columns joined by a single literal space).
fact_df = (
    fact_df
    .withColumn(
        'columname',
        concat(col('columname'), lit(' '), col('another_columname')),
    )
    .alias('fact_df')
)
# Stack the rows of other_df and another_df into one DataFrame aliased 'fact_df'.
# NOTE(review): DataFrame.union matches columns by position, not by name -
# confirm both inputs share the same column order.
fact_df = (
    other_df
    .union(another_df)
    .alias('fact_df')
)
from pyspark.sql.functions import *
# Example 1: keep only the rows where 'colname' is not null.
fact_df = (
    fact_df
    .filter(col('colname').isNotNull())
    .alias('fact_df')
)

# Example 2: keep only the rows where 'colname' is null.
fact_df = (
    fact_df
    .filter(col('colname').isNull())
    .alias('fact_df')
)

# Example 3: overwrite 'colname' with 'colname2' where 'colname' is not null,
# and with 'colname3' otherwise.
fact_df = (
    fact_df
    .withColumn(
        'colname',
        when(col('colname').isNotNull(), col('colname2'))
        .otherwise(col('colname3')),
    )
    .alias('fact_df')
)
@64lines
64lines / trim.py
Last active February 19, 2019 20:02
from pyspark.sql.functions import *
# Example 1: strip leading and trailing spaces from 'colname' in place.
fact_df = (
    fact_df
    .withColumn('colname', trim(col('colname')))
    .alias('fact_df')
)
# Example 2: left-join fact_df to dimension_df, comparing the join keys after
# trimming both sides so stray surrounding spaces do not prevent a match.
# Each side is wrapped in its own trim(); the comparison happens between the
# two trimmed columns, not inside a trim() call.
# NOTE(review): the qualified names assume the DataFrames carry the aliases
# 'fact_df' and 'dimension_df' - confirm dimension_df is aliased upstream.
fact_df = fact_df.join(
    dimension_df,
    trim(col('fact_df.colname')) == trim(col('dimension_df.another_colname')),
    'left',
).alias('fact_df')