# Create (or reuse) a Spark session, read a CSV, and write it back out in another format.
spark = SparkSession.builder.appName('Magic').getOrCreate()

# NOTE(review): no "header" option is set, so the first CSV row is treated as data
# and columns get default names (_c0, _c1, ...) -- confirm that is intended.
df = spark.read.format("csv").option("multiline", True).option("delimiter", ",").load(file_path)

# BUG FIX: `coalase` is not a DataFrame method -- the correct name is `coalesce`.
# coalesce(1) funnels all partitions into one so a single output file is produced.
df.coalesce(1).write.mode("overwrite").format(file_type).save(file_path2)
# Spark authentication to a data store (needed when reading from a Blob store / Data Lake, etc.):
# Authenticate to an external store by setting a key on the underlying Hadoop configuration.
# BUG FIX: the variable was misspelled `accound_name`.
# NOTE(review): for e.g. Azure, the config *key* is normally a full path such as
# f"fs.azure.account.key.{account_name}.blob.core.windows.net" -- confirm against
# the store actually being read.
spark.sparkContext._jsc.hadoopConfiguration().set(account_name, account_key)
# zipWithIndex() pairs every RDD element with its position:
#   Row("Sam", ...)  ==>  (Row("Sam", ...), 0)
# BUG FIX: zipWithIndex yields (element, index) -- the index comes SECOND, not first
# as the old comments claimed. Also, the original `lambda row: row[0], row[1:]`
# parsed as a tuple of (lambda, expression) and raised NameError on `row`.
indexed = df.rdd.zipWithIndex().map(lambda pair: (pair[1], pair[0][0]))
# BUG FIX: RDD.toDF takes a single schema argument (here a list of column names),
# not varargs -- `toDF("ID", "Name")` passes the second string as sampleRatio.
# NOTE(review): assumes the first field of each Row is the name -- verify the schema.
new_df = indexed.toDF(["ID", "Name"])
# BUG FIX: `printSchma` -> `printSchema`.
new_df.printSchema()
print(new_df.count())  # count() is an action: triggers a full job
new_df = new_df.filter(new_df.Name == "Sam")  # keep only rows whose Name is "Sam"

# Min / max of the ID column; agg() accepts a {column: function} dict, and the
# resulting column is named "<fn>(<col>)", hence the string keys below.
df.agg({'ID': 'min'}).collect()[0]['min(ID)']
df.agg({'ID': 'max'}).collect()[0]['max(ID)']
# BUG FIX: DataFrame.groupBy returns GroupedData, which has no `mapValues` -- that
# method belongs to the RDD API. To collect the rows for each Name into a list,
# drop down to the RDD:
df.rdd.map(lambda r: (r["Name"], r)).groupByKey().mapValues(list)
# Pin df (default storage level) so the actions between these calls do not
# recompute its lineage each time.
df.persist()
# ... actions that reuse df go here ...
# Release the cached partitions once we are done with them.
df.unpersist()
# BUG FIX: the method is `setJobGroup`, not `setGroupJob`. It tags every job started
# from this thread so they appear (and can be cancelled) as a group in the Spark UI.
spark.sparkContext.setJobGroup("title", 'description here')
# Peek at the first five rows: DataFrame.take(n) returns the same list of Rows as head(n).
df.take(5)
# Same peek, but via the underlying RDD (also returns a list of Row objects).
df.rdd.take(5)