Created
June 26, 2020 08:48
-
-
Save dsalaj/e012997c0b9f536b3d95c5be3e2d40e0 to your computer and use it in GitHub Desktop.
Cheat sheet for PySpark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PySpark DataFrame cheat sheet.
# Assumes `df` is an existing pyspark.sql.DataFrame; the columns referenced
# below (name, age, height, score, year, page) come from the individual
# snippets and need not all exist on the same DataFrame -- TODO confirm
# against your own schema.

# filter with strings
df.filter(df.name.endswith('ice')).collect()
# [Row(age=2, name='Alice')]

# order with null values at the end
df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
# [Row(name='Tom'), Row(name='Alice'), Row(name=None)]

# filter by null
df.filter(df.height.isNotNull()).collect()
df.filter(df.height.isNull()).collect()

# filter by SQL LIKE
df.filter(df.name.like('Al%')).collect()

# filter by equality to one of the elements in the list
# (these two only build a new, lazily evaluated DataFrame; chain an action
# such as .collect() or .show() to actually see rows)
df.filter(df.score.isin([10, 20, 55]))
# `~` negates the condition: keep rows whose score is NOT in the list
df.filter(~df.score.isin([10, 20, 55]))

# partition by field and save to parquet
df.write.partitionBy("year").format("parquet").save("artist_year.parquet")

# bucket and save to persistent table
# (bucketBy is used together with saveAsTable, not with a plain save())
df.write.bucketBy(42, "name").sortBy("age").saveAsTable("artist_bucketed")

# print distinct values of a column
df.select("page").distinct().show()

# print number of rows with specific value in a column
print(df.filter(df.page == "NextSong").count())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment