Last active
March 19, 2024 12:18
-
-
Save Menziess/905a3d5084d6a3f7db62469db5c6afd2 to your computer and use it in GitHub Desktop.
Spark: keep only the rows whose group contains all required values, using a window function
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import Column
from functools import reduce

# Keep only the rows whose group (rows sharing a `group_id`) contains every
# one of the required fruit values in its `value` column.
#
# NOTE(review): `df` and `display` are not defined in this snippet; presumably
# they are supplied by the surrounding notebook environment -- confirm.

REQUIRED_FRUITS = ('banana', 'orange')
FLAG_COLUMN = 'has_all_fruits'

# Window that partitions the rows by `group_id`.
w = Window.partitionBy('group_id')

# Array of the `value` entries collected over the current row's window.
values_in_group = collect_list('value').over(w)

# One membership test per required fruit, AND-ed into a single boolean column.
membership_checks = [
    array_contains(values_in_group, fruit)
    for fruit in REQUIRED_FRUITS
]
filter_expression = reduce(Column.__and__, membership_checks)

# Drop rows without a group first, then flag each row with the group-level
# result, keep the flagged rows, and remove the helper column again.
rows_with_group = df.filter(col('group_id').isNotNull())
flagged_rows = rows_with_group.withColumn(FLAG_COLUMN, filter_expression)
matching_rows = flagged_rows.filter(col(FLAG_COLUMN)).drop(FLAG_COLUMN)

display(matching_rows)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment