Skip to content

Instantly share code, notes, and snippets.

@tommydangerous
Created May 13, 2021 03:28
Show Gist options
  • Select an option

  • Save tommydangerous/c0d6364226535115de1b8aab4b4bfe13 to your computer and use it in GitHub Desktop.

Select an option

Save tommydangerous/c0d6364226535115de1b8aab4b4bfe13 to your computer and use it in GitHub Desktop.
PySpark example 3
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import (
IntegerType,
StringType,
StructField,
StructType,
)
"""
StructField arguments:
First argument: column name
Second argument: column type
Third argument: True if this column can have null values
"""
# Output schema of custom_transformation_function. The field names and their
# order mirror the DataFrame that function returns.
# NOTE(review): 'date' is declared as StringType on the assumption that the
# incoming 'date' column holds strings -- confirm against the source data and
# switch to DateType/TimestampType if it does not.
SCHEMA_COMING_SOON = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('date', StringType(), True),
    StructField('number_of_rows', IntegerType(), True),
])


@pandas_udf(
    SCHEMA_COMING_SOON,
    PandasUDFType.GROUPED_MAP,
)
def custom_transformation_function(df):
    """Count the rows for each distinct date in one group of rows.

    Args:
        df: pandas DataFrame holding the rows of a single group. It must
            contain the columns 'user_id' and 'date'.

    Returns:
        A pandas DataFrame with the columns 'user_id', 'date' and
        'number_of_rows' (in that order, matching SCHEMA_COMING_SOON),
        one row per distinct value of 'date' in ``df``.
    """
    # groupby().size() yields a Series indexed by date; reset_index(name=...)
    # turns it into a DataFrame with 'date' and 'number_of_rows' columns.
    number_of_rows_by_date = (
        df.groupby('date').size().reset_index(name='number_of_rows')
    )
    # Take the first row's user_id as a scalar so it is broadcast to every
    # output row. Presumably the caller groups by user_id, so all rows in
    # ``df`` share this value -- verify against the groupby() call site.
    number_of_rows_by_date['user_id'] = df['user_id'].iloc[0]
    # Return the columns in the same order as the declared schema.
    return number_of_rows_by_date[['user_id', 'date', 'number_of_rows']]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment