diogobaltazar · December 16, 2019 14:46
diff --git a/test.py b/test.py
 > df = spark.createDataFrame( 
    [(1, 0), (3, 0)],
    ("a", "b")
 )
 > transf_column(df, F.col('a') + F.col('a'), 'a').show()
 +---+---+
 |  a|  b|
 +---+---+
 |  2|  0|
 |  6|  0|
 +---+---+
diff --git a/transf_ds.py b/transf_ds.py
 def transf_dataset(dataset, transform, col_name, filtering_condition=False):
     """Replace column ``col_name`` with ``transform`` on the rows that match.

     Rows for which ``filtering_condition`` is true get ``col_name``
     overwritten by the ``transform`` expression; every other row is
     returned unchanged. The result has the same columns, in the same
     order, as ``dataset``.

     Args:
         dataset: Spark DataFrame to transform.
         transform: Column expression producing the new value of
             ``col_name``.
         col_name: Name of the column to overwrite. If it is not one of
             ``dataset.columns`` the data is returned unchanged, because
             the final ``select`` keeps only the original columns.
         filtering_condition: Boolean Column selecting the rows to
             transform. A plain Python ``bool`` (including the default) or
             ``None`` cannot be evaluated per row, so it is treated as
             "no condition" and ``transform`` is applied to every row.
             NOTE(review): this reading of the default is an assumption;
             previously the default raised ``TypeError`` -- confirm it
             matches what callers expect.

     Returns:
         A new DataFrame. When a Column condition is given, the untouched
         rows come first in the union and the transformed rows follow, so
         the input row order is not kept.
     """
     columns = dataset.columns

     def _transformed(rows):
         # withColumn replaces an existing column of the same name, so no
         # temporary "<col_name>_new" column (which could collide with a
         # real column) is needed; select() pins the original column order
         # so both sides of the positional union line up.
         return rows.withColumn(col_name, transform).select(columns)

     # `~False` is the int -1 in Python and DataFrame.filter rejects
     # non-Column/str arguments, so bools must be handled before filtering.
     if filtering_condition is None or isinstance(filtering_condition, bool):
         return _transformed(dataset)

     matched = dataset.filter(filtering_condition)

     # A NULL condition is neither true nor false: without the explicit
     # isNull() branch those rows would fall out of both filters and vanish.
     untouched = dataset.filter(
         (~filtering_condition) | filtering_condition.isNull()
     )

     return untouched.union(_transformed(matched))
	> df = spark.createDataFrame(
	    [(1, 0), (3, 0)],
	    ("a", "b")
	)
	> transf_column(df, F.col('a') + F.col('a'), 'a').show()
	+---+---+
	|  a|  b|
	+---+---+
	|  2|  0|
	|  6|  0|
	+---+---+
	def transf_dataset(dataset, transform, col_name, filtering_condition=False):
	    """Replace column ``col_name`` with ``transform`` on the rows that match.

	    Rows for which ``filtering_condition`` is true get ``col_name``
	    overwritten by the ``transform`` expression; every other row is
	    returned unchanged. The result has the same columns, in the same
	    order, as ``dataset``.

	    Args:
	        dataset: Spark DataFrame to transform.
	        transform: Column expression producing the new value of
	            ``col_name``.
	        col_name: Name of the column to overwrite. If it is not one of
	            ``dataset.columns`` the data is returned unchanged, because
	            the final ``select`` keeps only the original columns.
	        filtering_condition: Boolean Column selecting the rows to
	            transform. A plain Python ``bool`` (including the default) or
	            ``None`` cannot be evaluated per row, so it is treated as
	            "no condition" and ``transform`` is applied to every row.
	            NOTE(review): this reading of the default is an assumption;
	            previously the default raised ``TypeError`` -- confirm it
	            matches what callers expect.

	    Returns:
	        A new DataFrame. When a Column condition is given, the untouched
	        rows come first in the union and the transformed rows follow, so
	        the input row order is not kept.
	    """
	    columns = dataset.columns

	    def _transformed(rows):
	        # withColumn replaces an existing column of the same name, so no
	        # temporary "<col_name>_new" column (which could collide with a
	        # real column) is needed; select() pins the original column order
	        # so both sides of the positional union line up.
	        return rows.withColumn(col_name, transform).select(columns)

	    # `~False` is the int -1 in Python and DataFrame.filter rejects
	    # non-Column/str arguments, so bools must be handled before filtering.
	    if filtering_condition is None or isinstance(filtering_condition, bool):
	        return _transformed(dataset)

	    matched = dataset.filter(filtering_condition)

	    # A NULL condition is neither true nor false: without the explicit
	    # isNull() branch those rows would fall out of both filters and vanish.
	    untouched = dataset.filter(
	        (~filtering_condition) | filtering_condition.isNull()
	    )

	    return untouched.union(_transformed(matched))