fozziethebeat · December 3, 2021 05:18
diff --git a/minimal_can_beam_pipeline.py b/minimal_can_beam_pipeline.py
 # Requires
 #  pip install apache-beam
 #  pip install apache-beam[dataframe]
 #
 # Associated documentation
 #   Beam Dataframe API: https://beam.apache.org/releases/pydoc/2.34.0/apache_beam.dataframe.html
 #   Beam Dataframe Overview: https://beam.apache.org/documentation/dsls/dataframes/overview/
 #   Beam Dataframe Differences: https://beam.apache.org/documentation/dsls/dataframes/differences-from-pandas/


 import apache_beam as beam
 from apache_beam.dataframe.io import read_parquet
 from apache_beam.options.pipeline_options import PipelineOptions

 def make_bucketed_demographics(row):
     """Return the combined demographic label ``age;race``.

     Looks up the ``'age'`` and ``'race'`` entries of ``row`` and joins them
     with a ``';'`` separator using ``+``.

     NOTE(review): the pipeline below passes this function to ``assign`` as a
     callable, so ``row`` is presumably the whole frame rather than a single
     record -- confirm against the DataFrame API.
     """
     age_with_separator = row['age'] + ';'
     return age_with_separator + row['race']
  
 # NOTE(review): `data_file` is not defined anywhere in the visible snippet;
 # it must be bound to the input Parquet path before this block runs.
 with beam.Pipeline(options=PipelineOptions()) as pipeline:
     # Read the input Parquet data through the Beam DataFrame API.
     timeseries = pipeline | read_parquet(data_file)
     # assign() returns a new frame instead of modifying the receiver, so the
     # result must be kept; otherwise 'bucketed' never reaches the output.
     timeseries = timeseries.assign(bucketed=make_bucketed_demographics)
     timeseries.to_parquet('out.parquet')
	# Requires
	# pip install apache-beam
	# pip install apache-beam[dataframe]
	#
	# Associated documentation
	# Beam Dataframe API: https://beam.apache.org/releases/pydoc/2.34.0/apache_beam.dataframe.html
	# Beam Dataframe Overview: https://beam.apache.org/documentation/dsls/dataframes/overview/
	# Beam Dataframe Differences: https://beam.apache.org/documentation/dsls/dataframes/differences-from-pandas/


	import apache_beam as beam
	from apache_beam.dataframe.io import read_parquet
	from apache_beam.options.pipeline_options import PipelineOptions

	def make_bucketed_demographics(row):
	    """Return the combined demographic label ``age;race``.

	    Looks up the ``'age'`` and ``'race'`` entries of ``row`` and joins them
	    with a ``';'`` separator using ``+``.
	    """
	    # The body must be indented under ``def``; it previously sat at the
	    # same level as the header, which is an IndentationError.
	    return row['age'] + ';' + row['race']

	# NOTE(review): `data_file` is not defined anywhere in the visible snippet;
	# it must be bound to the input Parquet path before this block runs.
	with beam.Pipeline(options=PipelineOptions()) as pipeline:
	    # Read the input Parquet data through the Beam DataFrame API. The pipe
	    # operator is a plain `|`; the backslash-escaped form was a markdown
	    # artifact and not valid Python.
	    timeseries = pipeline | read_parquet(data_file)
	    # assign() returns a new frame instead of modifying the receiver, so the
	    # result must be kept; otherwise 'bucketed' never reaches the output.
	    timeseries = timeseries.assign(bucketed=make_bucketed_demographics)
	    timeseries.to_parquet('out.parquet')
No results found