Created
December 3, 2021 05:18
-
-
Save fozziethebeat/8712e1e84b71b94206320489019fe6ce to your computer and use it in GitHub Desktop.
Minimal Beam CAN Pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requires:
#   pip install apache-beam
#   pip install apache-beam[dataframe]
#
# Associated documentation
# Beam Dataframe API: https://beam.apache.org/releases/pydoc/2.34.0/apache_beam.dataframe.html
# Beam Dataframe Overview: https://beam.apache.org/documentation/dsls/dataframes/overview/
# Beam Dataframe Differences: https://beam.apache.org/documentation/dsls/dataframes/differences-from-pandas/
import apache_beam as beam | |
from apache_beam.dataframe.io import read_parquet | |
from apache_beam.options.pipeline_options import PipelineOptions | |
def make_bucketed_demographics(row):
    """Join the ``age`` and ``race`` fields into one ``age;race`` key.

    Args:
        row: Any object indexable by the keys ``'age'`` and ``'race'`` whose
            values support ``+`` with a string. NOTE(review): when this
            function is passed to ``assign`` (as the pipeline in this file
            does), pandas-style semantics hand it the whole frame, so the
            concatenation would run column-wise -- confirm against the Beam
            DataFrame docs.

    Returns:
        The ``age`` value, a ``';'`` separator, and the ``race`` value
        concatenated with ``+``.
    """
    # Plain `+` is kept on purpose: it works for both scalar strings and
    # string columns, which string formatting would not.
    return row['age'] + ';' + row['race']
# Read a Parquet file, add a combined demographics column, and write the
# result back out as Parquet.
with beam.Pipeline(options=PipelineOptions()) as pipeline:
    # NOTE(review): `data_file` is not defined in the visible source; it must
    # be bound to the input Parquet path before this statement runs.
    timeseries = pipeline | read_parquet(data_file)
    # `assign` returns a new frame instead of mutating in place, so the result
    # has to be kept -- otherwise the `bucketed` column is never written.
    timeseries = timeseries.assign(bucketed=make_bucketed_demographics)
    timeseries.to_parquet('out.parquet')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment