This example shows a skeleton for how to build a Dagster project that extracts tables from SQL Server, stores the extract as a CSV in GCS, and then uploads the GCS extract to BigQuery.
The actual extract and load logic is omitted; the purpose of this project is to show how such a pipeline can be represented as Dagster assets.
First, a single pipeline for one table is created. This is demonstrated in the file `dagster_mock_one_table.py`. To run this example:

- Create a Python virtual environment and then run:

  ```shell
  pip install dagster dagster-webserver
  ```

- Copy the contents of `dagster_mock_one_table.py` to a file with the same name locally, then run:

  ```shell
  dagster dev -f dagster_mock_one_table.py
  ```
The result in Dagster's webserver looks like this:

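For reference, here is a minimal sketch of how one table's extract-and-load steps can be modeled as Dagster assets. This is not the contents of `dagster_mock_one_table.py`; the asset and table names (`orders_gcs_extract`, `orders_bq_table`) are illustrative, and the extract/load logic is mocked out:

```python
import dagster as dg


@dg.asset
def orders_gcs_extract() -> None:
    # Mock: extract the `orders` table from SQL Server and write it as a CSV to GCS.
    pass


@dg.asset(deps=[orders_gcs_extract])
def orders_bq_table() -> None:
    # Mock: load the GCS CSV into a BigQuery table.
    pass


defs = dg.Definitions(assets=[orders_gcs_extract, orders_bq_table])
```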
The second example, `dagster_mock_many_tables.py`, shows how to build on the first example by creating an asset factory that dynamically generates assets for each table. Follow the same steps as above, then run:

```shell
dagster dev -f dagster_mock_many_tables.py
```
The result:

The run logs for a run that targets all of these assets:

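For illustration, an asset factory along these lines could look like the following sketch. This is not the contents of `dagster_mock_many_tables.py`; the table list and asset names are hypothetical, and the extract/load logic is mocked out:

```python
import dagster as dg

# Hypothetical list of SQL Server tables to replicate.
TABLES = ["orders", "customers", "products"]


def build_table_assets(table: str) -> list[dg.AssetsDefinition]:
    @dg.asset(name=f"{table}_gcs_extract")
    def gcs_extract() -> None:
        # Mock: extract `table` from SQL Server and write it as a CSV to GCS.
        pass

    @dg.asset(name=f"{table}_bq_table", deps=[gcs_extract])
    def bq_table() -> None:
        # Mock: load the GCS CSV for `table` into a BigQuery table.
        pass

    return [gcs_extract, bq_table]


all_assets = [asset for table in TABLES for asset in build_table_assets(table)]
defs = dg.Definitions(assets=all_assets)
```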
With Dagster, you get an operational lineage graph that shows exactly which data assets (GCS extracts, BigQuery tables) are operated on during each run. This example only scratches the surface; Dagster also makes it easy to (a few of these are sketched below):

- run incremental data loads using partitions
- run pipelines in response to events (e.g., new data landing in SQL Server) instead of only on a schedule
- run individual assets at different cadences, or automatically, to propagate data changes throughout your platform
- run data quality checks
- alert on failures
- attempt automatic retries
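
As an example of a few of these features, here is a minimal sketch (using the same illustrative `orders` asset as above, not code from this project) that combines a daily-partitioned extract, automatic retries, and a simple data quality check:

```python
import dagster as dg

daily = dg.DailyPartitionsDefinition(start_date="2024-01-01")


@dg.asset(partitions_def=daily, retry_policy=dg.RetryPolicy(max_retries=3))
def orders_gcs_extract(context: dg.AssetExecutionContext) -> None:
    # Mock: extract only the rows for the partition's date from SQL Server.
    context.log.info(f"Extracting orders for {context.partition_key}")


@dg.asset_check(asset=orders_gcs_extract)
def orders_extract_not_empty() -> dg.AssetCheckResult:
    # Mock data quality check: in a real pipeline, verify the extract has rows.
    return dg.AssetCheckResult(passed=True)


defs = dg.Definitions(
    assets=[orders_gcs_extract],
    asset_checks=[orders_extract_not_empty],
)
```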