tuulos · August 1, 2024 07:43
diff --git a/metaflow-cheat-sheet.d2 b/metaflow-cheat-sheet.d2
 # Shared style classes. `invisible` makes a shape fully transparent; the grid
 # boxes below assign it to their placeholder shapes (foo1, foo2, foo3).
 classes: {
  invisible: {
    label: a
    style: {
      opacity: 0
    }
  }
 }

 frame: {
  *: {
    style.font-size: 40
  }
  label: _
  style.font-color: white
  grid-rows: 2
  *: {
    style.stroke-width: 0
    style.fill: white
  }
  style.fill: white
  style.stroke-width: 0
  ic: {
    style.font-color: white
    shape: image
    icon: https://metaflow.org/images/metaflow.svg
    icon.near: outside-top-center
    style.fill: white
  }
  Cheat Sheet: {
    label: Metaflow Cheat Sheet 0.1
    grid-rows: 2
    grid-columns: 2
    *: {
      style.font-size: 30
    }

    Flow Structures: {
      *: {
        style.font-size: 30
      }
      grid-rows: 3
      # Static branch: one step fans out to two different steps.
      Task Parallelism: {
        style.fill: white
        code: |python
          # Process two or more functions
          # concurrently
          self.next(self.a, self.b)
        |
        code.style.stroke-width: 0
        code.style.font-size: 20
        # 2x3 grid; the foo* shapes use the `invisible` class and act as
        # placeholders between the visible steps.
        box: {
          style.opacity: 0
          grid-rows: 2
          grid-columns: 3
          foo1.class: invisible
          step
          foo2.class: invisible
          step a
          foo3.class: invisible
          step b
          step -> step a: {
            style.animated: true
          }
          step -> step b: {
            style.animated: true
          }
        }
      }
      # Foreach branch: one step fans out over the elements of a list.
      Data Parallelism: {
        style.fill: white
        code: |python
          # Process the elements of a list
          # concurrently
          self.mylist = ['A', 'B']
          self.next(self.a, foreach='mylist')
        |
        code.style.stroke-width: 0
        code.style.font-size: 20
        box: {
          style.opacity: 0
          grid-rows: 2
          step
          # `multiple` marks the target step as one of many.
          step a.style.multiple: true
          step -> step a: {style.animated: true}
        }
      }
      # num_parallel branch: parallel tasks that also talk to each other
      # (note the bidirectional edge between step a/1 and step a/2).
      Distributed Computing: {
        style.fill: white
        code: |python
          # Set up an ephemeral cluster
          # for distributed computing
          self.next(self.a, num_parallel=N)
          @pytorch, @ray, @mpi, ...
        |
        code.style.stroke-width: 0
        code.style.font-size: 20
        # 2x3 grid; the foo* shapes use the `invisible` class and act as
        # placeholders between the visible steps.
        box: {
          style.opacity: 0
          grid-rows: 2
          grid-columns: 3
          foo1.class: invisible
          step
          foo2.class: invisible
          step a/1
          foo3.class: invisible
          step a/2
          step -> step a/1: {
            style.animated: true
          }
          step -> step a/2: {
            style.animated: true
          }
          step a/1 <-> step a/2: {
            style.animated: true
          }
        }
      }
    }
    Decorators: {
      grid-rows: 5
      *: {
        style.font-size: 20
      }
      # Decorators that select where a step runs and what resources it requests.
      Cloud Compute: {
        style.fill: white
        style.font-size: 30
        code1: |python
          @batch       # Run the step on AWS Batch
          @kubernetes  # Run the step on Kubernetes
          @resources   # Specify resource requirements for the step 
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
      # Decorators for declaring package dependencies. The plain variants are
      # step-level; the *_base variants are flow-level.
      Dependency Management: {
        style: {
          fill: white
          font-size: 30
        }
        code1: |python
          @conda       # Specify dependencies for the step with Conda
          @conda_base  # Specify dependencies for the flow with Conda
          @pypi        # Specify dependencies for the step with Pip
          @pypi_base   # Specify dependencies for the flow with Pip
        |
        # Applies to every child of this section (here: the code block).
        *.style: {
          stroke-width: 0
          font-size: 20
        }
      }
      # Decorators for handling step failures and runaway steps.
      Making Flows Reliable: {
        style.fill: white
        style.font-size: 30
        code1: |python
          @retry       # Retry the step after a failure
          @catch       # Let the flow continue even if the step fails
          @timeout     # Interrupt the step after the specified time
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
      # Decorators for environment variables, secrets and result cards.
      Configuring Step Environment: {
        style.fill: white
        style.font-size: 30
        code1: |python
          @environment # Specify environment variables for the step
          @secrets     # Fetch secrets from a secrets manager for the step
          @card        # Visualize the step results
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
      # Flow-level decorators for namespacing and scheduling/triggering.
      Scheduling a production deployment: {
        style.fill: white
        style.font-size: 30
        code1: |python
          @project     # Enable branched namespaces
          @schedule    # Schedule the flow to run at a specified time
          @trigger     # Schedule the flow to run when an event is received
          @trigger_on_finish # Schedule the flow to run when another flow completes
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
    }
    Notebook Tips: {
      grid-rows: 5
      grid-columns: 1
      *: {
        style.font-size: 20
      }
      # Client API snippets for locating the current user's own runs.
      Find my past runs: {
        style.fill: white
        style.font-size: 30
        code: |python
          # Fetch the results of my latest HelloFlow run
          run = Flow('HelloFlow').latest_run
          # Fetch the second newest run of HelloFlow by me
          run = list(Flow('HelloFlow'))[1]
          # Fetch the latest run by me tagged as 'goodmodel'
          run = list(Flow('HelloFlow').runs('goodmodel'))[0]
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
      # Client API snippets that switch namespace to reach other users' runs.
      Find runs by others: {
        style.fill: white
        style.font-size: 30
        code: |python
          # Fetch the results of Alice's latest HelloFlow run
          namespace('user:alice')
          run = Flow('HelloFlow').latest_run
          # Fetch the results of a specific run, produced by anyone            
          namespace(None)
          run = Run('HelloFlow/323')
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }

      # Client API snippets for reading artifacts, logs and cards of a run.
      Fetching Results: {
        style: {
          fill: white
          font-size: 30
        }
        code: |python
          # Fetch an artifact from a specific step (fast!)
          model = run['train'].task['model'].data
          # Fetch all artifacts from a run (loads everything)                             
          model = run.data.model
          # Inspect logs of a specific step
          run['train'].task.stdout
          # Inspect cards of a specific step
          from metaflow.cards import get_cards
          get_cards(run['train'].task)
        |
        # Applies to every child of this section (here: the code block).
        *.style: {
          stroke-width: 0
          font-size: 20
        }
      }

      # Defining a flow in a notebook cell and executing it with NBRunner.
      Running a flow in a notebook: {
        style.fill: white
        style.font-size: 30
        code: |python
          # in a cell define a flow
          from metaflow import FlowSpec, step, NBRunner
          class HelloFlow(FlowSpec):
             ...
          # and run it!
          run = NBRunner(HelloFlow).nbrun()
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
      # Publishing an event (with a parameter payload) from Python.
      Trigger a production flow via an event: {
        style.fill: white
        style.font-size: 30
        code: |python
          from metaflow.integrations import ArgoEvent
          # Trigger all flows waiting for my_event, set parameter alpha=16
          ArgoEvent('my_event').publish({'alpha': 16})
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
    }
    Running and deploying: {
      grid-rows: 5
      grid-columns: 1
      *: {
        style.font-size: 20
      }
      # Command-line snippets for running, resuming and inspecting a flow.
      Local development on the CLI: {
        style.fill: white
        style.font-size: 30
        code: |bash
          # Run a flow locally, setting parameters
          python myflow.py run --alpha 16 --country 'South Korea'
          # Run a flow on Kubernetes, scheduling at most 64 pods in parallel.      
          python myflow.py run --with kubernetes --max-workers 64
          # Resume the latest run, skipping over successful steps
          python myflow.py resume
          # Resume a specific step of a specific run
          python myflow.py resume train --origin-run-id 1234
          # See the latest card in the train step
          python myflow.py card view train
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
      # Runner API snippet. The embedded Python must be copy-pasteable: the
      # parentheses are balanced and the Run object is read from `running.run`,
      # the only handle the `with` statement binds.
      Run a flow programmatically: {
        style: {
          fill: white
          font-size: 30
        }
        code: |python
          # Set a parameter, run a flow, and wait for its completion
          from metaflow import Runner
          with Runner('helloflow.py').run(alpha=16) as running:
              print(f'{running.run} finished')
              model = running.run.data.model
        |
        # Applies to every child of this section (here: the code block).
        *.style: {
          stroke-width: 0
          font-size: 20
        }
      }
      # One `create` command per supported production orchestrator.
      Deploy to a production orchestrator: {
        style.fill: white
        style.font-size: 30
        code: |bash
          # Deploy to Argo Workflows
          python helloflow.py argo-workflows create
          # Deploy to Step Functions
          python helloflow.py step-functions create
          # Deploy to Airflow
          python helloflow.py airflow create
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
      # Branch vs. production deployments of a flow that uses @project.
      Deploy an isolated branch deployment: {
        style.fill: white
        style.font-size: 30
        code: |bash
          # Add @project, then deploy a branch
          python helloflow.py --branch new_model argo-workflows create
          # Add @project, then deploy to main production
          python helloflow.py --production argo-workflows create
        |
        # Applies to every child of this section (here: the code block).
        *.style.stroke-width: 0
        *.style.font-size: 20
      }
      # Deployer API snippet. The code now performs all three actions its
      # leading comment lists: deploy, trigger, access the run.
      Deploy a flow programmatically: {
        style: {
          fill: white
          font-size: 30
        }
        code: |python
          # Deploy a flow, trigger it, and access the run object
          from metaflow import Deployer
          deployer = Deployer('helloflow.py')
          deployed_flow = deployer.argo_workflows().create()
          triggered_run = deployed_flow.trigger()
          run = triggered_run.run  # None until the run starts
        |
        # Applies to every child of this section (here: the code block).
        *.style: {
          stroke-width: 0
          font-size: 20
        }
      }
    }
  }
 }
	classes: {
	invisible: {
	style.opacity: 0
	label: a
	}
	}

	frame: {
	*: {
	style.font-size: 40
	}
	label: _
	style.font-color: white
	grid-rows: 2
	*: {
	style.stroke-width: 0
	style.fill: white
	}
	style.fill: white
	style.stroke-width: 0
	ic: {
	style.font-color: white
	shape: image
	icon: https://metaflow.org/images/metaflow.svg
	icon.near: outside-top-center
	style.fill: white
	}
	Cheat Sheet: {
	label: Metaflow Cheat Sheet 0.1
	grid-rows: 2
	grid-columns: 2
	*: {
	style.font-size: 30
	}

	Flow Structures: {
	*: {
	style.font-size: 30
	}
	grid-rows: 3
	Task Parallelism: {
	style: {
	fill: white
	}
	code: \|python
	# Process two or more functions
	# concurrently
	self.next(self.a, self.b)
	\|
	code.style: {
	stroke-width: 0
	font-size: 20
	}
	box: {
	style.opacity: 0
	grid-rows: 2
	grid-columns: 3
	foo1.class: invisible
	step
	foo2.class: invisible
	step a
	foo3.class: invisible
	step b
	step -> step a: {style.animated: true}
	step -> step b: {style.animated: true}
	}
	}
	Data Parallelism: {
	style: {
	fill: white
	}
	code: \|python
	# Process the elements of a list
	# concurrently
	self.mylist = ['A', 'B']
	self.next(self.a, foreach='mylist')
	\|
	code.style: {
	stroke-width: 0
	font-size: 20
	}
	box: {
	style.opacity: 0
	grid-rows: 2
	step
	step a: {
	style.multiple: true
	}
	step -> step a: {
	style.animated: true
	}
	}
	}
	Distributed Computing: {
	style: {
	fill: white
	}
	code: \|python
	# Set up an ephemeral cluster
	# for distributed computing
	self.next(self.a, num_parallel=N)
	@pytorch, @ray, @mpi, ...
	\|
	code.style: {
	stroke-width: 0
	font-size: 20
	}
	box: {
	style.opacity: 0
	grid-rows: 2
	grid-columns: 3
	foo1.class: invisible
	step
	foo2.class: invisible
	step a/1
	foo3.class: invisible
	step a/2
	step -> step a/1: {style.animated: true}
	step -> step a/2: {style.animated: true}
	step a/1 <-> step a/2: {style.animated: true}
	}
	}
	}
	Decorators: {
	grid-rows: 5
	*: {
	style.font-size: 20
	}
	Cloud Compute: {
	style: {
	fill: white
	font-size: 30
	}
	code1: \|python
	@batch # Run the step on AWS Batch
	@kubernetes # Run the step on Kubernetes
	@resources # Specify resource requirements for the step
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	# Decorators for declaring package dependencies. The plain variants are
	# step-level; the *_base variants are flow-level.
	Dependency Management: {
	style: {
	fill: white
	font-size: 30
	}
	code1: \|python
	@conda # Specify dependencies for the step with Conda
	@conda_base # Specify dependencies for the flow with Conda
	@pypi # Specify dependencies for the step with Pip
	@pypi_base # Specify dependencies for the flow with Pip
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	Making Flows Reliable: {
	style: {
	fill: white
	font-size: 30
	}
	code1: \|python
	@retry # Retry the step after a failure
	@catch # Let the flow continue even if the step fails
	@timeout # Interrupt the step after the specified time
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	Configuring Step Environment: {
	style: {
	fill: white
	font-size: 30
	}
	code1: \|python
	@environment # Specify environment variables for the step
	@secrets # Fetch secrets from a secrets manager for the step
	@card # Visualize the step results
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	Scheduling a production deployment: {
	style: {
	fill: white
	font-size: 30
	}
	code1: \|python
	@project # Enable branched namespaces
	@schedule # Schedule the flow to run at a specified time
	@trigger # Schedule the flow to run when an event is received
	@trigger_on_finish # Schedule the flow to run when another flow completes
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	}
	Notebook Tips: {
	grid-rows: 5
	grid-columns: 1
	*: {
	style.font-size: 20
	}
	Find my past runs: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|python
	# Fetch the results of my latest HelloFlow run
	run = Flow('HelloFlow').latest_run
	# Fetch the second newest run of HelloFlow by me
	run = list(Flow('HelloFlow'))[1]
	# Fetch the latest run by me tagged as 'goodmodel'
	run = list(Flow('HelloFlow').runs('goodmodel'))[0]
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	Find runs by others: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|python
	# Fetch the results of Alice's latest HelloFlow run
	namespace('user:alice')
	run = Flow('HelloFlow').latest_run
	# Fetch the results of a specific run, produced by anyone
	namespace(None)
	run = Run('HelloFlow/323')
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}

	# Client API snippets for reading artifacts, logs and cards of a run.
	Fetching Results: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|python
	# Fetch an artifact from a specific step (fast!)
	model = run['train'].task['model'].data
	# Fetch all artifacts from a run (loads everything)
	model = run.data.model
	# Inspect logs of a specific step
	run['train'].task.stdout
	# Inspect cards of a specific step
	from metaflow.cards import get_cards
	get_cards(run['train'].task)
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}

	# Defining a flow in a notebook cell and executing it with NBRunner.
	# The class body placeholder is indented so the Python snippet stays valid.
	Running a flow in a notebook: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|python
	# in a cell define a flow
	from metaflow import FlowSpec, step, NBRunner
	class HelloFlow(FlowSpec):
	   ...
	# and run it!
	run = NBRunner(HelloFlow).nbrun()
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	Trigger a production flow via an event: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|python
	from metaflow.integrations import ArgoEvent
	# Trigger all flows waiting for my_event, set parameter alpha=16
	ArgoEvent('my_event').publish({'alpha': 16})
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	}
	Running and deploying: {
	grid-rows: 5
	grid-columns: 1
	*: {
	style.font-size: 20
	}
	Local development on the CLI: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|bash
	# Run a flow locally, setting parameters
	python myflow.py run --alpha 16 --country 'South Korea'
	# Run a flow on Kubernetes, scheduling at most 64 pods in parallel.
	python myflow.py run --with kubernetes --max-workers 64
	# Resume the latest run, skipping over successful steps
	python myflow.py resume
	# Resume a specific step of a specific run
	python myflow.py resume train --origin-run-id 1234
	# See the latest card in the train step
	python myflow.py card view train
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	# Runner API snippet. The embedded Python must be copy-pasteable: the
	# parentheses are balanced, the `with` body is indented, and the Run object
	# is read from `running.run`, the only handle the `with` statement binds.
	Run a flow programmatically: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|python
	# Set a parameter, run a flow, and wait for its completion
	from metaflow import Runner
	with Runner('helloflow.py').run(alpha=16) as running:
	    print(f'{running.run} finished')
	    model = running.run.data.model
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	Deploy to a production orchestrator: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|bash
	# Deploy to Argo Workflows
	python helloflow.py argo-workflows create
	# Deploy to Step Functions
	python helloflow.py step-functions create
	# Deploy to Airflow
	python helloflow.py airflow create
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	Deploy an isolated branch deployment: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|bash
	# Add @project, then deploy a branch
	python helloflow.py --branch new_model argo-workflows create
	# Add @project, then deploy to main production
	python helloflow.py --production argo-workflows create
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	# Deployer API snippet. The code now performs all three actions its
	# leading comment lists: deploy, trigger, access the run.
	Deploy a flow programmatically: {
	style: {
	fill: white
	font-size: 30
	}
	code: \|python
	# Deploy a flow, trigger it, and access the run object
	from metaflow import Deployer
	deployer = Deployer('helloflow.py')
	deployed_flow = deployer.argo_workflows().create()
	triggered_run = deployed_flow.trigger()
	run = triggered_run.run  # None until the run starts
	\|
	*.style: {
	stroke-width: 0
	font-size: 20
	}
	}
	}
	}
	}