Skip to content

Instantly share code, notes, and snippets.

@slash-cyberpunk
Last active September 24, 2024 09:43
Show Gist options
  • Select an option

  • Save slash-cyberpunk/db2ae28586567036d65d50f6317feb47 to your computer and use it in GitHub Desktop.

Select an option

Save slash-cyberpunk/db2ae28586567036d65d50f6317feb47 to your computer and use it in GitHub Desktop.
Config for statsd_exporter v0.22.7 and Airflow 2.10.1
# statsd_exporter mapping rules: each entry maps StatsD metric names that
# match `match` onto the Prometheus metric `name`, with captures ($1, $2, ...)
# exposed as labels.
mappings:
# Job lifecycle events: $1 is the job name, $2 the start/end/heartbeat_failure
# suffix taken from the end of the metric name.
- match: 'airflow\.(.+)_(start|end|heartbeat_failure)$'
  match_type: regex
  name: "airflow_jobs"
  help: "Number of status jobs"
  labels:
    job_name: "$1"
    status: "$2"
# LocalTaskJob exits: the four wildcard components become, in order,
# job_id, dag_id, task_id and return_code.
- match: "airflow.local_task_job.task_exit.*.*.*.*"
  name: "airflow_local_task_job"
  help: "Number of LocalTaskJob terminations with a return_code while running a task of a DAG."
  labels:
    job_id: "$1"
    dag_id: "$2"
    task_id: "$3"
    return_code: "$4"
# Per-operator outcome: $1 is failures/successes, $2 the operator name.
- match: 'airflow\.operator_(failures|successes)_(.+$)'
  match_type: regex
  name: "airflow_operators"
  help: "Operator status"
  labels:
    status: "$1"
    operator_name: "$2"
# Overall task-instance outcome: $1 is failures or successes.
- match: 'airflow\.ti_(failures|successes)'
  match_type: regex
  name: "airflow_task_instances"
  help: "Overall task instances 'type'"
  labels:
    type: "$1"
# Plain one-to-one renames (no labels): scheduler-level counters.
- match: "airflow.previously_succeeded"
  name: "airflow_previously_succeeded"
  help: "Number of previously succeeded task instances"
- match: "airflow.zombies_killed"
  name: "airflow_zombies_killed"
  help: "Zombie tasks killed"
- match: "airflow.scheduler_heartbeat"
  name: "airflow_scheduler_heartbeat"
  help: "Scheduler heartbeats"
# Plain one-to-one renames (no labels): DAG file processing.
- match: "airflow.dag_processing.processes"
  name: "airflow_dag_processing_processes"
  help: "Number of currently running DAG parsing processes"
- match: "airflow.dag_processing.processor_timeouts"
  name: "airflow_dag_processing_processor_timeouts"
  help: "Number of file processors that have been killed due to taking too long. Metric with file_path tagging."
- match: "airflow.dag_processing.sla_callback_count"
  name: "airflow_dag_processing_sla_callback_count"
  help: "Number of SLA callbacks received"
- match: "airflow.dag_processing.other_callback_count"
  name: "airflow_dag_processing_other_callback_count"
  help: "Number of non-SLA callbacks received"
- match: "airflow.dag_processing.file_path_queue_update_count"
  name: "airflow_dag_processing_file_path_queue_update_count"
  help: "Number of times we have scanned the filesystem and queued all existing dags"
- match: "airflow.dag_processing.manager_stalls"
  name: "airflow_dag_processing_manager_stalls"
  help: "Number of stalled DagFileProcessorManager"
- match: "airflow.dag_file_refresh_error"
  name: "airflow_dag_file_refresh_error"
  help: "Number of failures loading any DAG files"
# Scheduler task metrics: the trailing wildcard component becomes `status`.
- match: "airflow.scheduler.tasks.*"
  name: "airflow_scheduler_tasks"
  help: "Number of tasks 'status' in scheduler"
  labels:
    status: "$1"
- match: "airflow.scheduler.orphaned_tasks.*"
  name: "airflow_scheduler_orphaned_tasks"
  help: "Number of Orphaned tasks cleared or adopted by the Scheduler"
  labels:
    status: "$1"
- match: "airflow.scheduler.critical_section_busy"
  name: "airflow_scheduler_critical_section_busy"
  help: "Count of times a scheduler process tried to get a lock on the critical section (needed to send tasks to the executor) and found it locked by another process."
# SLA metrics, renamed one-to-one without labels.
- match: "airflow.sla_missed"
  name: "airflow_sla_missed"
  help: "Number of SLA misses. Metric with dag_id and task_id tagging."
- match: "airflow.sla_callback_notification_failure"
  name: "airflow_sla_callback_notification_failure"
  help: "Number of failed SLA miss callback notification attempts. Metric with dag_id and func_name tagging."
- match: "airflow.sla_email_notification_failure"
  name: "airflow_sla_email_notification_failure"
  help: "Number of failed SLA miss email notification attempts"
# Task-instance start/finish events share one Prometheus metric name, so both
# rules must expose the same label names (status, dag_id, task_id, state).
- match: "airflow.ti.start.*.*"
  name: "airflow_taskinstance"
  help: "Number of started or completed task in a given dag. Similar to airflow_jobs but for task"
  labels:
    status: "start"
    dag_id: "$1"
    task_id: "$2"
    # Placeholder value: the start metric has no state component, but
    # metrics with the same name must also have the same set of label names.
    state: "None"
- match: "airflow.ti.finish.*.*.*"
  name: "airflow_taskinstance"
  help: "Number of started or completed task in a given dag. Similar to airflow_jobs but for task"
  labels:
    status: "finish"
    dag_id: "$1"
    task_id: "$2"
    state: "$3"
# DAG callback and Celery error counters, renamed one-to-one without labels.
- match: "airflow.dag.callback_exceptions"
  name: "airflow_dag_callback_exceptions"
  help: "Number of exceptions raised from DAG callbacks. When this happens, it means DAG callback is not working."
- match: "airflow.celery.task_timeout_error"
  name: "airflow_celery_task_timeout_error"
  help: "Number of AirflowTaskTimeout errors raised when publishing Task to Celery Broker."
- match: "airflow.celery.execute_command.failure"
  name: "airflow_celery_execute_command_failure"
  help: "Number of non-zero exit code from Celery task."
# Removed/restored tasks are merged into one metric; `status` is a fixed
# value per rule and the wildcard component becomes dag_id.
- match: "airflow.task_removed_from_dag.*"
  name: "airflow_tasks_dag"
  help: "Number of tasks removed or restored for a given dag"
  labels:
    status: "removed"
    dag_id: "$1"
- match: "airflow.task_restored_to_dag.*"
  name: "airflow_tasks_dag"
  help: "Number of tasks removed or restored for a given dag"
  labels:
    status: "restored"
    dag_id: "$1"
# Task instances created per operator. The regex captures the operator name
# from the metric suffix ($1); it is exposed as the `operator_name` label
# (same label name as airflow_operators) so that different operators are not
# collapsed into a single series.
- match: 'airflow\.task_instance_created_(.+)'
  match_type: regex
  help: "Number of tasks instances created for a given Operator"
  name: "airflow_taskinstance_created"
  labels:
    operator_name: "$1"
# Triggerer heartbeat, renamed one-to-one without labels.
- match: "airflow.triggerer_heartbeat"
  name: "airflow_triggerer_heartbeat"
  help: "Triggerer heartbeats"
# Trigger counters: the wildcard component becomes `status`.
- match: "airflow.triggers.*"
  name: "airflow_triggers"
  help: "Number of triggers that by status"
  labels:
    status: "$1"
# Dataset counters: the wildcard component becomes `type`.
- match: "airflow.dataset.*"
  name: "airflow_dataset"
  help: "Number of dataset by type"
  labels:
    type: "$1"
# DAG bag / DAG processing values, renamed one-to-one without labels.
- match: "airflow.dagbag_size"
  name: "airflow_dagbag_size"
  help: "DAG bag size"
- match: "airflow.dag_processing.import_errors"
  name: "airflow_dag_processing_import_errors"
  help: "Number of errors from trying to parse DAG files"
- match: "airflow.dag_processing.total_parse_time"
  name: "airflow_dag_processing_total_parse_time"
  help: "Seconds taken to scan and import all DAG files once"
- match: "airflow.dag_processing.file_path_queue_size"
  name: "airflow_dag_processing_file_path_queue_size"
  help: "Number of DAG files to be considered for the next scan"
# Per-file DAG processing values: the trailing wildcard becomes `dag_file`.
- match: "airflow.dag_processing.last_run.seconds_ago.*"
  name: "airflow_dag_processing_last_run"
  help: "Seconds since <dag_file> was last processed"
  labels:
    dag_file: "$1"
- match: "airflow.dag_processing.last_num_of_db_queries.*"
  name: "airflow_dag_processing_last_num_of_db_queries"
  help: "Number of queries to Airflow database during parsing per dag_file"
  labels:
    dag_file: "$1"
# Executor values: the wildcard component becomes `type`.
- match: "airflow.executor.*"
  name: "airflow_executor"
  help: "Number of 'type' tasks on executor"
  labels:
    type: "$1"
# Pool values: first wildcard is the slot type, second the pool name.
- match: "airflow.pool.*.*"
  name: "airflow_pool"
  help: "Number of 'type' in the pool"
  labels:
    type: "$1"
    pool_name: "$2"
# Per-task resource usage: wildcards are dag_id then task_id.
- match: "airflow.task.cpu_usage_percent.*.*"
  name: "airflow_task_cpu_usage_percent"
  help: "Percentage of CPU used by a task"
  labels:
    dag_id: "$1"
    task_id: "$2"
- match: "airflow.task.mem_usage_percent.*.*"
  name: "airflow_task_mem_usage_percent"
  help: "Percentage of memory used by a task"
  labels:
    dag_id: "$1"
    task_id: "$2"
# Running triggers per triggerer: the wildcard component becomes `hostname`.
- match: "airflow.triggers.running.*"
  name: "airflow_triggers_running"
  help: "Number of triggers currently running for a triggerer (described by hostname)"
  labels:
    hostname: "$1"
# Timing metrics exported as Prometheus summaries (observer_type: summary).
# NOTE(review): statsd_exporter is documented to convert StatsD timer
# observations to seconds, so the "Milliseconds" wording in these help texts
# may not match the exported unit - confirm before relying on it.
- match: "airflow.dagrun.dependency-check.*"
  observer_type: summary
  name: "airflow_dagrun_dependency_check"
  help: "Milliseconds taken to check DAG dependencies"
  labels:
    dag_id: "$1"
# Per-task durations: wildcards are dag_id, task_id, then the duration type.
- match: "airflow.dag.*.*.*"
  observer_type: summary
  name: "airflow_dag_durations"
  help: "Milliseconds a task spends by type"
  labels:
    dag_id: "$1"
    task_id: "$2"
    type: "$3"
- match: "airflow.dag_processing.last_duration.*"
  observer_type: summary
  name: "airflow_dag_processing_last_duration"
  help: "Milliseconds taken to load the given DAG file"
  labels:
    dag_file: "$1"
# DagRun duration: first wildcard is the reached state, second the dag_id.
- match: "airflow.dagrun.duration.*.*"
  observer_type: summary
  name: "airflow_dagrun_duration"
  help: "Milliseconds taken for a DagRun to reach 'type' state"
  labels:
    type: "$1"
    dag_id: "$2"
- match: "airflow.dagrun.schedule_delay.*"
  observer_type: summary
  name: "airflow_dagrun_schedule_delay"
  help: "Milliseconds of delay between the scheduled DagRun start date and the actual DagRun start date"
  labels:
    dag_id: "$1"
# Scheduler loop timings, exported as Prometheus summaries without labels.
# NOTE(review): statsd_exporter is documented to convert StatsD timer
# observations to seconds; the "Milliseconds" help wording may not match the
# exported unit - confirm.
- match: "airflow.scheduler.critical_section_duration"
  observer_type: summary
  name: "airflow_scheduler_critical_section_duration"
  help: "Milliseconds spent in the critical section of scheduler loop – only a single scheduler can enter this loop at a time"
- match: "airflow.scheduler.critical_section_query_duration"
  observer_type: summary
  name: "airflow_scheduler_critical_section_query_duration"
  help: "Milliseconds spent running the critical section task instance query"
- match: "airflow.scheduler.scheduler_loop_duration"
  observer_type: summary
  name: "airflow_scheduler_scheduler_loop_duration"
  help: "Milliseconds spent running one scheduler loop"
# Here the wildcard sits in the middle of the name and becomes dag_id.
- match: "airflow.dagrun.*.first_task_scheduling_delay"
  observer_type: summary
  name: "airflow_dagrun_first_task_scheduling_delay"
  help: "Milliseconds elapsed between first task start_date and dagrun expected start"
  labels:
    dag_id: "$1"
- match: "airflow.collect_db_dags"
  observer_type: summary
  name: "airflow_collect_db_dags"
  help: "Milliseconds taken for fetching all Serialized Dags from DB"
# Kubernetes executor timings: the middle wildcard becomes `type`.
- match: "airflow.kubernetes_executor.*.duration"
  observer_type: summary
  name: "airflow_kubernetes_executor_duration"
  help: "Milliseconds taken for by type in Kubernetes Executor"
  labels:
    type: "$1"
# Catch-all, kept last: the regex `.` matches any non-empty metric name, and
# `action: drop` discards it, so only metrics mapped by the rules above are
# exported. `name` is still set on the rule even though nothing is emitted.
- match: "."
  match_type: regex
  name: "dropped"
  action: drop
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment