Created
September 26, 2024 21:11
-
-
Save TheRockStarDBA/49b8829cafbdcb79ea2cb587aba4f579 to your computer and use it in GitHub Desktop.
airflow metadata mining
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- ref: https://github.com/jimtodd92/airflow_metadata_mining/blob/main/airflow_metatdata_mining.sql | |
-- To get Critical user activities like paused and dagrun_clear in past 24 hours. | |
SELECT event | |
,DATE (dttm) | |
,count(*) | |
FROM PUBLIC.log | |
WHERE DATE (dttm) BETWEEN ( | |
SELECT max(DATE (dttm)) - 1 | |
FROM PUBLIC.log | |
) | |
AND ( | |
SELECT max(DATE (dttm)) | |
FROM PUBLIC.log | |
) | |
AND event IN ( | |
'paused' | |
,'dagrun_clear' | |
) | |
GROUP BY event | |
,DATE (dttm) | |
ORDER BY DATE (dttm) DESC; | |
-- To get 10 long ran tasks in a day. | |
SELECT dag_id | |
,task_id | |
,execution_date | |
,round(cast(duration / (60 * 60) AS NUMERIC), 2) AS duration_in_hour | |
FROM PUBLIC.task_instance | |
WHERE DATE (execution_date) = '2022-01-03' | |
AND STATE IN ('success') | |
AND duration IS NOT NULL | |
GROUP BY task_id | |
,dag_id | |
,execution_date | |
ORDER BY duration DESC limit 10; | |
-- To understand how occupied the airflow scheduler is, and decide a maintenance window. | |
SELECT x.start_time_window | |
,count(*) | |
FROM ( | |
SELECT start_date | |
,CASE | |
WHEN extract(hour FROM start_date) BETWEEN 0 | |
AND 5 | |
THEN '0-5' | |
WHEN extract(hour FROM start_date) BETWEEN 6 | |
AND 11 | |
THEN '6-11' | |
WHEN extract(hour FROM start_date) BETWEEN 12 | |
AND 17 | |
THEN '12-17' | |
WHEN extract(hour FROM start_date) BETWEEN 18 | |
AND 23 | |
THEN '18-23' | |
END AS start_time_window | |
FROM PUBLIC.task_instance | |
WHERE execution_date BETWEEN '2022-01-01' | |
AND '2022-01-02' | |
AND start_date IS NOT NULL | |
) AS x | |
GROUP BY x.start_time_window | |
ORDER BY cast(split_part(x.start_time_window, '-', 1) AS INT) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Good airflow ppt - https://airflowsummit.org/slides/2023/1-CD-1500-Reliable-Airflow-DAG-Design.pdf
Reliable Airflow DAG Design when building a Time-series Data Lakehouse