Last active
July 4, 2017 20:07
-
-
Save zircote/4658e9f19c976e508590 to your computer and use it in GitHub Desktop.
WIP DataDog check for Apache Aurora
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Aurora Scheduler check.

Collects metrics from the Aurora scheduler.
""" | |
import requests | |
from checks import AgentCheck, CheckException | |
class AuroraCheck(AgentCheck):
    """Agent check that collects metrics from an Aurora scheduler.

    For each configured instance the check fetches ``<url>/vars.json`` once,
    emits the ``aurora_master.can_connect`` service check, and submits every
    known metric that is present in the returned JSON document.

    Each ``*_METRICS`` table maps a key of the ``vars.json`` document to a
    ``(metric name, submission function)`` pair. ``SYSTEM_METRICS`` are
    always submitted; the remaining tables are submitted only when the
    instance was detected as the leader.
    """

    # Submission functions, referenced from the metric tables below and
    # invoked as ``func(self, name, value, tags=...)``.
    GAUGE = AgentCheck.gauge
    MONOTONIC_COUNT = AgentCheck.monotonic_count
    COUNT = AgentCheck.count

    SERVICE_CHECK_NAME = "aurora_master.can_connect"

    # True while the service check has not yet been emitted for the current
    # run; ``check`` resets it at the end of every run.
    service_check_needed = True

    # Updated by ``_get_json``; the class-level default guarantees the
    # attribute exists even before the first successful request.
    is_leader = False

    SYSTEM_METRICS = {
        "jvm_uptime_secs": ('aurora.scheduler.jvm_uptime_secs', MONOTONIC_COUNT),
        "system_load_avg": ('aurora.scheduler.system_load_avg', GAUGE),
        "http_500_responses_events": ('aurora.scheduler.http_500_responses_events', MONOTONIC_COUNT),
        "system_env_SHLVL": ('aurora.scheduler.system_env_SHLVL', GAUGE),
        "system_free_physical_memory_mb": ('aurora.scheduler.system_free_physical_memory_mb', GAUGE),
        "system_free_swap_mb": ('aurora.scheduler.system_free_swap_mb', GAUGE),
    }

    # NOTE(review): "task_store_LOST" is registered here as a GAUGE and again
    # in TASK_STORE_METRICS as a MONOTONIC_COUNT, so a leader submits the same
    # metric name with two different types. Confirm which one is intended.
    LEADER_METRICS = {
        "process_cpu_cores_utilized": ('aurora.scheduler.process_cpu_cores_utilized', GAUGE),
        "task_store_LOST": ('aurora.scheduler.task_store_LOST', GAUGE),
        "scheduler_resource_offers": ('aurora.scheduler.scheduler_resource_offers', MONOTONIC_COUNT),
        "framework_registered": ('aurora.scheduler.framework_registered', COUNT),
        "scheduler_log_native_append_nanos_total": (
            'aurora.scheduler.scheduler_log_native_append_nanos_total', MONOTONIC_COUNT),
        "scheduler_log_native_append_events": ('aurora.scheduler.scheduler_log_native_append_events', MONOTONIC_COUNT),
        "timed_out_tasks": ('aurora.scheduler.timed_out_tasks', MONOTONIC_COUNT),
    }

    CRON_METRICS = {
        "cron_job_collisions": ("aurora.scheduler.cron_job_collisions", MONOTONIC_COUNT),
        "cron_job_launch_failures": ("aurora.scheduler.cron_job_launch_failures", GAUGE),
        "cron_job_misfires": ("aurora.scheduler.cron_job_misfires", GAUGE),
        "cron_job_parse_failures": ("aurora.scheduler.cron_job_parse_failures", GAUGE),
        "cron_job_triggers": ("aurora.scheduler.cron_job_triggers", GAUGE),
        "cron_jobs_loaded": ("aurora.scheduler.cron_jobs_loaded", GAUGE),
    }

    PREEMPTION_METRICS = {
        "preemptor_tasks_preempted_non_prod": ("aurora.scheduler.preemptor_tasks_preempted_non_prod", MONOTONIC_COUNT),
        "preemptor_tasks_preempted_prod": ("aurora.scheduler.preemptor_tasks_preempted_prod", MONOTONIC_COUNT),
    }

    QUARTZ_METRICS = {
        'quartz_scheduler_running': ('aurora.scheduler.quartz_scheduler_running', COUNT),
    }

    TASK_STORE_METRICS = {
        "task_store_ASSIGNED": ("aurora.scheduler.task_store_ASSIGNED", MONOTONIC_COUNT),
        "task_store_DRAINING": ("aurora.scheduler.task_store_DRAINING", MONOTONIC_COUNT),
        "task_store_FAILED": ("aurora.scheduler.task_store_FAILED", MONOTONIC_COUNT),
        "task_store_FINISHED": ("aurora.scheduler.task_store_FINISHED", MONOTONIC_COUNT),
        "task_store_INIT": ("aurora.scheduler.task_store_INIT", MONOTONIC_COUNT),
        "task_store_KILLED": ("aurora.scheduler.task_store_KILLED", MONOTONIC_COUNT),
        "task_store_KILLING": ("aurora.scheduler.task_store_KILLING", MONOTONIC_COUNT),
        "task_store_LOST": ("aurora.scheduler.task_store_LOST", MONOTONIC_COUNT),
        "task_store_PENDING": ("aurora.scheduler.task_store_PENDING", MONOTONIC_COUNT),
        "task_store_PREEMPTING": ("aurora.scheduler.task_store_PREEMPTING", MONOTONIC_COUNT),
        "task_store_RESTARTING": ("aurora.scheduler.task_store_RESTARTING", MONOTONIC_COUNT),
        "task_store_RUNNING": ("aurora.scheduler.task_store_RUNNING", MONOTONIC_COUNT),
        "task_store_STARTING": ("aurora.scheduler.task_store_STARTING", MONOTONIC_COUNT),
        "task_store_THROTTLED": ("aurora.scheduler.task_store_THROTTLED", MONOTONIC_COUNT),
    }

    def _get_json(self, url, timeout):
        """Fetch ``url``, report the service check and return the decoded JSON.

        Sets ``self.is_leader`` to ``False`` when the response is a redirect
        and to ``True`` on a plain 200 response.

        :param url: full URL to request.
        :param timeout: request timeout in seconds, passed to ``requests.get``.
        :returns: the decoded JSON body.
        :raises CheckException: when the request times out, fails, returns a
            status other than 200, or the body cannot be decoded.
        """
        tags = ["url:%s" % url]
        msg = None
        status = None
        payload = None
        try:
            # NOTE(review): requests.get() follows redirects by default, so
            # the is_redirect branch below is probably never taken. Confirm
            # whether allow_redirects=False was intended for leader detection.
            r = requests.get(url, timeout=timeout)
            if r.is_redirect:
                payload = r.json()
                status = AgentCheck.OK
                self.is_leader = False
                msg = "Aurora Scheduler instance detected at %s but is not master" % url
            elif r.status_code != 200:
                # Do not try to decode the body of an error response: a
                # non-JSON error page would otherwise hide the status code
                # behind a decoding error message.
                status = AgentCheck.CRITICAL
                msg = "Got %s when hitting %s" % (r.status_code, url)
            else:
                payload = r.json()
                status = AgentCheck.OK
                self.is_leader = True
                msg = "Aurora Scheduler instance detected at %s " % url
        except requests.exceptions.Timeout:
            msg = "%s seconds timeout when hitting %s" % (timeout, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            # Connection errors and JSON decoding errors all mean the
            # scheduler could not be queried.
            msg = str(e)
            status = AgentCheck.CRITICAL

        # Report at most once per request; a failure is always reported even
        # when the service check was already emitted earlier in this run.
        if self.service_check_needed or status == AgentCheck.CRITICAL:
            self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
            self.service_check_needed = False
        if status == AgentCheck.CRITICAL:
            raise CheckException("Cannot connect to aurora scheduler, please check your configuration.")
        return payload

    def _get_state(self, url, timeout):
        """Return the decoded ``/vars.json`` document of the scheduler at ``url``."""
        return self._get_json(url + '/vars.json', timeout)

    def check(self, instance):
        """Run the check for one configured instance.

        :param instance: instance configuration mapping. ``url`` is required;
            ``tags`` and ``timeout`` are optional. The timeout falls back to
            ``default_timeout`` from ``init_config``, then to 5 seconds.
        :raises Exception: when ``url`` is missing from ``instance``.
        :raises CheckException: when the scheduler cannot be queried.
        """
        if 'url' not in instance:
            raise Exception('Aurora scheduler instance missing "url" value.')
        url = instance['url']
        # An empty ``tags:`` entry in the YAML yields None rather than a list.
        instance_tags = instance.get('tags') or []
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        try:
            stats = self._get_state(url, timeout)
        finally:
            # Re-arm the service check even when the request failed, so the
            # next run reports its status again.
            self.service_check_needed = True

        if not stats:
            return

        tags = ['aurora', 'mesos:framework'] + list(instance_tags)

        metric_groups = [self.SYSTEM_METRICS]
        if self.is_leader:
            metric_groups += [
                self.LEADER_METRICS,
                self.QUARTZ_METRICS,
                self.PREEMPTION_METRICS,
                self.TASK_STORE_METRICS,
                self.CRON_METRICS,
            ]

        for group in metric_groups:
            for key_name, (metric_name, metric_func) in group.items():
                # Submit only what the scheduler actually exposes; a missing
                # key is skipped rather than submitted as None.
                if key_name in stats:
                    metric_func(self, metric_name, stats[key_name], tags=tags)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
init_config:
  default_timeout: 5

instances:
  - url: http://localhost:8081
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment