-
-
Save frimik/bec45ff66b979098931f7e4b3b218167 to your computer and use it in GitHub Desktop.
WIP DataDog check for Apache Aurora
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Aurora Scheduler check
Collects metrics from the Aurora scheduler.
"""
import requests | |
from checks import AgentCheck, CheckException | |
class AuroraCheck(AgentCheck):
    """Agent check that reports Apache Aurora scheduler metrics.

    For each configured instance the check fetches ``<url>/vars.json``,
    emits the ``aurora_master.can_connect`` service check and submits the
    values named in the metric tables below.

    Every metric table maps a key of the ``vars.json`` payload to a tuple
    ``(metric_name, submit_function[, extra_tags])``. ``submit_function`` is
    one of ``GAUGE``, ``MONOTONIC_COUNT`` or ``COUNT`` and is invoked as
    ``submit_function(self, metric_name, value, tags=...)``.
    """

    # Submission functions referenced by the metric tables.
    GAUGE = AgentCheck.gauge
    MONOTONIC_COUNT = AgentCheck.monotonic_count
    COUNT = AgentCheck.count

    SERVICE_CHECK_NAME = "aurora_master.can_connect"

    # True while the service check still has to be emitted for the current
    # run; cleared by _get_json once emitted and re-armed by check().
    service_check_needed = True

    # Submitted for every instance, leader or not.
    SYSTEM_METRICS = {
        "jvm_uptime_secs": ('aurora.scheduler.jvm_uptime_secs', MONOTONIC_COUNT),
        "system_load_avg": ('aurora.scheduler.system_load_avg', GAUGE),
        "http_500_responses_events": ('aurora.scheduler.http_500_responses_events', MONOTONIC_COUNT),
        "system_env_SHLVL": ('aurora.scheduler.system_env_SHLVL', GAUGE),
        "system_free_physical_memory_mb": ('aurora.scheduler.system_free_physical_memory_mb', GAUGE),
        "system_free_swap_mb": ('aurora.scheduler.system_free_swap_mb', GAUGE),
    }

    # The tables below are only submitted when self.is_leader is true.
    # NOTE(review): "task_store_LOST" is declared here as a GAUGE and again in
    # TASK_STORE_METRICS as a MONOTONIC_COUNT under the same metric name --
    # confirm which type is intended.
    LEADER_METRICS = {
        "process_cpu_cores_utilized": ('aurora.scheduler.process_cpu_cores_utilized', GAUGE),
        "task_store_LOST": ('aurora.scheduler.task_store_LOST', GAUGE),
        "scheduler_resource_offers": ('aurora.scheduler.scheduler_resource_offers', MONOTONIC_COUNT),
        "framework_registered": ('aurora.scheduler.framework_registered', COUNT),
        "scheduler_log_native_append_nanos_total": (
            'aurora.scheduler.scheduler_log_native_append_nanos_total', MONOTONIC_COUNT),
        "scheduler_log_native_append_events": ('aurora.scheduler.scheduler_log_native_append_events', MONOTONIC_COUNT),
        "timed_out_tasks": ('aurora.scheduler.timed_out_tasks', MONOTONIC_COUNT),
    }

    CRON_METRICS = {
        "cron_job_collisions": ("aurora.scheduler.cron_job_collisions", MONOTONIC_COUNT),
        "cron_job_launch_failures": ("aurora.scheduler.cron_job_launch_failures", GAUGE),
        "cron_job_misfires": ("aurora.scheduler.cron_job_misfires", GAUGE),
        "cron_job_parse_failures": ("aurora.scheduler.cron_job_parse_failures", GAUGE),
        "cron_job_triggers": ("aurora.scheduler.cron_job_triggers", GAUGE),
        "cron_jobs_loaded": ("aurora.scheduler.cron_jobs_loaded", GAUGE),
    }

    PREEMPTION_METRICS = {
        "preemptor_tasks_preempted_non_prod": ("aurora.scheduler.preemptor_tasks_preempted_non_prod", MONOTONIC_COUNT),
        "preemptor_tasks_preempted_prod": ("aurora.scheduler.preemptor_tasks_preempted_prod", MONOTONIC_COUNT),
    }

    QUARTZ_METRICS = {
        'quartz_scheduler_running': ('aurora.scheduler.quartz_scheduler_running', COUNT)
    }

    TASK_STORE_METRICS = {
        "task_store_ASSIGNED": ("aurora.scheduler.task_store_ASSIGNED", MONOTONIC_COUNT),
        "task_store_DRAINING": ("aurora.scheduler.task_store_DRAINING", MONOTONIC_COUNT),
        "task_store_FAILED": ("aurora.scheduler.task_store_FAILED", MONOTONIC_COUNT),
        "task_store_FINISHED": ("aurora.scheduler.task_store_FINISHED", MONOTONIC_COUNT),
        "task_store_INIT": ("aurora.scheduler.task_store_INIT", MONOTONIC_COUNT),
        "task_store_KILLED": ("aurora.scheduler.task_store_KILLED", MONOTONIC_COUNT),
        "task_store_KILLING": ("aurora.scheduler.task_store_KILLING", MONOTONIC_COUNT),
        "task_store_LOST": ("aurora.scheduler.task_store_LOST", MONOTONIC_COUNT),
        "task_store_PENDING": ("aurora.scheduler.task_store_PENDING", MONOTONIC_COUNT),
        "task_store_PREEMPTING": ("aurora.scheduler.task_store_PREEMPTING", MONOTONIC_COUNT),
        "task_store_RESTARTING": ("aurora.scheduler.task_store_RESTARTING", MONOTONIC_COUNT),
        "task_store_RUNNING": ("aurora.scheduler.task_store_RUNNING", MONOTONIC_COUNT),
        "task_store_STARTING": ("aurora.scheduler.task_store_STARTING", MONOTONIC_COUNT),
        "task_store_THROTTLED": ("aurora.scheduler.task_store_THROTTLED", MONOTONIC_COUNT),
    }

    _task_store_states = [
        "ASSIGNED",
        "DRAINING",
        "FAILED",
        "FINISHED",
        "INIT",
        "KILLED",
        "KILLING",
        "LOST",
        "PENDING",
        "PREEMPTING",
        "RESTARTING",
        "RUNNING",
        "STARTING",
        "THROTTLED",
    ]

    # The same task_store_<STATE> values, re-published under one metric name
    # and distinguished by a lower-cased "taskstate:<state>" tag.
    # A plain loop is used on purpose: the body of a comprehension written at
    # class level cannot see class-level names such as GAUGE on Python 3.
    TASK_STORE_GAUGES = {}
    for taskstate in _task_store_states:
        aurora_metric = "task_store_{taskstate}".format(taskstate=taskstate)
        metric_tags = ["taskstate:{taskstate}".format(taskstate=taskstate.lower())]
        TASK_STORE_GAUGES[aurora_metric] = (
            "aurora.scheduler.task_store_gauge", GAUGE, metric_tags
        )

    def _get_json(self, url, timeout):
        """Fetch ``url``, emit the connectivity service check, return the JSON.

        Sets ``self.is_leader`` whenever a payload is obtained.

        :param url: full URL to request.
        :param timeout: request timeout in seconds, passed to ``requests.get``.
        :returns: the decoded JSON body.
        :raises CheckException: when the request times out or fails, the
            status code is neither a redirect nor 200, or the body is not
            valid JSON.
        """
        tags = ["url:%s" % url]
        msg = None
        # Assume failure until a payload has actually been decoded, so every
        # error path below ends up CRITICAL without repeating the assignment.
        status = AgentCheck.CRITICAL
        payload = None
        try:
            response = requests.get(url, timeout=timeout)
            if not response.is_redirect and response.status_code != 200:
                # Report the HTTP status instead of trying to decode an error
                # page, which would only yield an unhelpful parse error.
                msg = "Got %s when hitting %s" % (response.status_code, url)
            else:
                payload = response.json()
                status = AgentCheck.OK
                # NOTE(review): requests follows redirects by default, so the
                # final response is normally not a redirect and this flag is
                # then always True -- confirm how non-leaders should be
                # detected (e.g. allow_redirects=False).
                self.is_leader = not response.is_redirect
                if self.is_leader:
                    msg = "Aurora Scheduler instance detected at %s " % url
                else:
                    msg = "Aurora Scheduler instance detected at %s but is not master" % url
        except requests.exceptions.Timeout:
            msg = "%s seconds timeout when hitting %s" % (timeout, url)
        except Exception as e:
            msg = str(e)

        failed = status == AgentCheck.CRITICAL
        # Failures are always reported; successes only once per check run.
        # Either way the service check is emitted a single time.
        if self.service_check_needed or failed:
            self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
            self.service_check_needed = False
        if failed:
            raise CheckException("Cannot connect to aurora scheduler, please check your configuration.")
        return payload

    def _get_state(self, url, timeout):
        """Return the decoded ``/vars.json`` document of the scheduler at ``url``."""
        return self._get_json(url + '/vars.json', timeout)

    def _submit_metrics(self, stats, tags):
        """Submit every metric of the applicable tables found in ``stats``.

        :param stats: decoded ``vars.json`` payload (supports ``in`` and
            indexing by metric key).
        :param tags: list of tags attached to every submitted metric.
        """
        tables = [self.SYSTEM_METRICS]
        if self.is_leader:
            tables += [self.LEADER_METRICS, self.QUARTZ_METRICS, self.PREEMPTION_METRICS,
                       self.TASK_STORE_METRICS, self.CRON_METRICS,
                       self.TASK_STORE_GAUGES]

        for table in tables:
            # items() instead of iteritems() so the loop runs on Python 2 and 3.
            for key_name, metric_prop in table.items():
                metric_name = metric_prop[0]
                metric_func = metric_prop[1]
                # The optional third element carries per-metric tags.
                metric_tags = metric_prop[2] if len(metric_prop) == 3 else []
                my_tags = tags + metric_tags
                if key_name in stats:
                    self.log.debug("%s: %s, tags: %s", metric_name, stats[key_name],
                                   my_tags)
                    metric_func(self, metric_name, stats[key_name],
                                tags=my_tags)
                else:
                    # NOTE(review): a missing key is submitted as None, as the
                    # original code did -- confirm that every submission type
                    # (count in particular) accepts None.
                    metric_func(self, metric_name, None,
                                tags=my_tags)

    def check(self, instance):
        """Run the check for one configured instance.

        :param instance: instance configuration mapping; ``url`` is required,
            ``tags`` and ``timeout`` are optional. ``timeout`` falls back to
            ``init_config['default_timeout']`` and then to 5 seconds.
        :raises Exception: when ``url`` is missing from ``instance``.
        :raises CheckException: propagated from ``_get_json`` when the
            scheduler cannot be queried.
        """
        if 'url' not in instance:
            raise Exception('Aurora scheduler instance missing "url" value.')
        url = instance['url']
        # "tags:" left empty in YAML loads as None; treat it like no tags.
        instance_tags = instance.get('tags') or []
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        tags = [
            'aurora',
            'mesos:framework',
            'url:%s' % url,
        ]
        tags += instance_tags

        try:
            # vars.json is fetched once and reused for all metric tables.
            stats_metrics = self._get_state(url, timeout)
            if stats_metrics is not None:
                self._submit_metrics(stats_metrics, tags)
        finally:
            # Re-arm the service check even when the run failed, so the next
            # run reports its status (including a recovery) immediately.
            self.service_check_needed = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example configuration for the Aurora scheduler check.
init_config:
  # Request timeout in seconds, used by instances that set no "timeout".
  default_timeout: 5

# Add all master nodes:
instances:
  - url: http://10.0.0.10:8081
    tags:
      - cluster:foo
  - url: http://10.0.0.11:8081
    tags:
      - cluster:foo
  - url: http://10.0.0.12:8081
    tags:
      - cluster:foo
  - url: http://10.0.0.13:8081
    tags:
      - cluster:foo
  # NOTE(review): same URL as the first instance -- probably meant to be
  # another node; as written this scheduler is polled twice per run.
  - url: http://10.0.0.10:8081
    tags:
      - cluster:foo
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment