Skip to content

Instantly share code, notes, and snippets.

@zircote
Last active July 4, 2017 20:07
Show Gist options
  • Save zircote/4658e9f19c976e508590 to your computer and use it in GitHub Desktop.
Save zircote/4658e9f19c976e508590 to your computer and use it in GitHub Desktop.
WIP DataDog check for Apache Aurora
"""
Aurora Scheduler check
Collects metrics from aurora scheduler.
"""
import requests
from checks import AgentCheck, CheckException
class AuroraCheck(AgentCheck):
    """Agent check that collects metrics from an Aurora scheduler.

    Each run fetches ``<url>/vars.json`` from the configured scheduler,
    reports a connectivity service check, and submits every metric listed in
    the tables below whose key is present in the returned JSON document.

    Each metric table maps a key of the ``vars.json`` document to a
    ``(metric name, submission method)`` tuple.  The submission methods are
    the ``AgentCheck`` functions aliased right below and are invoked as
    ``method(self, name, value, tags=...)``.
    """

    GAUGE = AgentCheck.gauge
    MONOTONIC_COUNT = AgentCheck.monotonic_count
    COUNT = AgentCheck.count

    SERVICE_CHECK_NAME = "aurora_master.can_connect"

    # True while the next successful fetch should still emit a service check;
    # cleared by _get_json() and re-armed by check() at the end of every run.
    service_check_needed = True

    # Set by _get_json(); the class-level default keeps attribute access safe
    # before the first successful fetch.
    is_leader = False

    # Submitted for every scheduler that answers with a JSON document.
    SYSTEM_METRICS = {
        "jvm_uptime_secs": ('aurora.scheduler.jvm_uptime_secs', MONOTONIC_COUNT),
        "system_load_avg": ('aurora.scheduler.system_load_avg', GAUGE),
        "http_500_responses_events": ('aurora.scheduler.http_500_responses_events', MONOTONIC_COUNT),
        "system_env_SHLVL": ('aurora.scheduler.system_env_SHLVL', GAUGE),
        "system_free_physical_memory_mb": ('aurora.scheduler.system_free_physical_memory_mb', GAUGE),
        "system_free_swap_mb": ('aurora.scheduler.system_free_swap_mb', GAUGE),
    }

    # The tables below are only submitted when the instance is the leader.
    # NOTE(review): "task_store_LOST" is listed here as a GAUGE and again in
    # TASK_STORE_METRICS as a MONOTONIC_COUNT, so the same metric name is
    # submitted with two different types on a leader -- confirm which type is
    # intended and drop the other entry.
    LEADER_METRICS = {
        "process_cpu_cores_utilized": ('aurora.scheduler.process_cpu_cores_utilized', GAUGE),
        "task_store_LOST": ('aurora.scheduler.task_store_LOST', GAUGE),
        "scheduler_resource_offers": ('aurora.scheduler.scheduler_resource_offers', MONOTONIC_COUNT),
        "framework_registered": ('aurora.scheduler.framework_registered', COUNT),
        "scheduler_log_native_append_nanos_total": (
            'aurora.scheduler.scheduler_log_native_append_nanos_total', MONOTONIC_COUNT),
        "scheduler_log_native_append_events": ('aurora.scheduler.scheduler_log_native_append_events', MONOTONIC_COUNT),
        "timed_out_tasks": ('aurora.scheduler.timed_out_tasks', MONOTONIC_COUNT),
    }

    CRON_METRICS = {
        "cron_job_collisions": ("aurora.scheduler.cron_job_collisions", MONOTONIC_COUNT),
        "cron_job_launch_failures": ("aurora.scheduler.cron_job_launch_failures", GAUGE),
        "cron_job_misfires": ("aurora.scheduler.cron_job_misfires", GAUGE),
        "cron_job_parse_failures": ("aurora.scheduler.cron_job_parse_failures", GAUGE),
        "cron_job_triggers": ("aurora.scheduler.cron_job_triggers", GAUGE),
        "cron_jobs_loaded": ("aurora.scheduler.cron_jobs_loaded", GAUGE),
    }

    PREEMPTION_METRICS = {
        "preemptor_tasks_preempted_non_prod": ("aurora.scheduler.preemptor_tasks_preempted_non_prod", MONOTONIC_COUNT),
        "preemptor_tasks_preempted_prod": ("aurora.scheduler.preemptor_tasks_preempted_prod", MONOTONIC_COUNT),
    }

    QUARTZ_METRICS = {
        'quartz_scheduler_running': ('aurora.scheduler.quartz_scheduler_running', COUNT)
    }

    TASK_STORE_METRICS = {
        "task_store_ASSIGNED": ("aurora.scheduler.task_store_ASSIGNED", MONOTONIC_COUNT),
        "task_store_DRAINING": ("aurora.scheduler.task_store_DRAINING", MONOTONIC_COUNT),
        "task_store_FAILED": ("aurora.scheduler.task_store_FAILED", MONOTONIC_COUNT),
        "task_store_FINISHED": ("aurora.scheduler.task_store_FINISHED", MONOTONIC_COUNT),
        "task_store_INIT": ("aurora.scheduler.task_store_INIT", MONOTONIC_COUNT),
        "task_store_KILLED": ("aurora.scheduler.task_store_KILLED", MONOTONIC_COUNT),
        "task_store_KILLING": ("aurora.scheduler.task_store_KILLING", MONOTONIC_COUNT),
        "task_store_LOST": ("aurora.scheduler.task_store_LOST", MONOTONIC_COUNT),
        "task_store_PENDING": ("aurora.scheduler.task_store_PENDING", MONOTONIC_COUNT),
        "task_store_PREEMPTING": ("aurora.scheduler.task_store_PREEMPTING", MONOTONIC_COUNT),
        "task_store_RESTARTING": ("aurora.scheduler.task_store_RESTARTING", MONOTONIC_COUNT),
        "task_store_RUNNING": ("aurora.scheduler.task_store_RUNNING", MONOTONIC_COUNT),
        "task_store_STARTING": ("aurora.scheduler.task_store_STARTING", MONOTONIC_COUNT),
        "task_store_THROTTLED": ("aurora.scheduler.task_store_THROTTLED", MONOTONIC_COUNT),
    }

    def _get_json(self, url, timeout):
        """Fetch ``url`` and return its decoded JSON body.

        Sets ``self.is_leader`` (False on a redirect response, True on a
        200 response) and reports the ``SERVICE_CHECK_NAME`` service check,
        tagged with the URL.

        :param url: full URL to request.
        :param timeout: request timeout in seconds, passed to ``requests``.
        :returns: the decoded JSON document, or ``None`` when the scheduler
            answered with a redirect.
        :raises CheckException: on a timeout, any other request/decoding
            error, or a response that is neither a redirect nor a 200.
        """
        tags = ["url:%s" % url]
        payload = None
        try:
            # Redirects are deliberately not followed: requests follows them
            # by default, in which case the final response is normally not a
            # redirect and the non-leader branch below could never be taken.
            response = requests.get(url, timeout=timeout, allow_redirects=False)
            if response.is_redirect:
                status = AgentCheck.OK
                self.is_leader = False
                msg = "Aurora Scheduler instance detected at %s but is not master" % url
            elif response.status_code != 200:
                status = AgentCheck.CRITICAL
                msg = "Got %s when hitting %s" % (response.status_code, url)
            else:
                # Decode only a 200 body so that a non-JSON error page does
                # not hide the real HTTP status behind a decoding error.
                payload = response.json()
                status = AgentCheck.OK
                self.is_leader = True
                msg = "Aurora Scheduler instance detected at %s " % url
        except requests.exceptions.Timeout:
            msg = "%s seconds timeout when hitting %s" % (timeout, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            payload = None
            msg = str(e)
            status = AgentCheck.CRITICAL

        failed = status == AgentCheck.CRITICAL
        # A failure is always reported; a success only once per check run.
        # Either way the service check is sent at most once per call.
        if failed or self.service_check_needed:
            self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags, message=msg)
            self.service_check_needed = False
        if failed:
            raise CheckException("Cannot connect to aurora scheduler, please check your configuration.")
        return payload

    def _get_state(self, url, timeout):
        """Return the decoded ``/vars.json`` document of the scheduler at ``url``.

        A trailing slash on ``url`` is ignored.  See ``_get_json`` for the
        return value and the exception raised.
        """
        return self._get_json(url.rstrip('/') + '/vars.json', timeout)

    def check(self, instance):
        """Run the check for one configured instance.

        Reads ``url`` (required), ``tags`` and ``timeout`` from ``instance``;
        the timeout falls back to ``default_timeout`` from ``init_config``
        and then to 5 seconds.

        :param instance: mapping holding the instance configuration.
        :raises Exception: when ``instance`` has no ``url`` key.
        :raises CheckException: when the scheduler cannot be queried.
        """
        if 'url' not in instance:
            raise Exception('Aurora scheduler instance missing "url" value.')
        url = instance['url']
        # "tags" may be present but empty (None) in the configuration.
        instance_tags = instance.get('tags') or []
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        try:
            # One request per run is enough: every metric table is read from
            # the same vars.json document.
            stats = self._get_state(url, timeout)
        finally:
            # Re-arm the service check even when the fetch failed, so the
            # next run reports a recovery right away.
            self.service_check_needed = True

        if not stats:
            return

        tags = ['aurora', 'mesos:framework'] + list(instance_tags)

        metric_groups = [self.SYSTEM_METRICS]
        if self.is_leader:
            metric_groups += [
                self.LEADER_METRICS,
                self.QUARTZ_METRICS,
                self.PREEMPTION_METRICS,
                self.TASK_STORE_METRICS,
                self.CRON_METRICS,
            ]

        for group in metric_groups:
            for key_name, (metric_name, metric_func) in group.items():
                value = stats.get(key_name)
                # Skip keys the scheduler does not expose instead of
                # submitting None as a metric value.
                if value is None:
                    continue
                metric_func(self, metric_name, value, tags=tags)
# Sample configuration for the Aurora scheduler check.
init_config:
  # Request timeout in seconds, used when an instance sets no "timeout".
  default_timeout: 5

instances:
  # Base URL of the scheduler; the check requests <url>/vars.json.
  - url: http://localhost:8081
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment