Created
April 25, 2017 19:59
-
-
Save mrhillsman/1d53ed1350ed6d2f2c2c085005648e6d to your computer and use it in GitHub Desktop.
Updated Prometheus OpenStack Exporter for Identity v3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
""" | |
OpenStack exporter for the prometheus monitoring system | |
Copyright (C) 2016 Canonical, Ltd. | |
Authors: | |
Jacek Nykis <[email protected]> | |
Laurent Sesques <[email protected]> | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License version 3, | |
as published by the Free Software Foundation. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranties of | |
MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR PURPOSE. | |
See the GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
""" | |
import argparse | |
import yaml | |
from os import environ as env | |
from os import rename, path | |
import traceback | |
import urlparse | |
from threading import Thread | |
import pickle | |
import requests | |
from time import sleep, time | |
from neutronclient.v2_0 import client as neutron_client | |
from keystoneclient.v3 import client as keystone_client | |
# from novaclient.v1_1 import client as nova_client | |
# http://docs.openstack.org/developer/python-novaclient/api.html | |
from keystoneauth1 import loading | |
from keystoneauth1 import session | |
from novaclient import client as nova_client | |
from BaseHTTPServer import BaseHTTPRequestHandler | |
from BaseHTTPServer import HTTPServer | |
from SocketServer import ForkingMixIn | |
from prometheus_client import CollectorRegistry, generate_latest, Gauge, CONTENT_TYPE_LATEST | |
from netaddr import iter_iprange | |
class DataGatherer(Thread):
    """Daemon thread that periodically snapshots OpenStack state to a cache file.

    Every refresh queries Keystone, Nova and Neutron, pickles the combined
    result to ``config['cache_file']`` and records how long the refresh took.
    The cache file is replaced via a rename of a ``.new`` sibling so readers
    never observe a half-written file.
    """

    def __init__(self):
        Thread.__init__(self)
        self.daemon = True
        # Seconds taken by the most recent successful refresh (0 until then).
        self.duration = 0
        self.refresh_interval = config.get('cache_refresh_interval', 900)
        self.cache_file = config['cache_file']

    @staticmethod
    def _list_all_instances(nova):
        """Return the ``_info`` dict of every server across all projects.

        Servers are requested in pages of 100; the id of the last server of
        a page is used as the marker for the next one, and an empty page
        ends the listing.
        """
        instances = []
        marker = ''
        while True:
            search_opts = {'all_projects': '1', 'limit': '100', 'marker': marker}
            page = [x._info for x in nova.servers.list(search_opts=search_opts)]
            if not page:
                return instances
            marker = page[-1]['id']
            instances.extend(page)

    def run(self):
        """Refresh the cache forever, sleeping ``refresh_interval`` between runs.

        Raises KeyError (ending the thread) if a required ``OS_*`` environment
        variable read before the loop is missing.
        """
        prodstack = {}
        # NOTE(review): these credentials carry no domain information while
        # the session below does; confirm the Keystone v3 client accepts them
        # in the target deployment.
        creds = {
            "username": env['OS_USERNAME'],
            "password": env['OS_PASSWORD'],
            "tenant_name": env['OS_TENANT_NAME'],
            "auth_url": env['OS_AUTH_URL'],
            "region_name": env['OS_REGION_NAME']
        }
        while True:
            start_time = time()
            try:
                keystone = keystone_client.Client(**creds)
                loader = loading.get_plugin_loader('password')
                auth = loader.load_from_options(
                    auth_url=env['OS_AUTH_URL'],
                    username=env['OS_USERNAME'],
                    password=env['OS_PASSWORD'],
                    project_name=env['OS_TENANT_NAME'],
                    user_domain_name=env['OS_USER_DOMAIN_NAME'],
                    project_domain_name=env['OS_PROJECT_DOMAIN_NAME']
                )
                # SECURITY: verify=False disables TLS certificate validation
                # for every Nova/Neutron call made through this session.
                sess = session.Session(auth=auth, verify=False)
                nova = nova_client.Client(2, session=sess)
                neutron = neutron_client.Client(session=sess)
                prodstack['projects'] = [x._info for x in keystone.projects.list()]
                prodstack['hypervisors'] = [x._info for x in nova.hypervisors.list()]
                prodstack['services'] = [x._info for x in nova.services.list()]
                prodstack['networks'] = neutron.list_networks()['networks']
                prodstack['flavors'] = [x._info for x in nova.flavors.list()]
                prodstack['aggregates'] = [x.to_dict() for x in nova.aggregates.list()]
                prodstack['subnets'] = neutron.list_subnets()['subnets']
                prodstack['routers'] = neutron.list_routers()['routers']
                prodstack['ports'] = neutron.list_ports()['ports']
                prodstack['floatingips'] = neutron.list_floatingips()['floatingips']
                # Instance info is very heavy, so it is fetched last and in pages.
                prodstack['instances'] = self._list_all_instances(nova)
            except Exception:
                # Ignore failures, we will try again after refresh_interval.
                # Most of them are temporary, i.e. connectivity problems.
                # To alert on a stale cache use the
                # openstack_exporter_cache_age_seconds metric.
                # Only Exception is caught so that SystemExit and
                # KeyboardInterrupt are not silently swallowed.
                print(traceback.format_exc())
            else:
                # Write to a sibling file first, then rename over the cache.
                with open(self.cache_file + '.new', "wb+") as f:
                    pickle.dump((prodstack, ), f, pickle.HIGHEST_PROTOCOL)
                rename(self.cache_file + '.new', self.cache_file)
                self.duration = time() - start_time
            sleep(self.refresh_interval)

    def get_stats(self):
        """Return exporter self-metrics (cache age, refresh duration) as text.

        Raises OSError if the cache file does not exist yet, because its
        modification time is used to compute the age.
        """
        registry = CollectorRegistry()
        labels = ['cloud']
        label_values = [config['cloud']]
        age = Gauge('openstack_exporter_cache_age_seconds',
                    'Cache age in seconds. It can reset more frequently '
                    'than scraping interval so we use Gauge',
                    labels, registry=registry)
        age.labels(*label_values).set(time() - path.getmtime(self.cache_file))
        duration = Gauge('openstack_exporter_cache_refresh_duration_seconds',
                         'Cache refresh duration in seconds.',
                         labels, registry=registry)
        duration.labels(*label_values).set(self.duration)
        return generate_latest(registry)
class Neutron(object):
    """Collector for Neutron public-IP usage and network size metrics.

    All data comes from the pickled cache file named by
    ``config['cache_file']``; no OpenStack API call is made here.
    """

    def __init__(self):
        self.registry = CollectorRegistry()
        self.prodstack = {}
        # NOTE: pickle.load is only acceptable because the cache file is
        # produced by this exporter itself; never point it at untrusted data.
        with open(config['cache_file'], 'rb') as f:
            self.prodstack = pickle.load(f)[0]
        self.tenant_map = {t['id']: t['name'] for t in self.prodstack['projects']}
        self.network_map = {n['id']: n['name'] for n in self.prodstack['networks']}
        self.subnet_map = {
            n['id']: {'name': n['name'], 'pool': n['allocation_pools']}
            for n in self.prodstack['subnets']
        }
        self.routers = self.prodstack['routers']
        self.ports = self.prodstack['ports']
        self.floating_ips = self.prodstack['floatingips']

    def _tenant_name(self, tenant_id):
        """Return the cached tenant name, or a placeholder for unknown ids."""
        try:
            return self.tenant_map[tenant_id]
        except KeyError:
            return 'Unknown tenant ({})'.format(tenant_id)

    def _get_router_ip(self, uuid):
        """Return the first fixed IP of the router's gateway port, else None."""
        owner = "network:router_gateway"
        for port in self.ports:
            if port["device_id"] == uuid and port["device_owner"] == owner:
                return port["fixed_ips"][0]["ip_address"]
        return None

    def get_floating_ips(self):
        """Count floating IPs per (cloud, network, tenant, 'floatingip', status)."""
        ips = {}
        for ip in self.floating_ips:
            network = self.network_map[ip['floating_network_id']]
            tenant = self._tenant_name(ip['tenant_id'])
            key = (config['cloud'], network, tenant, 'floatingip', ip['status'])
            ips[key] = ips.get(key, 0) + 1
        return ips

    def get_router_ips(self):
        """Count gateway-connected routers per (cloud, network, tenant, 'routerip', status).

        Routers without a gateway port are skipped. A router whose tenant is
        missing from the cache is reported under the same placeholder name
        that get_floating_ips uses, instead of raising KeyError.
        """
        ips = {}
        for router in self.routers:
            if not self._get_router_ip(router['id']):
                continue
            tenant = self._tenant_name(router['tenant_id'])
            network = self.network_map[router['external_gateway_info']['network_id']]
            key = (config['cloud'], network, tenant, 'routerip', router['status'])
            ips[key] = ips.get(key, 0) + 1
        return ips

    def gen_subnet_size(self):
        """Register ``neutron_net_size``: total allocation-pool size per network."""
        # Imported locally so this block does not depend on a file-level
        # import change; netaddr is already a dependency of this file.
        from netaddr import IPRange

        labels = ['cloud', 'network_name']
        net_size = Gauge('neutron_net_size',
                         'Neutron networks size',
                         labels, registry=self.registry)
        for network in self.prodstack['networks']:
            size = 0
            for subnet_id in network['subnets']:
                for pool in self.subnet_map[subnet_id]['pool']:
                    # Compute the size arithmetically; enumerating every
                    # address is unbounded for IPv6 allocation pools.
                    size += IPRange(pool['start'], pool['end']).size
            label_values = [config['cloud'], self.network_map[network['id']]]
            net_size.labels(*label_values).set(size)

    def get_stats(self):
        """Return all Neutron metrics in Prometheus text exposition format."""
        labels = ['cloud', 'subnet_name', 'tenant', 'ip_type', 'ip_status']
        ips = self.get_floating_ips()
        ips.update(self.get_router_ips())
        metrics = Gauge('neutron_public_ip_usage',
                        'Neutron floating IP and router IP usage statistics',
                        labels, registry=self.registry)
        for key, count in ips.items():
            metrics.labels(*key).set(count)
        self.gen_subnet_size()
        return generate_latest(self.registry)
class Nova(object):
    """Collector for Nova hypervisor, instance and overcommit metrics.

    All data comes from the pickled cache file named by
    ``config['cache_file']``; no OpenStack API call is made here.
    """

    # Used for instances whose flavor is absent from the cached flavor list,
    # so that they are still counted without contributing bogus resources.
    _UNKNOWN_FLAVOR = {'ram': 0, 'disk': 0, 'vcpus': 0}

    def __init__(self):
        self.registry = CollectorRegistry()
        self.prodstack = {}
        # NOTE: pickle.load is only acceptable because the cache file is
        # produced by this exporter itself; never point it at untrusted data.
        with open(config['cache_file'], 'rb') as f:
            self.prodstack = pickle.load(f)[0]
        self.hypervisors = self.prodstack['hypervisors']
        self.tenant_map = {t['id']: t['name'] for t in self.prodstack['projects']}
        self.flavor_map = {
            f['id']: {'ram': f['ram'], 'disk': f['disk'], 'vcpus': f['vcpus']}
            for f in self.prodstack['flavors']
        }
        # host name -> status of its nova-compute service
        self.services_map = {
            s['host']: s['status']
            for s in self.prodstack['services']
            if s['binary'] == 'nova-compute'
        }
        # host name -> aggregate name (a later aggregate wins on overlap)
        self.aggregate_map = {}
        for agg in self.prodstack['aggregates']:
            self.aggregate_map.update({i: agg['name'] for i in agg['hosts']})

    def _get_schedulable_instances(self, host):
        """Return how many more reference-size instances fit on ``host``.

        Free capacity is the overcommitted total minus current usage; the
        scarcest of vCPU, RAM and disk determines the result.
        """
        free_vcpus = host['vcpus'] * config['openstack_allocation_ratio_vcpu'] - host['vcpus_used']
        free_ram_mbs = host['memory_mb'] * config['openstack_allocation_ratio_ram'] - host['memory_mb_used']
        free_disk_gbs = host['local_gb'] * config['openstack_allocation_ratio_disk'] - host['local_gb_used']
        s = config['schedulable_instance_size']
        return min(int(free_vcpus / s['vcpu']),
                   int(free_ram_mbs / s['ram_mbs']),
                   int(free_disk_gbs / s['disk_gbs']))

    def _get_schedulable_instances_capacity(self, host):
        """Return how many reference-size instances ``host`` could hold when empty."""
        capacity_vcpus = host['vcpus'] * config['openstack_allocation_ratio_vcpu']
        capacity_ram_mbs = host['memory_mb'] * config['openstack_allocation_ratio_ram']
        capacity_disk_gbs = host['local_gb'] * config['openstack_allocation_ratio_disk']
        s = config['schedulable_instance_size']
        return min(int(capacity_vcpus / s['vcpu']),
                   int(capacity_ram_mbs / s['ram_mbs']),
                   int(capacity_disk_gbs / s['disk_gbs']))

    def _get_instance_flavor(self, instance):
        """Return the flavor resources for ``instance``, zeros when unknown.

        An instance can reference a flavor that is not in the cached flavor
        list; looking it up directly would raise KeyError and fail the scrape.
        """
        flavor_id = (instance.get('flavor') or {}).get('id')
        return self.flavor_map.get(flavor_id, self._UNKNOWN_FLAVOR)

    def gen_hypervisor_stats(self):
        """Register per-hypervisor usage, capacity and schedulability gauges."""
        labels = ['cloud', 'hypervisor_hostname', 'aggregate', 'nova_service_status']
        vms = Gauge('hypervisor_running_vms', 'Number of running VMs', labels, registry=self.registry)
        vcpus_total = Gauge('hypervisor_vcpus_total', 'Total number of vCPUs', labels, registry=self.registry)
        vcpus_used = Gauge('hypervisor_vcpus_used', 'Number of used vCPUs', labels, registry=self.registry)
        mem_total = Gauge('hypervisor_memory_mbs_total', 'Total amount of memory in MBs', labels,
                          registry=self.registry)
        mem_used = Gauge('hypervisor_memory_mbs_used', 'Used memory in MBs', labels, registry=self.registry)
        disk_total = Gauge('hypervisor_disk_gbs_total', 'Total amount of disk space in GBs', labels,
                           registry=self.registry)
        disk_used = Gauge('hypervisor_disk_gbs_used', 'Used disk space in GBs', labels, registry=self.registry)
        schedulable_instances = Gauge('hypervisor_schedulable_instances',
                                      'Number of schedulable instances, see "schedulable_instance_size" option',
                                      labels, registry=self.registry)
        schedulable_instances_capacity = Gauge('hypervisor_schedulable_instances_capacity',
                                               'Number of schedulable instances we have capacity for',
                                               labels, registry=self.registry)
        # Loop-invariant: schedulability is only reported when a reference
        # instance size is configured.
        report_schedulable = bool(config.get("schedulable_instance_size", False))
        for h in self.hypervisors:
            host = h['service']['host']
            # A hypervisor may have no nova-compute service entry in the
            # cache; fall back to 'unknown' as the aggregate lookup does.
            label_values = [config['cloud'], host,
                            self.aggregate_map.get(host, 'unknown'),
                            self.services_map.get(host, 'unknown')]
            vms.labels(*label_values).set(h['running_vms'])
            vcpus_total.labels(*label_values).set(h['vcpus'])
            vcpus_used.labels(*label_values).set(h['vcpus_used'])
            mem_total.labels(*label_values).set(h['memory_mb'])
            mem_used.labels(*label_values).set(h['memory_mb_used'])
            disk_total.labels(*label_values).set(h['local_gb'])
            disk_used.labels(*label_values).set(h['local_gb_used'])
            if report_schedulable:
                schedulable_instances.labels(*label_values).set(self._get_schedulable_instances(h))
                schedulable_instances_capacity.labels(*label_values).set(
                    self._get_schedulable_instances_capacity(h))

    def gen_instance_stats(self):
        """Register per-tenant instance counts and flavor-based resource totals."""
        instances = Gauge('nova_instances',
                          'Nova instances metrics',
                          ['cloud', 'tenant', 'instance_state'], registry=self.registry)
        res_ram = Gauge('nova_resources_ram_mbs',
                        'Nova RAM usage metric',
                        ['cloud', 'tenant'], registry=self.registry)
        res_vcpus = Gauge('nova_resources_vcpus',
                          'Nova vCPU usage metric',
                          ['cloud', 'tenant'], registry=self.registry)
        res_disk = Gauge('nova_resources_disk_gbs',
                         'Nova disk usage metric',
                         ['cloud', 'tenant'], registry=self.registry)
        cloud = config['cloud']
        for i in self.prodstack['instances']:
            # Instances whose project no longer exists are grouped together.
            tenant = self.tenant_map.get(i['tenant_id'], 'orphaned')
            flavor = self._get_instance_flavor(i)
            instances.labels(cloud, tenant, i['status']).inc()
            res_ram.labels(cloud, tenant).inc(flavor['ram'])
            res_vcpus.labels(cloud, tenant).inc(flavor['vcpus'])
            res_disk.labels(cloud, tenant).inc(flavor['disk'])

    def gen_overcommit_stats(self):
        """Register the configured vCPU/RAM/disk allocation ratios."""
        labels = ['cloud', 'resource']
        openstack_overcommit = Gauge('openstack_allocation_ratio', 'Openstack overcommit ratios',
                                     labels, registry=self.registry)
        for resource in ('vcpu', 'ram', 'disk'):
            ratio = config['openstack_allocation_ratio_' + resource]
            openstack_overcommit.labels(config['cloud'], resource).set(ratio)

    def get_stats(self):
        """Return all Nova metrics in Prometheus text exposition format."""
        self.gen_hypervisor_stats()
        self.gen_instance_stats()
        self.gen_overcommit_stats()
        return generate_latest(self.registry)
class Swift(object):
    """Collector for Swift disk usage, quarantine and replication metrics.

    Data is fetched live from the recon endpoint (port 6000) of every host
    listed under ``swift_hosts`` in the configuration.
    """

    # Keys read from the 'replication_stats' section of the account and
    # container replication recon responses.
    REPLICATION_METRICS = ('attempted', 'diff', 'diff_capped', 'empty',
                           'failure', 'hashmatch', 'no_change', 'remote_merge',
                           'remove', 'rsync', 'success', 'ts_repl')

    def __init__(self):
        self.registry = CollectorRegistry()
        self.baseurl = 'http://{}:6000/recon/{}'
        self.swift_hosts = config.get('swift_hosts', [])
        # Seconds to wait on a storage node before giving up. Without a
        # timeout a single unresponsive node blocks the scrape forever.
        self.request_timeout = config.get('swift_request_timeout', 10)

    def _recon(self, host, endpoint):
        """Fetch one recon endpoint from ``host`` and return the decoded JSON."""
        response = requests.get(self.baseurl.format(host, endpoint),
                                timeout=self.request_timeout)
        return response.json()

    def gen_disk_usage_stats(self):
        """Register ``swift_disk_usage_bytes`` (size and used) per device."""
        labels = ['cloud', 'hostname', 'device', 'type']
        swift_disk = Gauge('swift_disk_usage_bytes', 'Swift disk usage in bytes',
                           labels, registry=self.registry)
        for h in self.swift_hosts:
            for disk in self._recon(h, 'diskusage'):
                # Skip entries lacking any of the fields (or with falsy
                # values for them).
                if not all(disk.get(field, False) for field in ('size', 'used', 'device')):
                    continue
                swift_disk.labels(config['cloud'], h, disk['device'], 'size').set(int(disk['size']))
                swift_disk.labels(config['cloud'], h, disk['device'], 'used').set(int(disk['used']))

    def gen_quarantine_stats(self):
        """Register ``swift_quarantined_objects`` per host and ring."""
        labels = ['cloud', 'hostname', 'ring']
        swift_quarantine = Gauge('swift_quarantined_objects', 'Number of quarantined objects',
                                 labels, registry=self.registry)
        for h in self.swift_hosts:
            quarantined = self._recon(h, 'quarantined')
            for ring in ['accounts', 'objects', 'containers']:
                swift_quarantine.labels(config['cloud'], h, ring).set(quarantined.get(ring))

    def gen_replication_stats(self):
        """Register replication duration and per-metric replication counters.

        A TypeError while reading a value is printed and that value is
        skipped, so one incomplete response does not abort the scrape.
        """
        labels = ['cloud', 'hostname', 'ring', 'type']
        swift_repl = Gauge('swift_replication_stats', 'Swift replication stats', labels, registry=self.registry)
        labels = ['cloud', 'hostname', 'ring']
        swift_repl_duration = Gauge('swift_replication_duration_seconds', 'Swift replication duration in seconds',
                                    labels, registry=self.registry)
        for h in self.swift_hosts:
            # Object replication is special: it uses a different key and has
            # no per-metric statistics collected here.
            payload = self._recon(h, 'replication/object')
            try:
                swift_repl_duration.labels(config['cloud'], h, 'object').set(payload['object_replication_time'])
            except TypeError:
                print(traceback.format_exc())

            for ring in ['account', 'container']:
                payload = self._recon(h, 'replication/' + ring)
                try:
                    swift_repl_duration.labels(config['cloud'], h, ring).set(payload['replication_time'])
                except TypeError:
                    print(traceback.format_exc())

                for metric in self.REPLICATION_METRICS:
                    try:
                        swift_repl.labels(config['cloud'], h, ring, metric).set(
                            payload['replication_stats'][metric])
                    except TypeError:
                        print(traceback.format_exc())

    def get_stats(self):
        """Return all Swift metrics in Prometheus text exposition format."""
        self.gen_disk_usage_stats()
        self.gen_quarantine_stats()
        self.gen_replication_stats()
        return generate_latest(self.registry)
class ForkingHTTPServer(ForkingMixIn, HTTPServer):
    """HTTPServer combined with ForkingMixIn; adds no behaviour of its own."""
class OpenstackExporterHandler(BaseHTTPRequestHandler):
    """HTTP handler serving ``/metrics`` plus a small landing page on ``/``."""

    def __init__(self, *args, **kwargs):
        BaseHTTPRequestHandler.__init__(self, *args, **kwargs)

    def do_GET(self):
        """Answer GET requests: metrics, landing page, or 404 for other paths."""
        url = urlparse.urlparse(self.path)
        if url.path == '/metrics':
            try:
                neutron = Neutron()
                nova = Nova()
                swift = Swift()
                output = neutron.get_stats() + \
                    nova.get_stats() + \
                    swift.get_stats() + \
                    data_gatherer.get_stats()
            except Exception:
                # Any collection failure is reported to the scraper as a 500
                # with the traceback as the body.
                self.send_response(500)
                self.send_header('Content-Type', 'text/plain; charset=utf-8')
                self.end_headers()
                self.wfile.write(traceback.format_exc())
            else:
                # Kept outside the try block: if writing the response fails
                # after the 200 status line went out, a second (500) status
                # line must not be appended to the same response.
                self.send_response(200)
                self.send_header('Content-Type', CONTENT_TYPE_LATEST)
                self.end_headers()
                self.wfile.write(output)
        elif url.path == '/':
            self.send_response(200)
            self.send_header('Content-Type', 'text/html; charset=utf-8')
            self.end_headers()
            self.wfile.write("""<html>
<head><title>OpenStack Exporter</title></head>
<body>
<h1>OpenStack Exporter</h1>
<p>Visit <code>/metrics</code> to use.</p>
</body>
</html>""")
        else:
            self.send_response(404)
            self.end_headers()
def handler(*args, **kwargs):
    """Request-handler factory passed to the HTTP server.

    Builds an OpenstackExporterHandler from whatever arguments it is called
    with; the instance is discarded and nothing is returned.
    """
    positional, named = args, kwargs
    OpenstackExporterHandler(*positional, **named)
    return None
if __name__ == '__main__':
    # Entry point: load the YAML configuration, start the background cache
    # refresher, then serve metrics over HTTP until the process is stopped.
    parser = argparse.ArgumentParser(usage=__doc__,
                                     description='Prometheus OpenStack exporter',
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('config_file', nargs='?',
                        help='Configuration file path',
                        default='/etc/prometheus/prometheus-openstack-exporter.yaml',
                        type=argparse.FileType('r'))
    args = parser.parse_args()
    # argparse.FileType hands back an open file; close it once it is parsed.
    with args.config_file as config_stream:
        config = yaml.safe_load(config_stream)
    # Fail with a clear message instead of an AttributeError (empty config)
    # or an opaque socket TypeError (missing port) further down.
    if not isinstance(config, dict) or config.get('listen_port') is None:
        parser.error("'listen_port' must be set in {}".format(args.config_file.name))
    # `config` and `data_gatherer` are module-level names that the collector
    # classes and the request handler read directly.
    data_gatherer = DataGatherer()
    data_gatherer.start()
    server = ForkingHTTPServer(('', config.get('listen_port')), handler)
    server.serve_forever()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment