Fix some bugs found when testing the OpenSDS Ansible deployment
[stor4nfv.git] / src / ceph / src / pybind / mgr / prometheus / module.py
1 import cherrypy
2 import json
3 import errno
4 import math
5 import os
6 from collections import OrderedDict
7 from mgr_module import MgrModule
8
# Defaults for the Prometheus HTTP server.  Can also set in config-key
# see https://github.com/prometheus/prometheus/wiki/Default-port-allocations
# for Prometheus exporter port registry

DEFAULT_ADDR = '::'   # bind on all interfaces (IPv6 wildcard, usually dual-stack)
DEFAULT_PORT = 9283   # port registered for the Ceph exporter


# cherrypy likes to sys.exit on error.  don't let it take us down too!
def os_exit_noop(*args, **kwargs):
    """No-op stand-in for os._exit: accepts and ignores any arguments."""
    pass


# Monkey-patch os._exit so cherrypy's fatal-error path cannot terminate
# the ceph-mgr process hosting this module.
os._exit = os_exit_noop
23
24
25 # to access things in class Module from subclass Root.  Because
26 # it's a dict, the writer doesn't need to declare 'global' for access
27
28 _global_instance = {'plugin': None}
29
30
31 def global_instance():
32     assert _global_instance['plugin'] is not None
33     return _global_instance['plugin']
34
35
def health_status_to_number(status):
    """Map a Ceph health status string to a numeric metric value.

    'HEALTH_OK' -> 0, 'HEALTH_WARN' -> 1, 'HEALTH_ERR' -> 2.

    Unknown status strings map to 2 (worst case): the old if/elif chain
    returned None for them, which later crashed float conversion in
    Metric.str_expfmt() when the metric was rendered.
    """
    return {
        'HEALTH_OK': 0,
        'HEALTH_WARN': 1,
        'HEALTH_ERR': 2,
    }.get(status, 2)
44
# PG states exported as individual ceph_pg_<state> gauges; composite
# states from pg_summary (e.g. 'active+clean') are split on '+'.
PG_STATES = ['creating', 'active', 'clean', 'down', 'scrubbing', 'degraded',
        'inconsistent', 'peering', 'repair', 'recovering', 'forced-recovery',
        'backfill', 'forced-backfill', 'wait-backfill', 'backfill-toofull',
        'incomplete', 'stale', 'remapped', 'undersized', 'peered']

# Cluster-wide 'df' stats exported as ceph_cluster_<stat>.
DF_CLUSTER = ['total_bytes', 'total_used_bytes', 'total_objects']

# Per-pool 'df' stats exported as ceph_pool_<stat>, labelled by pool_id.
DF_POOL = ['max_avail', 'bytes_used', 'raw_bytes_used', 'objects', 'dirty',
           'quota_bytes', 'quota_objects', 'rd', 'rd_bytes', 'wr', 'wr_bytes']

# Label names for the ceph_osd_metadata metric.
OSD_METADATA = ('cluster_addr', 'device_class', 'id', 'public_addr')

# Per-OSD status fields from the OSD map, each exported as ceph_osd_<state>.
OSD_STATUS = ['weight', 'up', 'in']

# Label names for the ceph_pool_metadata metric.
POOL_METADATA = ('pool_id', 'name')

# Label names for ceph_disk_occupation; 'instance' and 'device' are kept
# deliberately identical to the tag names used by node_exporter.
DISK_OCCUPATION = ('instance', 'device', 'ceph_daemon')
62
63
class Metric(object):
    """One exported metric: a name, Prometheus type, help text, and a
    table of sample values keyed by label-value tuples."""

    def __init__(self, mtype, name, desc, labels=None):
        self.mtype = mtype          # prometheus metric type string
        self.name = name            # bare name; 'ceph_' prefix added on output
        self.desc = desc            # HELP text
        self.labelnames = labels    # tuple if present
        self.value = dict()         # indexed by label values

    def set(self, value, labelvalues=None):
        # labelvalues must be a tuple
        self.value[labelvalues or ('',)] = value

    def str_expfmt(self):
        """Render this metric in the Prometheus exposition text format."""

        def promethize(path):
            ''' replace illegal metric name characters '''
            cleaned = path.replace('.', '_').replace('+', '_plus').replace('::', '_')
            # Hyphens usually turn into underscores, unless they are
            # trailing
            if cleaned.endswith('-'):
                cleaned = cleaned[:-1] + '_minus'
            else:
                cleaned = cleaned.replace('-', '_')
            return 'ceph_{0}'.format(cleaned)

        def floatstr(value):
            ''' represent as Go-compatible float '''
            if math.isnan(value):
                return 'NaN'
            if value == float('inf'):
                return '+Inf'
            if value == float('-inf'):
                return '-Inf'
            return repr(float(value))

        name = promethize(self.name)
        # Leading '' produces the same leading newline as the original
        # triple-quoted header.
        out = [
            '',
            '# HELP {0} {1}'.format(name, self.desc),
            '# TYPE {0} {1}'.format(name, self.mtype),
        ]

        for labelvalues, value in self.value.items():
            if self.labelnames:
                pairs = ('{0}="{1}"'.format(k, v)
                         for k, v in zip(self.labelnames, labelvalues))
                labels = ','.join(pairs)
            else:
                labels = ''
            if labels:
                sample = '{0}{{{1}}} {2}'.format(name, labels, floatstr(value))
            else:
                sample = '{0} {1}'.format(name, floatstr(value))
            out.append(sample)
        return '\n'.join(out)
127
128
class Module(MgrModule):
    """ceph-mgr module exporting cluster metrics to Prometheus.

    Serves the exposition format over HTTP (default [::]:9283).  All
    metrics are refreshed on every scrape via collect().
    """

    COMMANDS = [
        {
            "cmd": "prometheus self-test",
            "desc": "Run a self test on the prometheus module",
            "perm": "rw"
        },
    ]

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        self.notified = False
        self.serving = False
        # Static metrics are created up front; per-daemon perf counters
        # are added lazily in collect().
        self.metrics = self._setup_static_metrics()
        self.schema = OrderedDict()
        # Register so the nested cherrypy Root handler can reach us.
        _global_instance['plugin'] = self

    def _stattype_to_str(self, stattype):
        """Translate a perf-counter type bitmask to a Prometheus metric
        type string.  Returns '' for types we do not export; callers
        skip those counters."""
        typeonly = stattype & self.PERFCOUNTER_TYPE_MASK
        if typeonly == 0:
            return 'gauge'
        if typeonly == self.PERFCOUNTER_LONGRUNAVG:
            # this lie matches the DaemonState decoding: only val, no counts
            return 'counter'
        if typeonly == self.PERFCOUNTER_COUNTER:
            return 'counter'
        if typeonly == self.PERFCOUNTER_HISTOGRAM:
            return 'histogram'

        return ''

    def _setup_static_metrics(self):
        """Create the Metric objects that exist regardless of the perf
        counter schema reported by daemons."""
        metrics = {}
        metrics['health_status'] = Metric(
            'untyped',
            'health_status',
            'Cluster health status'
        )
        metrics['mon_quorum_count'] = Metric(
            'gauge',
            'mon_quorum_count',
            'Monitors in quorum'
        )
        metrics['osd_metadata'] = Metric(
            'untyped',
            'osd_metadata',
            'OSD Metadata',
            OSD_METADATA
        )

        # The reason for having this separate to OSD_METADATA is
        # so that we can stably use the same tag names that
        # the Prometheus node_exporter does
        metrics['disk_occupation'] = Metric(
            'undef',
            'disk_occupation',
            'Associate Ceph daemon with disk used',
            DISK_OCCUPATION
        )

        metrics['pool_metadata'] = Metric(
            'untyped',
            'pool_metadata',
            'POOL Metadata',
            POOL_METADATA
        )
        for state in OSD_STATUS:
            path = 'osd_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'untyped',
                path,
                'OSD status {}'.format(state),
                ('ceph_daemon',)
            )
        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'PG {}'.format(state),
            )
        for state in DF_CLUSTER:
            path = 'cluster_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF {}'.format(state),
            )
        for state in DF_POOL:
            path = 'pool_{}'.format(state)
            self.log.debug("init: creating {}".format(path))
            metrics[path] = Metric(
                'gauge',
                path,
                'DF pool {}'.format(state),
                ('pool_id',)
            )

        return metrics

    def shutdown(self):
        self.serving = False
        # serve() blocks in cherrypy.engine.block(); without stopping the
        # engine the serve thread would never return and the module could
        # not be unloaded cleanly.
        cherrypy.engine.stop()

    def get_health(self):
        """Export overall cluster health (see health_status_to_number)."""
        health = json.loads(self.get('health')['json'])
        self.metrics['health_status'].set(
            health_status_to_number(health['status'])
        )

    def get_df(self):
        """Export cluster-wide and per-pool 'df' statistics."""
        # maybe get the to-be-exported metrics from a config?
        df = self.get('df')
        for stat in DF_CLUSTER:
            path = 'cluster_{}'.format(stat)
            self.metrics[path].set(df['stats'][stat])

        for pool in df['pools']:
            for stat in DF_POOL:
                path = 'pool_{}'.format(stat)
                self.metrics[path].set(pool['stats'][stat], (pool['id'],))

    def get_quorum_status(self):
        """Export the number of monitors currently in quorum."""
        mon_status = json.loads(self.get('mon_status')['json'])
        self.metrics['mon_quorum_count'].set(len(mon_status['quorum']))

    def get_pg_status(self):
        """Export per-state PG counts.

        pg_summary keys are composite states such as 'active+clean', so a
        single state (e.g. 'active') can occur in several keys.  Counts
        are accumulated per state; the previous code overwrote, so the
        last composite key seen silently won.
        """
        # TODO add per pool status?
        pg_s = self.get('pg_summary')['all']

        counts = {}
        for key, value in pg_s.items():
            for state in key.split('+'):
                counts[state] = counts.get(state, 0) + value

        for state in PG_STATES:
            path = 'pg_{}'.format(state)
            # states absent from this summary are explicitly zeroed
            self.metrics[path].set(counts.get(state, 0))

        # A reported state we have no metric for would previously raise
        # KeyError and abort the whole scrape; log and skip it instead.
        for state in counts:
            if state not in PG_STATES:
                self.log.warn(
                    "skipping pg in unknown state {}".format(state))

    def get_metadata_and_osd_status(self):
        """Export OSD metadata/status, pool metadata and the
        disk-occupation mapping used to join with node_exporter data."""
        osd_map = self.get('osd_map')
        osd_devices = self.get('osd_map_crush')['devices']
        for osd in osd_map['osds']:
            id_ = osd['osd']
            # strip the port, keep just the address
            p_addr = osd['public_addr'].split(':')[0]
            c_addr = osd['cluster_addr'].split(':')[0]
            # An OSD missing from the crush device list used to raise
            # StopIteration and kill the scrape; fall back to an empty
            # device class instead.
            dev = next((d for d in osd_devices if d['id'] == id_), None)
            dev_class = dev['class'] if dev else ''
            self.metrics['osd_metadata'].set(0, (
                c_addr,
                dev_class,
                id_,
                p_addr
            ))
            for state in OSD_STATUS:
                status = osd[state]
                self.metrics['osd_{}'.format(state)].set(
                    status,
                    ('osd.{}'.format(id_),))

            osd_metadata = self.get_metadata("osd", str(id_))
            # filestore and bluestore report their data device under
            # different metadata keys; take whichever is present.
            dev_keys = ("backend_filestore_dev_node", "bluestore_bdev_dev_node")
            osd_dev_node = None
            for dev_key in dev_keys:
                val = osd_metadata.get(dev_key, None)
                if val and val != "unknown":
                    osd_dev_node = val
                    break
            osd_hostname = osd_metadata.get('hostname', None)
            if osd_dev_node and osd_hostname:
                self.log.debug("Got dev for osd {0}: {1}/{2}".format(
                    id_, osd_hostname, osd_dev_node))
                self.metrics['disk_occupation'].set(0, (
                    osd_hostname,
                    osd_dev_node,
                    "osd.{0}".format(id_)
                ))
            else:
                self.log.info("Missing dev node metadata for osd {0}, skipping "
                               "occupation record for this osd".format(id_))

        for pool in osd_map['pools']:
            id_ = pool['pool']
            name = pool['pool_name']
            self.metrics['pool_metadata'].set(0, (id_, name))

    def collect(self):
        """Refresh every metric and return the metrics dict.

        Called by the HTTP /metrics handler and by the self-test command.
        """
        self.get_health()
        self.get_df()
        self.get_quorum_status()
        self.get_metadata_and_osd_status()
        self.get_pg_status()

        # .items() rather than the Python-2-only .iteritems(): consistent
        # with the inner loop below and compatible with both 2 and 3.
        for daemon, counters in self.get_all_perf_counters().items():
            for path, counter_info in counters.items():
                stattype = self._stattype_to_str(counter_info['type'])
                # XXX simplify first effort: no histograms
                # averages are already collapsed to one value for us
                if not stattype or stattype == 'histogram':
                    self.log.debug('ignoring %s, type %s' % (path, stattype))
                    continue

                # per-daemon counter metrics are created on first sight
                if path not in self.metrics:
                    self.metrics[path] = Metric(
                        stattype,
                        path,
                        counter_info['description'],
                        ("ceph_daemon",),
                    )

                self.metrics[path].set(
                    counter_info['value'],
                    (daemon,)
                )

        return self.metrics

    def handle_command(self, cmd):
        """Dispatch the mgr commands declared in COMMANDS.

        Returns the (retval, stdout, stderr) triple expected by ceph-mgr.
        """
        if cmd['prefix'] == 'prometheus self-test':
            self.collect()
            return 0, '', 'Self-test OK'
        else:
            return (-errno.EINVAL, '',
                    "Command not found '{0}'".format(cmd['prefix']))

    def serve(self):
        """Run the cherrypy HTTP server; blocks until the engine stops
        (see shutdown())."""

        class Root(object):

            # collapse everything to '/'
            def _cp_dispatch(self, vpath):
                cherrypy.request.path = ''
                return self

            def format_metrics(self, metrics):
                # concatenate the exposition-format chunks of every metric
                formatted = ''
                for m in metrics.values():
                    formatted += m.str_expfmt()
                return formatted + '\n'

            @cherrypy.expose
            def index(self):
                return '''<!DOCTYPE html>
<html>
        <head><title>Ceph Exporter</title></head>
        <body>
                <h1>Ceph Exporter</h1>
                <p><a href='/metrics'>Metrics</a></p>
        </body>
</html>'''

            @cherrypy.expose
            def metrics(self):
                metrics = global_instance().collect()
                cherrypy.response.headers['Content-Type'] = 'text/plain'
                if metrics:
                    return self.format_metrics(metrics)

        server_addr = self.get_localized_config('server_addr', DEFAULT_ADDR)
        server_port = self.get_localized_config('server_port', DEFAULT_PORT)
        self.log.info(
            "server_addr: %s server_port: %s" %
            (server_addr, server_port)
        )

        cherrypy.config.update({
            'server.socket_host': server_addr,
            'server.socket_port': int(server_port),
            'engine.autoreload.on': False
        })
        cherrypy.tree.mount(Root(), "/")
        cherrypy.engine.start()
        cherrypy.engine.block()