1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2011 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #include "HeartbeatMap.h"
18 #include "ceph_context.h"
19 #include "common/errno.h"
22 #define dout_subsys ceph_subsys_heartbeatmap
24 #define dout_prefix *_dout << "heartbeat_map "
// Constructs the heartbeat map for the given CephContext.
// Visible initializers: the rwlock is created with the name
// "HeartbeatMap::m_rwlock", and both the injected-unhealthy deadline and the
// unhealthy-worker counter start at 0.
// NOTE(review): parts of this definition (start of the initializer list, any
// remaining members, the body) are not visible in this view -- confirm
// against the full file before relying on this summary.
28 HeartbeatMap::HeartbeatMap(CephContext *cct)
30 m_rwlock("HeartbeatMap::m_rwlock"),
31 m_inject_unhealthy_until(0),
32 m_unhealthy_workers(0),
// Destructor. Asserts that m_workers is empty, i.e. that no worker is still
// registered when the map is destroyed.
37 HeartbeatMap::~HeartbeatMap()
39 assert(m_workers.empty());
// Registers a new worker.
//
// @param name       label for the worker; passed to the heartbeat_handle_d
//                   constructor and written to the level-10 log line.
// @param thread_id  pthread id stored in the handle (read later by _check()
//                   when it calls pthread_kill()).
//
// Allocates a heartbeat_handle_d with `new`, inserts it at the front of
// m_workers, and records in the handle both its list position and the
// thread id.
// NOTE(review): the return statement and any locking around the list
// update are not visible in this view; presumably the new handle is
// returned to the caller -- confirm.
42 heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id)
45 ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl;
46 heartbeat_handle_d *h = new heartbeat_handle_d(name);
// Mark accesses to the two deadline fields as benign races; the macro is
// defined outside this file (presumably a race-detector annotation -- confirm).
47 ANNOTATE_BENIGN_RACE_SIZED(&h->timeout, sizeof(h->timeout),
48 "heartbeat_handle_d timeout");
49 ANNOTATE_BENIGN_RACE_SIZED(&h->suicide_timeout, sizeof(h->suicide_timeout),
50 "heartbeat_handle_d suicide_timeout");
// After push_front(), begin() refers to the element just inserted; the
// iterator is saved so remove_worker() can erase this entry directly.
51 m_workers.push_front(h);
52 h->list_item = m_workers.begin();
53 h->thread_id = thread_id;
// Unregisters worker `h`: logs the removal at level 10 and erases the
// worker's entry from m_workers using the iterator stored in h->list_item
// (set by add_worker()).
// NOTE(review): the end of this function is not visible in this view, so
// what happens to the handle's memory and whether a lock is held cannot be
// confirmed from here.
58 void HeartbeatMap::remove_worker(const heartbeat_handle_d *h)
61 ldout(m_cct, 10) << "remove_worker '" << h->name << "'" << dendl;
62 m_workers.erase(h->list_item);
// Checks one worker handle against the time `now`.
//
// @param h    handle to inspect.
// @param who  caller tag prepended to the log messages.
// @param now  reference time the deadlines are compared against.
//
// A deadline value of 0 is treated as "not armed": each test requires the
// value to be non-zero AND strictly earlier than `now`.
//  - Grace deadline expired: logs "had timed out after <grace>" at level 1.
//  - Suicide deadline expired: logs, sends SIGABRT to the worker's thread,
//    then fails an assert.
// NOTE(review): the declaration and first assignment of `was`, and the
// return statement, are not visible in this view. Presumably `was` is first
// loaded from h->timeout and the function returns false when the grace
// deadline has passed (is_healthy() treats a false result as unhealthy) --
// confirm against the full file.
67 bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who, time_t now)
73 if (was && was < now) {
74 ldout(m_cct, 1) << who << " '" << h->name << "'"
75 << " had timed out after " << h->grace << dendl;
// Second stage: re-use `was` for the suicide deadline.
78 was = h->suicide_timeout;
79 if (was && was < now) {
80 ldout(m_cct, 1) << who << " '" << h->name << "'"
81 << " had suicide timed out after " << h->suicide_grace << dendl;
// Signal the stuck worker's own thread before asserting in this thread.
82 pthread_kill(h->thread_id, SIGABRT);
84 assert(0 == "hit suicide timeout");
// Re-arms the deadlines of worker `h`.
//
// @param h              handle to update.
// @param grace          added to the current time to form the new h->timeout.
// @param suicide_grace  added to the current time to form the new
//                       h->suicide_timeout (when that branch is taken), and
//                       stored in h->suicide_grace.
//
// Before updating anything, the handle is run through _check() with the
// current time, so an already-expired deadline is still reported (and a
// passed suicide deadline still aborts).
// NOTE(review): the condition selecting between the two suicide_timeout
// assignments is not visible in this view; presumably the deadline is armed
// only when suicide_grace is non-zero and cleared to 0 otherwise -- confirm.
// Any statement recording `grace` in the handle is likewise not visible.
89 void HeartbeatMap::reset_timeout(heartbeat_handle_d *h, time_t grace, time_t suicide_grace)
91 ldout(m_cct, 20) << "reset_timeout '" << h->name << "' grace " << grace
92 << " suicide " << suicide_grace << dendl;
93 time_t now = time(NULL);
94 _check(h, "reset_timeout", now);
96 h->timeout = now + grace;
100 h->suicide_timeout = now + suicide_grace;
102 h->suicide_timeout = 0;
103 h->suicide_grace = suicide_grace;
// Disarms the deadlines of worker `h`.
// Runs _check() one last time against the current time (so a deadline that
// has already passed is still reported), then sets h->suicide_timeout to 0,
// which _check() treats as "not armed".
// NOTE(review): a line between the _check() call and the suicide_timeout
// reset is not visible in this view; presumably h->timeout is cleared there
// as well -- confirm.
106 void HeartbeatMap::clear_timeout(heartbeat_handle_d *h)
108 ldout(m_cct, 20) << "clear_timeout '" << h->name << "'" << dendl;
109 time_t now = time(NULL);
110 _check(h, "clear_timeout", now);
112 h->suicide_timeout = 0;
// Evaluates the health of all registered workers at the current time.
//
// Visible steps:
//  1. If the heartbeat_inject_failure config option is non-zero, set
//     m_inject_unhealthy_until to now + that value and reset the option to
//     "0", so the injection is consumed once.
//  2. While now is before m_inject_unhealthy_until, log the injected failure.
//  3. Run _check() on every handle in m_workers.
//  4. Store the unhealthy and total counts in m_unhealthy_workers and
//     m_total_workers, then log a summary at level 20.
// NOTE(review): the declarations and updates of `healthy`, `unhealthy` and
// `total`, the tail of the loop, any locking, and the return statement are
// not visible in this view; presumably `healthy` is returned -- confirm.
115 bool HeartbeatMap::is_healthy()
120 time_t now = time(NULL);
121 if (m_cct->_conf->heartbeat_inject_failure) {
122 ldout(m_cct, 0) << "is_healthy injecting failure for next " << m_cct->_conf->heartbeat_inject_failure << " seconds" << dendl;
123 m_inject_unhealthy_until = now + m_cct->_conf->heartbeat_inject_failure;
// Clear the option so the next call does not extend the injection window.
124 m_cct->_conf->set_val("heartbeat_inject_failure", "0");
128 if (now < m_inject_unhealthy_until) {
129 ldout(m_cct, 0) << "is_healthy = false, injected failure for next " << (m_inject_unhealthy_until - now) << " seconds" << dendl;
// Per-worker pass: a false result from _check() marks that worker unhealthy.
133 for (list<heartbeat_handle_d*>::iterator p = m_workers.begin();
134 p != m_workers.end();
136 heartbeat_handle_d *h = *p;
137 if (!_check(h, "is_healthy", now)) {
// Publish the counts read by get_unhealthy_workers() / get_total_workers().
145 m_unhealthy_workers = unhealthy;
146 m_total_workers = total;
148 ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY")
149 << ", total workers: " << total << ", number of unhealthy: " << unhealthy << dendl;
// Returns m_unhealthy_workers, the unhealthy-worker count that is_healthy()
// stores each time it runs (0 after construction).
153 int HeartbeatMap::get_unhealthy_workers() const
155 return m_unhealthy_workers;
// Returns m_total_workers, the total worker count that is_healthy() stores
// each time it runs.
158 int HeartbeatMap::get_total_workers() const
160 return m_total_workers;
// Touches the file named by the heartbeat_file config option.
// Opens the path write-only with O_CREAT and mode 0644 (so it is created if
// absent), then calls utimes() with a NULL times argument, which sets the
// file's access and modification times to the current time. A failure is
// logged at level 0 together with the strerror text for errno.
// NOTE(review): the conditions guarding these steps and the handling of the
// opened fd are not visible in this view. Presumably the touch happens only
// when is_healthy() is true and the path is non-empty, the fd is closed
// after utimes(), and the log line belongs to the open() failure branch --
// confirm against the full file.
163 void HeartbeatMap::check_touch_file()
166 string path = m_cct->_conf->heartbeat_file;
168 int fd = ::open(path.c_str(), O_WRONLY|O_CREAT, 0644);
170 ::utimes(path.c_str(), NULL);
173 ldout(m_cct, 0) << "unable to touch " << path << ": "
174 << cpp_strerror(errno) << dendl;