1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <sys/types.h>
18 #include <boost/scoped_ptr.hpp>
25 #include "os/ObjectStore.h"
26 #include "mon/MonClient.h"
27 #include "include/ceph_features.h"
29 #include "common/config.h"
31 #include "mon/MonMap.h"
33 #include "msg/Messenger.h"
35 #include "common/Timer.h"
36 #include "common/TracepointProvider.h"
37 #include "common/ceph_argparse.h"
39 #include "global/global_init.h"
40 #include "global/signal_handler.h"
42 #include "include/color.h"
43 #include "common/errno.h"
44 #include "common/pick_address.h"
46 #include "perfglue/heap_profiler.h"
48 #include "include/assert.h"
50 #define dout_context g_ceph_context
51 #define dout_subsys ceph_subsys_osd
// Tracepoint provider descriptors. Each Traits object is constructed from the
// name of a tracepoint shared library plus further argument(s); main() hands
// these objects to TracepointProvider::initialize<>(). The cyg_profile
// descriptor is only compiled when WITH_OSD_INSTRUMENT_FUNCTIONS is defined.
// NOTE(review): the second string looks like the config option that enables
// the provider -- confirm against TracepointProvider.
55 TracepointProvider::Traits osd_tracepoint_traits("libosd_tp.so",
57 TracepointProvider::Traits os_tracepoint_traits("libos_tp.so",
58 "osd_objectstore_tracing");
59 #ifdef WITH_OSD_INSTRUMENT_FUNCTIONS
60 TracepointProvider::Traits cyg_profile_traits("libcyg_profile_tp.so",
61 "osd_function_tracing");
64 } // anonymous namespace
// Signal callback: forwards the signal number to the `osd` object (the one
// main() assigns with `new OSD`) by calling its handle_signal(). main()
// registers this function as the one-shot handler for SIGINT and SIGTERM.
68 void handle_osd_signal(int signum)
71 osd->handle_signal(signum);
// Write the ceph-osd command-line help text to stdout, then call
// generic_server_usage().
// NOTE(review): main() also parses --dump-pg-log, --dump-journal,
// --get-cluster-fsid, --get-osd-fsid/--get-osd-uuid and
// --get-journal-fsid/--get-journal-uuid, none of which are listed here.
76 cout << "usage: ceph-osd -i <ID> [flags]\n"
77 << " --osd-data PATH data directory\n"
78 << " --osd-journal PATH\n"
79 << " journal file or block device\n"
80 << " --mkfs create a [new] data directory\n"
81 << " --mkkey generate a new secret key. This is normally used in combination with --mkfs\n"
82 << " --convert-filestore\n"
83 << " run any pending upgrade operations\n"
84 << " --flush-journal flush all data out of journal\n"
85 << " --mkjournal initialize a new journal\n"
86 << " --check-wants-journal\n"
87 << " check whether a journal is desired\n"
88 << " --check-allows-journal\n"
89 << " check whether a journal is allowed\n"
90 << " --check-needs-journal\n"
91 << " check whether a journal is required\n"
92 << " --debug_osd <N> set debug level (e.g. 10)\n"
93 << " --get-device-fsid PATH\n"
94 << " get OSD fsid for the given block device\n"
96 generic_server_usage();
// Daemon entry point. When BUILDING_FOR_EMBEDDED is defined, the two cephd_*
// preload hooks are declared and the entry point is exported with C linkage
// as cephd_osd(); the ordinary main() signature follows.
// NOTE(review): the preprocessor lines separating the two signatures are not
// visible here -- confirm they are mutually exclusive.
99 #ifdef BUILDING_FOR_EMBEDDED
100 void cephd_preload_embedded_plugins();
101 void cephd_preload_rados_classes(OSD *osd);
102 extern "C" int cephd_osd(int argc, const char **argv)
104 int main(int argc, const char **argv)
// Collect the command-line arguments into `args` via argv_to_vec().
107 vector<const char*> args;
108 argv_to_vec(argc, argv, args);
// `def_args` carries built-in defaults that are passed to global_init()
// separately from the user's arguments.
111 vector<const char*> def_args;
112 // We want to enable leveldb's log, while allowing users to override this
113 // option, therefore we will pass it as a default argument to global_init().
114 def_args.push_back("--leveldb-log=");
// Global initialization for an OSD-type entity running as a daemon.
116 auto cct = global_init(&def_args, args, CEPH_ENTITY_TYPE_OSD,
117 CODE_ENVIRONMENT_DAEMON,
119 ceph_heap_profiler_init();
// Mode switches chosen on the command line. Every flag declared here starts
// out false and is turned on by the matching branch of the parsing loop.
123 bool mkjournal = false;
124 bool check_wants_journal = false;
125 bool check_allows_journal = false;
126 bool check_needs_journal = false;
128 bool flushjournal = false;
129 bool dump_journal = false;
130 bool convertfilestore = false;
131 bool get_osd_fsid = false;
132 bool get_cluster_fsid = false;
133 bool get_journal_fsid = false;
134 bool get_device_fsid = false;
// Path given to --dump-pg-log; an empty string means the option was not used
// (see the `!dump_pg_log.empty()` test later in main()).
136 std::string dump_pg_log;
// Walk `args` looking for OSD-specific options. The loop header has no
// increment expression, so advancing `i` is left to the body (presumably to
// the ceph_argparse_* helpers -- confirm). ceph_argparse_flag() matches plain
// switches, including aliases such as --get-osd-fsid/--get-osd-uuid;
// ceph_argparse_witharg() matches options that carry a value and stores it
// through the pointer argument (`val`, `device_path`).
139 for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
140 if (ceph_argparse_double_dash(args, i)) {
142 } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
144 } else if (ceph_argparse_flag(args, i, "--mkfs", (char*)NULL)) {
146 } else if (ceph_argparse_flag(args, i, "--mkjournal", (char*)NULL)) {
148 } else if (ceph_argparse_flag(args, i, "--check-allows-journal", (char*)NULL)) {
149 check_allows_journal = true;
150 } else if (ceph_argparse_flag(args, i, "--check-wants-journal", (char*)NULL)) {
151 check_wants_journal = true;
152 } else if (ceph_argparse_flag(args, i, "--check-needs-journal", (char*)NULL)) {
153 check_needs_journal = true;
154 } else if (ceph_argparse_flag(args, i, "--mkkey", (char*)NULL)) {
156 } else if (ceph_argparse_flag(args, i, "--flush-journal", (char*)NULL)) {
158 } else if (ceph_argparse_flag(args, i, "--convert-filestore", (char*)NULL)) {
159 convertfilestore = true;
160 } else if (ceph_argparse_witharg(args, i, &val, "--dump-pg-log", (char*)NULL)) {
162 } else if (ceph_argparse_flag(args, i, "--dump-journal", (char*)NULL)) {
164 } else if (ceph_argparse_flag(args, i, "--get-cluster-fsid", (char*)NULL)) {
165 get_cluster_fsid = true;
166 } else if (ceph_argparse_flag(args, i, "--get-osd-fsid", "--get-osd-uuid", (char*)NULL)) {
168 } else if (ceph_argparse_flag(args, i, "--get-journal-fsid", "--get-journal-uuid", (char*)NULL)) {
169 get_journal_fsid = true;
170 } else if (ceph_argparse_witharg(args, i, &device_path,
171 "--get-device-fsid", (char*)NULL)) {
172 get_device_fsid = true;
// Log args[0] as an unrecognized argument.
// NOTE(review): presumably only reached when arguments are left over after
// the loop -- the guarding condition is not visible here.
178 derr << "unrecognized arg " << args[0] << dendl;
// --get-journal-fsid is served by the device-fsid path below: point
// device_path at the configured journal (osd_journal) and enable that mode.
182 if (get_journal_fsid) {
183 device_path = g_conf->osd_journal;
184 get_device_fsid = true;
// Device-fsid query: probe device_path with
// ObjectStore::probe_block_device_fsid(). A failure is reported on stderr
// with cpp_strerror(r); otherwise `uuid` is printed on stdout.
186 if (get_device_fsid) {
188 int r = ObjectStore::probe_block_device_fsid(g_ceph_context, device_path,
191 cerr << "failed to get device fsid for " << device_path
192 << ": " << cpp_strerror(r) << std::endl;
195 cout << uuid << std::endl;
// --dump-pg-log mode: finish common initialization, read the named file into
// the bufferlist `bl` and step through it with an iterator. The iterator
// offset is saved in `pos` so it can be reported both when a decode throws
// buffer::error and when an entry is printed. If the file cannot be read the
// error text filled in by read_file() is logged instead.
199 if (!dump_pg_log.empty()) {
200 common_init_finish(g_ceph_context);
203 int r = bl.read_file(dump_pg_log.c_str(), &error);
206 bufferlist::iterator p = bl.begin();
208 uint64_t pos = p.get_off();
212 catch (const buffer::error &e) {
213 derr << "failed to decode LogEntry at offset " << pos << dendl;
216 derr << pos << ":\t" << e << dendl;
219 derr << "unable to open " << dump_pg_log << ": " << error << dendl;
// Parse the daemon id (g_conf->name id string) as a base-10 integer. The id
// is rejected when trailing characters remain (*end), when nothing was
// consumed (end == id), or when the value is negative.
226 const char *id = g_conf->name.get_id().c_str();
227 int whoami = strtol(id, &end, 10);
228 if (*end || end == id || whoami < 0) {
229 derr << "must specify '-i #' where # is the osd number" << dendl;
// A data directory (osd_data) is mandatory.
233 if (g_conf->osd_data.empty()) {
234 derr << "must specify '--osd-data=foo' data path" << dendl;
// The object store backend defaults to the osd_objectstore setting. The file
// "<osd_data>/type" is opened read-only; the contents held in `bl`, minus the
// final byte (the trailing newline), replace store_type and are written back
// into the osd_objectstore config value.
// NOTE(review): the read into `bl` and its length guard are not visible here.
239 string store_type = g_conf->osd_objectstore;
242 snprintf(fn, sizeof(fn), "%s/type", g_conf->osd_data.c_str());
243 int fd = ::open(fn, O_RDONLY);
248 store_type = string(bl.c_str(), bl.length() - 1); // drop \n
249 g_conf->set_val("osd_objectstore", store_type);
250 dout(5) << "object store type is " << store_type << dendl;
// Instantiate the backend through the ObjectStore::create() factory; an error
// is logged when creation fails.
255 ObjectStore *store = ObjectStore::create(g_ceph_context,
259 g_conf->osd_os_flags);
261 derr << "unable to create object store" << dendl;
// Embedded builds preload their plugins before any mode-specific work.
265 #ifdef BUILDING_FOR_EMBEDDED
266 cephd_preload_embedded_plugins();
// Key generation (presumably the --mkkey path; the guarding condition is not
// visible here). An empty keyring is created and the file named by
// g_conf->keyring is loaded into it. If an entry for this entity name is
// already present that is reported; otherwise a new AES key is created, added
// to the keyring, encoded as plaintext and written back with mode 0600.
// Both the write-failure and the success message go to derr.
270 common_init_finish(g_ceph_context);
271 KeyRing *keyring = KeyRing::create_empty();
273 derr << "Unable to get a Ceph keyring." << dendl;
277 EntityName ename(g_conf->name);
280 int ret = keyring->load(g_ceph_context, g_conf->keyring);
282 keyring->get_auth(ename, eauth)) {
283 derr << "already have key in keyring " << g_conf->keyring << dendl;
285 eauth.key.create(g_ceph_context, CEPH_CRYPTO_AES);
286 keyring->add(ename, eauth);
288 keyring->encode_plaintext(bl);
289 int r = bl.write_file(g_conf->keyring.c_str(), 0600);
291 derr << TEXT_RED << " ** ERROR: writing new keyring to " << g_conf->keyring
292 << ": " << cpp_strerror(r) << TEXT_NORMAL << dendl;
294 derr << "created new key in keyring " << g_conf->keyring << dendl;
// Object store creation (presumably the --mkfs path; the guarding condition
// is not visible here). A MonClient builds its initial monmap and fetches the
// monmap privately, because the cluster fsid from that map is required: a
// zero fsid is rejected. OSD::mkfs() is then called with the store, the data
// path, the cluster fsid and this OSD's id; the outcome is logged to derr.
298 common_init_finish(g_ceph_context);
299 MonClient mc(g_ceph_context);
300 if (mc.build_initial_monmap() < 0)
302 if (mc.get_monmap_privately() < 0)
305 if (mc.monmap.fsid.is_zero()) {
306 derr << "must specify cluster fsid" << dendl;
310 int err = OSD::mkfs(g_ceph_context, store, g_conf->osd_data,
311 mc.monmap.fsid, whoami);
// `err` is negated before being passed to cpp_strerror().
313 derr << TEXT_RED << " ** ERROR: error creating empty object store in "
314 << g_conf->osd_data << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
317 derr << "created object store " << g_conf->osd_data
318 << " for osd." << whoami << " fsid " << mc.monmap.fsid << dendl;
// Journal creation (presumably the --mkjournal path; the guarding condition
// is not visible here): ask the store to create its journal and log either
// the failure, with cpp_strerror(-err), or the success message to derr.
323 common_init_finish(g_ceph_context);
324 int err = store->mkjournal();
326 derr << TEXT_RED << " ** ERROR: error creating fresh journal " << g_conf->osd_journal
327 << " for object store " << g_conf->osd_data
328 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
331 derr << "created new journal " << g_conf->osd_journal
332 << " for object store " << g_conf->osd_data << dendl;
// Journal capability queries. Each --check-*-journal option asks the store
// the corresponding question (wants_journal / allows_journal / needs_journal)
// and prints a "yes" or "no" line on stdout.
335 if (check_wants_journal) {
336 if (store->wants_journal()) {
337 cout << "wants journal: yes" << std::endl;
340 cout << "wants journal: no" << std::endl;
344 if (check_allows_journal) {
345 if (store->allows_journal()) {
346 cout << "allows journal: yes" << std::endl;
349 cout << "allows journal: no" << std::endl;
353 if (check_needs_journal) {
354 if (store->needs_journal()) {
355 cout << "needs journal: yes" << std::endl;
358 cout << "needs journal: no" << std::endl;
// Journal flush (presumably the --flush-journal path; the guarding condition
// is not visible here). The store is mounted; a mount failure is logged and
// control jumps to the flushjournal_out label. The process then exits with
// status 1 when `err` is negative and 0 otherwise.
363 common_init_finish(g_ceph_context);
364 int err = store->mount();
366 derr << TEXT_RED << " ** ERROR: error flushing journal " << g_conf->osd_journal
367 << " for object store " << g_conf->osd_data
368 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
369 goto flushjournal_out;
372 derr << "flushed journal " << g_conf->osd_journal
373 << " for object store " << g_conf->osd_data
377 exit(err < 0 ? 1 : 0);
// Journal dump (presumably the --dump-journal path; the guarding condition is
// not visible here): the store writes its journal dump to stdout via
// dump_journal(cout), while the failure or success message goes to derr.
380 common_init_finish(g_ceph_context);
381 int err = store->dump_journal(cout);
383 derr << TEXT_RED << " ** ERROR: error dumping journal " << g_conf->osd_journal
384 << " for object store " << g_conf->osd_data
385 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
388 derr << "dumped journal " << g_conf->osd_journal
389 << " for object store " << g_conf->osd_data
// --convert-filestore mode: mount the store, then run store->upgrade().
// A failure of either step is logged with cpp_strerror(-err).
396 if (convertfilestore) {
397 int err = store->mount();
399 derr << TEXT_RED << " ** ERROR: error mounting store " << g_conf->osd_data
400 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
403 err = store->upgrade();
406 derr << TEXT_RED << " ** ERROR: error converting store " << g_conf->osd_data
407 << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl;
// Read the OSD's on-disk identity with OSD::peek_meta(): the magic string,
// the cluster fsid, the OSD fsid and the stored OSD id (`w`). A failure is
// logged together with a hint to verify xattr support on the storage.
414 uuid_d cluster_fsid, osd_fsid;
416 int r = OSD::peek_meta(store, magic, cluster_fsid, osd_fsid, w);
418 derr << TEXT_RED << " ** ERROR: unable to open OSD superblock on "
419 << g_conf->osd_data << ": " << cpp_strerror(-r)
420 << TEXT_NORMAL << dendl;
422 derr << TEXT_RED << " ** please verify that underlying storage "
423 << "supports xattrs" << TEXT_NORMAL << dendl;
// Sanity checks: the stored id is compared with the id given on the command
// line, and the stored magic with CEPH_OSD_ONDISK_MAGIC.
428 derr << "OSD id " << w << " != my id " << whoami << dendl;
431 if (strcmp(magic.c_str(), CEPH_OSD_ONDISK_MAGIC)) {
432 derr << "OSD magic " << magic << " != my " << CEPH_OSD_ONDISK_MAGIC
// Fsid query modes print the requested fsid on stdout.
437 if (get_cluster_fsid) {
438 cout << cluster_fsid << std::endl;
442 cout << osd_fsid << std::endl;
// Choose the public and cluster addresses for this daemon.
446 pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC
447 |CEPH_PICK_ADDRESS_CLUSTER);
// Warn when a cluster address is configured but the public address is blank.
449 if (g_conf->public_addr.is_blank_ip() && !g_conf->cluster_addr.is_blank_ip()) {
451 << " ** WARNING: specified cluster addr but not public addr; we recommend **\n"
452 << " ** you specify neither or both. **"
453 << TEXT_NORMAL << dendl;
// Messenger implementation for each network: the network-specific setting
// (ms_public_type / ms_cluster_type) is used when non-empty, otherwise the
// generic "ms_type" value.
456 std::string public_msgr_type = g_conf->ms_public_type.empty() ? g_conf->get_val<std::string>("ms_type") : g_conf->ms_public_type;
457 std::string cluster_msgr_type = g_conf->ms_cluster_type.empty() ? g_conf->get_val<std::string>("ms_type") : g_conf->ms_cluster_type;
// Create the seven messengers used by the OSD. "client", the two *_front_*
// heartbeat messengers and "ms_objecter" use the public messenger type;
// "cluster" and the two *_back_* heartbeat messengers use the cluster type.
// The public and cluster messengers are flagged HAS_HEAVY_TRAFFIC and
// HAS_MANY_CONNECTIONS; the four heartbeat messengers are flagged HEARTBEAT.
458 Messenger *ms_public = Messenger::create(g_ceph_context, public_msgr_type,
459 entity_name_t::OSD(whoami), "client",
461 Messenger::HAS_HEAVY_TRAFFIC |
462 Messenger::HAS_MANY_CONNECTIONS);
463 Messenger *ms_cluster = Messenger::create(g_ceph_context, cluster_msgr_type,
464 entity_name_t::OSD(whoami), "cluster",
466 Messenger::HAS_HEAVY_TRAFFIC |
467 Messenger::HAS_MANY_CONNECTIONS);
468 Messenger *ms_hb_back_client = Messenger::create(g_ceph_context, cluster_msgr_type,
469 entity_name_t::OSD(whoami), "hb_back_client",
470 getpid(), Messenger::HEARTBEAT);
471 Messenger *ms_hb_front_client = Messenger::create(g_ceph_context, public_msgr_type,
472 entity_name_t::OSD(whoami), "hb_front_client",
473 getpid(), Messenger::HEARTBEAT);
474 Messenger *ms_hb_back_server = Messenger::create(g_ceph_context, cluster_msgr_type,
475 entity_name_t::OSD(whoami), "hb_back_server",
476 getpid(), Messenger::HEARTBEAT);
477 Messenger *ms_hb_front_server = Messenger::create(g_ceph_context, public_msgr_type,
478 entity_name_t::OSD(whoami), "hb_front_server",
479 getpid(), Messenger::HEARTBEAT);
480 Messenger *ms_objecter = Messenger::create(g_ceph_context, public_msgr_type,
481 entity_name_t::OSD(whoami), "ms_objecter",
// Any messenger that failed to be created (null pointer) is caught here.
483 if (!ms_public || !ms_cluster || !ms_hb_front_client || !ms_hb_back_client || !ms_hb_back_server || !ms_hb_front_server || !ms_objecter)
// Tag the messengers with the OSD cluster protocol.
485 ms_cluster->set_cluster_protocol(CEPH_OSD_PROTOCOL);
486 ms_hb_front_client->set_cluster_protocol(CEPH_OSD_PROTOCOL);
487 ms_hb_back_client->set_cluster_protocol(CEPH_OSD_PROTOCOL);
488 ms_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
489 ms_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL);
// Startup banner on stdout: OSD id, public messenger address, data path and
// journal path (or "(no journal)" when osd_journal is empty).
491 cout << "starting osd." << whoami
492 << " at " << ms_public->get_myaddr()
493 << " osd_data " << g_conf->osd_data
494 << " " << ((g_conf->osd_journal.empty()) ?
495 "(no journal)" : g_conf->osd_journal)
// Byte throttle for client traffic, sized by osd_client_message_size_cap and
// owned by a scoped_ptr; it is attached to ms_public's client policy below.
498 boost::scoped_ptr<Throttle> client_byte_throttler(
499 new Throttle(g_ceph_context, "osd_client_bytes",
500 g_conf->osd_client_message_size_cap));
// Feature mask handed to the lossless_peer policy for OSD peers on the
// cluster messenger below (only part of the mask is visible here).
502 // All feature bits 0 - 34 should be present from dumpling v0.67 forward
503 uint64_t osd_required =
505 CEPH_FEATURE_PGID64 |
// Public messenger: stateless-server policy by default, with the client byte
// throttle applied to TYPE_CLIENT peers. Monitors and managers are contacted
// as a lossy client with the UID, PGID64 and OSDENC features.
508 ms_public->set_default_policy(Messenger::Policy::stateless_server(0));
509 ms_public->set_policy_throttlers(entity_name_t::TYPE_CLIENT,
510 client_byte_throttler.get(),
512 ms_public->set_policy(entity_name_t::TYPE_MON,
513 Messenger::Policy::lossy_client(CEPH_FEATURE_UID |
514 CEPH_FEATURE_PGID64 |
515 CEPH_FEATURE_OSDENC));
516 ms_public->set_policy(entity_name_t::TYPE_MGR,
517 Messenger::Policy::lossy_client(CEPH_FEATURE_UID |
518 CEPH_FEATURE_PGID64 |
519 CEPH_FEATURE_OSDENC));
521 //try to poison pill any OSD connections on the wrong address
522 ms_public->set_policy(entity_name_t::TYPE_OSD,
523 Messenger::Policy::stateless_server(0));
// Cluster messenger: stateless-server policy by default and for clients,
// lossy client towards monitors, lossless peer (with osd_required) between
// OSDs.
525 ms_cluster->set_default_policy(Messenger::Policy::stateless_server(0));
526 ms_cluster->set_policy(entity_name_t::TYPE_MON, Messenger::Policy::lossy_client(0));
527 ms_cluster->set_policy(entity_name_t::TYPE_OSD,
528 Messenger::Policy::lossless_peer(osd_required));
529 ms_cluster->set_policy(entity_name_t::TYPE_CLIENT,
530 Messenger::Policy::stateless_server(0));
// Heartbeat messengers: the client side uses the lossy_client policy towards
// OSDs, the server side uses stateless_server.
532 ms_hb_front_client->set_policy(entity_name_t::TYPE_OSD,
533 Messenger::Policy::lossy_client(0));
534 ms_hb_back_client->set_policy(entity_name_t::TYPE_OSD,
535 Messenger::Policy::lossy_client(0));
536 ms_hb_back_server->set_policy(entity_name_t::TYPE_OSD,
537 Messenger::Policy::stateless_server(0));
538 ms_hb_front_server->set_policy(entity_name_t::TYPE_OSD,
539 Messenger::Policy::stateless_server(0));
// Objecter messenger: lossy client with the OSDREPLYMUX feature by default.
541 ms_objecter->set_default_policy(Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX));
// Bind the public and cluster messengers to their configured addresses; the
// result of each bind is stored in `r`.
543 r = ms_public->bind(g_conf->public_addr);
546 r = ms_cluster->bind(g_conf->cluster_addr);
// Optionally give all four heartbeat messengers the min-delay socket
// priority, controlled by osd_heartbeat_use_min_delay_socket.
550 if (g_conf->osd_heartbeat_use_min_delay_socket) {
551 ms_hb_front_client->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY);
552 ms_hb_back_client->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY);
553 ms_hb_back_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY);
554 ms_hb_front_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY);
// Back heartbeat address: osd_heartbeat_addr when set, otherwise the cluster
// address with its port reset to 0 (when it is an IP address). The server
// side uses bind(), the client side client_bind().
557 // hb back should bind to same ip as cluster_addr (if specified)
558 entity_addr_t hb_back_addr = g_conf->osd_heartbeat_addr;
559 if (hb_back_addr.is_blank_ip()) {
560 hb_back_addr = g_conf->cluster_addr;
561 if (hb_back_addr.is_ip())
562 hb_back_addr.set_port(0);
564 r = ms_hb_back_server->bind(hb_back_addr);
567 r = ms_hb_back_client->client_bind(hb_back_addr);
// Front heartbeat address: the public address with its port reset to 0.
571 // hb front should bind to same ip as public_addr
572 entity_addr_t hb_front_addr = g_conf->public_addr;
573 if (hb_front_addr.is_ip())
574 hb_front_addr.set_port(0);
575 r = ms_hb_front_server->bind(hb_front_addr);
578 r = ms_hb_front_client->client_bind(hb_front_addr);
// Daemonize and complete common initialization. This happens after the
// messengers above have been bound.
582 // Set up crypto, daemonize, etc.
583 global_init_daemonize(g_ceph_context);
584 common_init_finish(g_ceph_context);
// Initialize the tracepoint providers described by the file-level Traits
// objects; the cyg_profile one only with WITH_OSD_INSTRUMENT_FUNCTIONS.
586 TracepointProvider::initialize<osd_tracepoint_traits>(g_ceph_context);
587 TracepointProvider::initialize<os_tracepoint_traits>(g_ceph_context);
588 #ifdef WITH_OSD_INSTRUMENT_FUNCTIONS
589 TracepointProvider::initialize<cyg_profile_traits>(g_ceph_context);
// Monitor client for the running daemon; its initial monmap must build.
592 MonClient mc(g_ceph_context);
593 if (mc.build_initial_monmap() < 0)
595 global_init_chdir(g_ceph_context);
// Non-embedded builds preload the erasure-code plugins here.
597 #ifndef BUILDING_FOR_EMBEDDED
598 if (global_init_preload_erasure_code(g_ceph_context) < 0)
// Seed rand() from the current time plus the process id.
602 srand(time(NULL) + getpid());
// Construct the OSD object and assign it to `osd`, the same object that
// handle_osd_signal() forwards signals to.
604 osd = new OSD(g_ceph_context,
616 g_conf->osd_journal);
// pre_init() runs before the messengers are started; a failure is logged.
618 int err = osd->pre_init();
620 derr << TEXT_RED << " ** ERROR: osd pre_init failed: " << cpp_strerror(-err)
621 << TEXT_NORMAL << dendl;
// Start the messengers.
626 ms_hb_front_client->start();
627 ms_hb_back_client->start();
628 ms_hb_front_server->start();
629 ms_hb_back_server->start();
631 ms_objecter->start();
// Logged when OSD initialization fails (the init call itself is not visible
// here).
636 derr << TEXT_RED << " ** ERROR: osd init failed: " << cpp_strerror(-err)
637 << TEXT_NORMAL << dendl;
// Embedded builds preload the RADOS classes once the OSD exists.
641 #ifdef BUILDING_FOR_EMBEDDED
642 cephd_preload_rados_classes(osd);
// SIGHUP goes to sighup_handler; SIGINT and SIGTERM are registered as
// one-shot handlers that call handle_osd_signal().
645 // install signal handlers
646 init_async_signal_handler();
647 register_async_signal_handler(SIGHUP, sighup_handler);
648 register_async_signal_handler_oneshot(SIGINT, handle_osd_signal);
649 register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal);
// When the inject_early_sigterm option is set, the process sends SIGTERM to
// itself right after the handlers are installed.
653 if (g_conf->inject_early_sigterm)
654 kill(getpid(), SIGTERM);
// Wait on the heartbeat messengers via Messenger::wait().
657 ms_hb_front_client->wait();
658 ms_hb_back_client->wait();
659 ms_hb_front_server->wait();
660 ms_hb_back_server->wait();
// Teardown: remove the signal handlers registered above and shut the async
// signal handler down.
664 unregister_async_signal_handler(SIGHUP, sighup_handler);
665 unregister_async_signal_handler(SIGINT, handle_osd_signal);
666 unregister_async_signal_handler(SIGTERM, handle_osd_signal);
667 shutdown_async_signal_handler();
// Free the heartbeat messengers and release the client byte throttle.
672 delete ms_hb_front_client;
673 delete ms_hb_back_client;
674 delete ms_hb_front_server;
675 delete ms_hb_back_server;
679 client_byte_throttler.reset();
// Try to create the per-process directory "gmon/<pid>" and chdir into it;
// the message is only logged when both calls succeed.
681 // cd on exit, so that gmon.out (if any) goes into a separate directory for each node.
683 snprintf(s, sizeof(s), "gmon/%d", getpid());
684 if ((mkdir(s, 0755) == 0) && (chdir(s) == 0)) {
685 dout(0) << "ceph-osd: gmon.out should be in " << s << dendl;