kernel/fs/ceph/super.c

   1
   2 #include <linux/ceph/ceph_debug.h>
   3
   4 #include <linux/backing-dev.h>
   5 #include <linux/ctype.h>
   6 #include <linux/fs.h>
   7 #include <linux/inet.h>
   8 #include <linux/in6.h>
   9 #include <linux/module.h>
  10 #include <linux/mount.h>
  11 #include <linux/parser.h>
  12 #include <linux/sched.h>
  13 #include <linux/seq_file.h>
  14 #include <linux/slab.h>
  15 #include <linux/statfs.h>
  16 #include <linux/string.h>
  17
  18 #include "super.h"
  19 #include "mds_client.h"
  20 #include "cache.h"
  21
  22 #include <linux/ceph/ceph_features.h>
  23 #include <linux/ceph/decode.h>
  24 #include <linux/ceph/mon_client.h>
  25 #include <linux/ceph/auth.h>
  26 #include <linux/ceph/debugfs.h>
  27
  28 /*
  29  * Ceph superblock operations
  30  *
  31  * Handle the basics of mounting, unmounting.
  32  */
  33
  34 /*
  35  * super ops
  36  */
  37 static void ceph_put_super(struct super_block *s)
  38 {
  39         struct ceph_fs_client *fsc = ceph_sb_to_client(s);
  40
  41         dout("put_super\n");
  42         ceph_mdsc_close_sessions(fsc->mdsc);
  43 }
  44
  45 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  46 {
  47         struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
  48         struct ceph_monmap *monmap = fsc->client->monc.monmap;
  49         struct ceph_statfs st;
  50         u64 fsid;
  51         int err;
  52
  53         dout("statfs\n");
  54         err = ceph_monc_do_statfs(&fsc->client->monc, &st);
  55         if (err < 0)
  56                 return err;
  57
  58         /* fill in kstatfs */
  59         buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
  60
  61         /*
  62          * express utilization in terms of large blocks to avoid
  63          * overflow on 32-bit machines.
  64          *
  65          * NOTE: for the time being, we make bsize == frsize to humor
  66          * not-yet-ancient versions of glibc that are broken.
  67          * Someday, we will probably want to report a real block
  68          * size...  whatever that may mean for a network file system!
  69          */
  70         buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  71         buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  72         buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
  73         buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  74         buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
  75
  76         buf->f_files = le64_to_cpu(st.num_objects);
  77         buf->f_ffree = -1;
  78         buf->f_namelen = NAME_MAX;
  79
  80         /* leave fsid little-endian, regardless of host endianness */
  81         fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
  82         buf->f_fsid.val[0] = fsid & 0xffffffff;
  83         buf->f_fsid.val[1] = fsid >> 32;
  84
  85         return 0;
  86 }
  87
  88
  89 static int ceph_sync_fs(struct super_block *sb, int wait)
  90 {
  91         struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
  92
  93         if (!wait) {
  94                 dout("sync_fs (non-blocking)\n");
  95                 ceph_flush_dirty_caps(fsc->mdsc);
  96                 dout("sync_fs (non-blocking) done\n");
  97                 return 0;
  98         }
  99
 100         dout("sync_fs (blocking)\n");
 101         ceph_osdc_sync(&fsc->client->osdc);
 102         ceph_mdsc_sync(fsc->mdsc);
 103         dout("sync_fs (blocking) done\n");
 104         return 0;
 105 }
 106
 107 /*
 108  * mount options
 109  */
/*
 * Token ids for the ceph-fs mount options listed in fsopt_tokens.
 *
 * The order matters: parse_fsopt_token() classifies a token purely by its
 * numeric position relative to the Opt_last_int / Opt_last_string markers,
 * so new integer options must go before Opt_last_int and new string
 * options between Opt_last_int and Opt_last_string.
 */
enum {
        Opt_wsize,
        Opt_rsize,
        Opt_rasize,
        Opt_caps_wanted_delay_min,
        Opt_caps_wanted_delay_max,
        Opt_cap_release_safety,
        Opt_readdir_max_entries,
        Opt_readdir_max_bytes,
        Opt_congestion_kb,
        Opt_last_int,           /* marker only, has no entry in fsopt_tokens */
        /* int args above */
        Opt_snapdirname,
        Opt_last_string,        /* marker only, has no entry in fsopt_tokens */
        /* string args above */
        /* everything below is a flag-style option without an argument */
        Opt_dirstat,
        Opt_nodirstat,
        Opt_rbytes,
        Opt_norbytes,
        Opt_asyncreaddir,
        Opt_noasyncreaddir,
        Opt_dcache,
        Opt_nodcache,
        Opt_ino32,
        Opt_noino32,
        Opt_fscache,
        Opt_nofscache,
        Opt_poolperm,
        Opt_nopoolperm,
#ifdef CONFIG_CEPH_FS_POSIX_ACL
        /* "acl" only exists with POSIX ACL support; "noacl" always does */
        Opt_acl,
#endif
        Opt_noacl,
};
 144
/*
 * Pattern table handed to match_token() by parse_fsopt_token(): each entry
 * maps a mount-option spelling to its Opt_* token.  "%d" entries carry an
 * integer argument, "%s" a string argument; the rest are bare flags.
 * The {-1, NULL} entry terminates the table.
 */
static match_table_t fsopt_tokens = {
        {Opt_wsize, "wsize=%d"},
        {Opt_rsize, "rsize=%d"},
        {Opt_rasize, "rasize=%d"},
        {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
        {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
        {Opt_cap_release_safety, "cap_release_safety=%d"},
        {Opt_readdir_max_entries, "readdir_max_entries=%d"},
        {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
        /* note: user-visible name differs from the token/field name */
        {Opt_congestion_kb, "write_congestion_kb=%d"},
        /* int args above */
        {Opt_snapdirname, "snapdirname=%s"},
        /* string args above */
        {Opt_dirstat, "dirstat"},
        {Opt_nodirstat, "nodirstat"},
        {Opt_rbytes, "rbytes"},
        {Opt_norbytes, "norbytes"},
        {Opt_asyncreaddir, "asyncreaddir"},
        {Opt_noasyncreaddir, "noasyncreaddir"},
        {Opt_dcache, "dcache"},
        {Opt_nodcache, "nodcache"},
        {Opt_ino32, "ino32"},
        {Opt_noino32, "noino32"},
        /* note: the fscache options are spelled "fsc" / "nofsc" */
        {Opt_fscache, "fsc"},
        {Opt_nofscache, "nofsc"},
        {Opt_poolperm, "poolperm"},
        {Opt_nopoolperm, "nopoolperm"},
#ifdef CONFIG_CEPH_FS_POSIX_ACL
        {Opt_acl, "acl"},
#endif
        {Opt_noacl, "noacl"},
        {-1, NULL}
};
 178
 179 static int parse_fsopt_token(char *c, void *private)
 180 {
 181         struct ceph_mount_options *fsopt = private;
 182         substring_t argstr[MAX_OPT_ARGS];
 183         int token, intval, ret;
 184
 185         token = match_token((char *)c, fsopt_tokens, argstr);
 186         if (token < 0)
 187                 return -EINVAL;
 188
 189         if (token < Opt_last_int) {
 190                 ret = match_int(&argstr[0], &intval);
 191                 if (ret < 0) {
 192                         pr_err("bad mount option arg (not int) "
 193                                "at '%s'\n", c);
 194                         return ret;
 195                 }
 196                 dout("got int token %d val %d\n", token, intval);
 197         } else if (token > Opt_last_int && token < Opt_last_string) {
 198                 dout("got string token %d val %s\n", token,
 199                      argstr[0].from);
 200         } else {
 201                 dout("got token %d\n", token);
 202         }
 203
 204         switch (token) {
 205         case Opt_snapdirname:
 206                 kfree(fsopt->snapdir_name);
 207                 fsopt->snapdir_name = kstrndup(argstr[0].from,
 208                                                argstr[0].to-argstr[0].from,
 209                                                GFP_KERNEL);
 210                 if (!fsopt->snapdir_name)
 211                         return -ENOMEM;
 212                 break;
 213
 214                 /* misc */
 215         case Opt_wsize:
 216                 fsopt->wsize = intval;
 217                 break;
 218         case Opt_rsize:
 219                 fsopt->rsize = intval;
 220                 break;
 221         case Opt_rasize:
 222                 fsopt->rasize = intval;
 223                 break;
 224         case Opt_caps_wanted_delay_min:
 225                 fsopt->caps_wanted_delay_min = intval;
 226                 break;
 227         case Opt_caps_wanted_delay_max:
 228                 fsopt->caps_wanted_delay_max = intval;
 229                 break;
 230         case Opt_readdir_max_entries:
 231                 fsopt->max_readdir = intval;
 232                 break;
 233         case Opt_readdir_max_bytes:
 234                 fsopt->max_readdir_bytes = intval;
 235                 break;
 236         case Opt_congestion_kb:
 237                 fsopt->congestion_kb = intval;
 238                 break;
 239         case Opt_dirstat:
 240                 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
 241                 break;
 242         case Opt_nodirstat:
 243                 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
 244                 break;
 245         case Opt_rbytes:
 246                 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
 247                 break;
 248         case Opt_norbytes:
 249                 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
 250                 break;
 251         case Opt_asyncreaddir:
 252                 fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR;
 253                 break;
 254         case Opt_noasyncreaddir:
 255                 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 256                 break;
 257         case Opt_dcache:
 258                 fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
 259                 break;
 260         case Opt_nodcache:
 261                 fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
 262                 break;
 263         case Opt_ino32:
 264                 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
 265                 break;
 266         case Opt_noino32:
 267                 fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
 268                 break;
 269         case Opt_fscache:
 270                 fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
 271                 break;
 272         case Opt_nofscache:
 273                 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
 274                 break;
 275         case Opt_poolperm:
 276                 fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
 277                 printk ("pool perm");
 278                 break;
 279         case Opt_nopoolperm:
 280                 fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
 281                 break;
 282 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 283         case Opt_acl:
 284                 fsopt->sb_flags |= MS_POSIXACL;
 285                 break;
 286 #endif
 287         case Opt_noacl:
 288                 fsopt->sb_flags &= ~MS_POSIXACL;
 289                 break;
 290         default:
 291                 BUG_ON(token);
 292         }
 293         return 0;
 294 }
 295
 296 static void destroy_mount_options(struct ceph_mount_options *args)
 297 {
 298         dout("destroy_mount_options %p\n", args);
 299         kfree(args->snapdir_name);
 300         kfree(args);
 301 }
 302
 303 static int strcmp_null(const char *s1, const char *s2)
 304 {
 305         if (!s1 && !s2)
 306                 return 0;
 307         if (s1 && !s2)
 308                 return -1;
 309         if (!s1 && s2)
 310                 return 1;
 311         return strcmp(s1, s2);
 312 }
 313
 314 static int compare_mount_options(struct ceph_mount_options *new_fsopt,
 315                                  struct ceph_options *new_opt,
 316                                  struct ceph_fs_client *fsc)
 317 {
 318         struct ceph_mount_options *fsopt1 = new_fsopt;
 319         struct ceph_mount_options *fsopt2 = fsc->mount_options;
 320         int ofs = offsetof(struct ceph_mount_options, snapdir_name);
 321         int ret;
 322
 323         ret = memcmp(fsopt1, fsopt2, ofs);
 324         if (ret)
 325                 return ret;
 326
 327         ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
 328         if (ret)
 329                 return ret;
 330
 331         return ceph_compare_options(new_opt, fsc->client);
 332 }
 333
 334 static int parse_mount_options(struct ceph_mount_options **pfsopt,
 335                                struct ceph_options **popt,
 336                                int flags, char *options,
 337                                const char *dev_name,
 338                                const char **path)
 339 {
 340         struct ceph_mount_options *fsopt;
 341         const char *dev_name_end;
 342         int err;
 343
 344         if (!dev_name || !*dev_name)
 345                 return -EINVAL;
 346
 347         fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
 348         if (!fsopt)
 349                 return -ENOMEM;
 350
 351         dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
 352
 353         fsopt->sb_flags = flags;
 354         fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
 355
 356         fsopt->rsize = CEPH_RSIZE_DEFAULT;
 357         fsopt->rasize = CEPH_RASIZE_DEFAULT;
 358         fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 359         if (!fsopt->snapdir_name) {
 360                 err = -ENOMEM;
 361                 goto out;
 362         }
 363
 364         fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 365         fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
 366         fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
 367         fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
 368         fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
 369         fsopt->congestion_kb = default_congestion_kb();
 370
 371         /*
 372          * Distinguish the server list from the path in "dev_name".
 373          * Internally we do not include the leading '/' in the path.
 374          *
 375          * "dev_name" will look like:
 376          *     <server_spec>[,<server_spec>...]:[<path>]
 377          * where
 378          *     <server_spec> is <ip>[:<port>]
 379          *     <path> is optional, but if present must begin with '/'
 380          */
 381         dev_name_end = strchr(dev_name, '/');
 382         if (dev_name_end) {
 383                 /* skip over leading '/' for path */
 384                 *path = dev_name_end + 1;
 385         } else {
 386                 /* path is empty */
 387                 dev_name_end = dev_name + strlen(dev_name);
 388                 *path = dev_name_end;
 389         }
 390         err = -EINVAL;
 391         dev_name_end--;         /* back up to ':' separator */
 392         if (dev_name_end < dev_name || *dev_name_end != ':') {
 393                 pr_err("device name is missing path (no : separator in %s)\n",
 394                                 dev_name);
 395                 goto out;
 396         }
 397         dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
 398         dout("server path '%s'\n", *path);
 399
 400         *popt = ceph_parse_options(options, dev_name, dev_name_end,
 401                                  parse_fsopt_token, (void *)fsopt);
 402         if (IS_ERR(*popt)) {
 403                 err = PTR_ERR(*popt);
 404                 goto out;
 405         }
 406
 407         /* success */
 408         *pfsopt = fsopt;
 409         return 0;
 410
 411 out:
 412         destroy_mount_options(fsopt);
 413         return err;
 414 }
 415
 416 /**
 417  * ceph_show_options - Show mount options in /proc/mounts
 418  * @m: seq_file to write to
 419  * @root: root of that (sub)tree
 420  */
 421 static int ceph_show_options(struct seq_file *m, struct dentry *root)
 422 {
 423         struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
 424         struct ceph_mount_options *fsopt = fsc->mount_options;
 425         size_t pos;
 426         int ret;
 427
 428         /* a comma between MNT/MS and client options */
 429         seq_putc(m, ',');
 430         pos = m->count;
 431
 432         ret = ceph_print_client_options(m, fsc->client);
 433         if (ret)
 434                 return ret;
 435
 436         /* retract our comma if no client options */
 437         if (m->count == pos)
 438                 m->count--;
 439
 440         if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
 441                 seq_puts(m, ",dirstat");
 442         if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
 443                 seq_puts(m, ",norbytes");
 444         if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
 445                 seq_puts(m, ",noasyncreaddir");
 446         if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
 447                 seq_puts(m, ",nodcache");
 448         if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
 449                 seq_puts(m, ",fsc");
 450         if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
 451                 seq_puts(m, ",nopoolperm");
 452
 453 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 454         if (fsopt->sb_flags & MS_POSIXACL)
 455                 seq_puts(m, ",acl");
 456         else
 457                 seq_puts(m, ",noacl");
 458 #endif
 459
 460         if (fsopt->wsize)
 461                 seq_printf(m, ",wsize=%d", fsopt->wsize);
 462         if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
 463                 seq_printf(m, ",rsize=%d", fsopt->rsize);
 464         if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
 465                 seq_printf(m, ",rasize=%d", fsopt->rasize);
 466         if (fsopt->congestion_kb != default_congestion_kb())
 467                 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
 468         if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
 469                 seq_printf(m, ",caps_wanted_delay_min=%d",
 470                          fsopt->caps_wanted_delay_min);
 471         if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
 472                 seq_printf(m, ",caps_wanted_delay_max=%d",
 473                            fsopt->caps_wanted_delay_max);
 474         if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
 475                 seq_printf(m, ",cap_release_safety=%d",
 476                            fsopt->cap_release_safety);
 477         if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
 478                 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
 479         if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
 480                 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
 481         if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
 482                 seq_show_option(m, "snapdirname", fsopt->snapdir_name);
 483
 484         return 0;
 485 }
 486
 487 /*
 488  * handle any mon messages the standard library doesn't understand.
 489  * return error if we don't either.
 490  */
 491 static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
 492 {
 493         struct ceph_fs_client *fsc = client->private;
 494         int type = le16_to_cpu(msg->hdr.type);
 495
 496         switch (type) {
 497         case CEPH_MSG_MDS_MAP:
 498                 ceph_mdsc_handle_map(fsc->mdsc, msg);
 499                 return 0;
 500
 501         default:
 502                 return -1;
 503         }
 504 }
 505
 506 /*
 507  * create a new fs client
 508  */
 509 static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 510                                         struct ceph_options *opt)
 511 {
 512         struct ceph_fs_client *fsc;
 513         const u64 supported_features =
 514                 CEPH_FEATURE_FLOCK |
 515                 CEPH_FEATURE_DIRLAYOUTHASH |
 516                 CEPH_FEATURE_MDS_INLINE_DATA;
 517         const u64 required_features = 0;
 518         int page_count;
 519         size_t size;
 520         int err = -ENOMEM;
 521
 522         fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
 523         if (!fsc)
 524                 return ERR_PTR(-ENOMEM);
 525
 526         fsc->client = ceph_create_client(opt, fsc, supported_features,
 527                                          required_features);
 528         if (IS_ERR(fsc->client)) {
 529                 err = PTR_ERR(fsc->client);
 530                 goto fail;
 531         }
 532         fsc->client->extra_mon_dispatch = extra_mon_dispatch;
 533         fsc->client->monc.want_mdsmap = 1;
 534
 535         fsc->mount_options = fsopt;
 536
 537         fsc->sb = NULL;
 538         fsc->mount_state = CEPH_MOUNT_MOUNTING;
 539
 540         atomic_long_set(&fsc->writeback_count, 0);
 541
 542         err = bdi_init(&fsc->backing_dev_info);
 543         if (err < 0)
 544                 goto fail_client;
 545
 546         err = -ENOMEM;
 547         /*
 548          * The number of concurrent works can be high but they don't need
 549          * to be processed in parallel, limit concurrency.
 550          */
 551         fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
 552         if (fsc->wb_wq == NULL)
 553                 goto fail_bdi;
 554         fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
 555         if (fsc->pg_inv_wq == NULL)
 556                 goto fail_wb_wq;
 557         fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
 558         if (fsc->trunc_wq == NULL)
 559                 goto fail_pg_inv_wq;
 560
 561         /* set up mempools */
 562         err = -ENOMEM;
 563         page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
 564         size = sizeof (struct page *) * (page_count ? page_count : 1);
 565         fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
 566         if (!fsc->wb_pagevec_pool)
 567                 goto fail_trunc_wq;
 568
 569         /* setup fscache */
 570         if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
 571             (ceph_fscache_register_fs(fsc) != 0))
 572                 goto fail_fscache;
 573
 574         /* caps */
 575         fsc->min_caps = fsopt->max_readdir;
 576
 577         return fsc;
 578
 579 fail_fscache:
 580         ceph_fscache_unregister_fs(fsc);
 581 fail_trunc_wq:
 582         destroy_workqueue(fsc->trunc_wq);
 583 fail_pg_inv_wq:
 584         destroy_workqueue(fsc->pg_inv_wq);
 585 fail_wb_wq:
 586         destroy_workqueue(fsc->wb_wq);
 587 fail_bdi:
 588         bdi_destroy(&fsc->backing_dev_info);
 589 fail_client:
 590         ceph_destroy_client(fsc->client);
 591 fail:
 592         kfree(fsc);
 593         return ERR_PTR(err);
 594 }
 595
/*
 * Tear down everything create_fs_client() set up and free @fsc.
 *
 * This also frees fsc->mount_options and calls the debugfs cleanup, so the
 * caller must not touch either afterwards.
 */
static void destroy_fs_client(struct ceph_fs_client *fsc)
{
        dout("destroy_fs_client %p\n", fsc);

        ceph_fscache_unregister_fs(fsc);

        destroy_workqueue(fsc->wb_wq);
        destroy_workqueue(fsc->pg_inv_wq);
        destroy_workqueue(fsc->trunc_wq);

        bdi_destroy(&fsc->backing_dev_info);

        mempool_destroy(fsc->wb_pagevec_pool);

        destroy_mount_options(fsc->mount_options);

        ceph_fs_debugfs_cleanup(fsc);

        ceph_destroy_client(fsc->client);

        kfree(fsc);
        /* only the pointer value is printed; fsc is not dereferenced */
        dout("destroy_fs_client %p done\n", fsc);
}
 619
/*
 * Slab caches for the ceph-fs object types.  They are created by
 * init_caches() and destroyed by destroy_caches(); they have external
 * linkage so that other files of the module can allocate from them.
 */
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
 628
 629 static void ceph_inode_init_once(void *foo)
 630 {
 631         struct ceph_inode_info *ci = foo;
 632         inode_init_once(&ci->vfs_inode);
 633 }
 634
 635 static int __init init_caches(void)
 636 {
 637         int error = -ENOMEM;
 638
 639         ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 640                                       sizeof(struct ceph_inode_info),
 641                                       __alignof__(struct ceph_inode_info),
 642                                       (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
 643                                       ceph_inode_init_once);
 644         if (ceph_inode_cachep == NULL)
 645                 return -ENOMEM;
 646
 647         ceph_cap_cachep = KMEM_CACHE(ceph_cap,
 648                                      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 649         if (ceph_cap_cachep == NULL)
 650                 goto bad_cap;
 651         ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
 652                                            SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 653         if (ceph_cap_flush_cachep == NULL)
 654                 goto bad_cap_flush;
 655
 656         ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
 657                                         SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 658         if (ceph_dentry_cachep == NULL)
 659                 goto bad_dentry;
 660
 661         ceph_file_cachep = KMEM_CACHE(ceph_file_info,
 662                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 663         if (ceph_file_cachep == NULL)
 664                 goto bad_file;
 665
 666         if ((error = ceph_fscache_register()))
 667                 goto bad_file;
 668
 669         return 0;
 670 bad_file:
 671         kmem_cache_destroy(ceph_dentry_cachep);
 672 bad_dentry:
 673         kmem_cache_destroy(ceph_cap_flush_cachep);
 674 bad_cap_flush:
 675         kmem_cache_destroy(ceph_cap_cachep);
 676 bad_cap:
 677         kmem_cache_destroy(ceph_inode_cachep);
 678         return error;
 679 }
 680
/*
 * Counterpart of init_caches(): destroy all five slab caches and
 * unregister from fscache.
 */
static void destroy_caches(void)
{
        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();

        kmem_cache_destroy(ceph_inode_cachep);
        kmem_cache_destroy(ceph_cap_cachep);
        kmem_cache_destroy(ceph_cap_flush_cachep);
        kmem_cache_destroy(ceph_dentry_cachep);
        kmem_cache_destroy(ceph_file_cachep);

        ceph_fscache_unregister();
}
 697
 698
 699 /*
 700  * ceph_umount_begin - initiate forced umount.  Tear down down the
 701  * mount, skipping steps that may hang while waiting for server(s).
 702  */
 703 static void ceph_umount_begin(struct super_block *sb)
 704 {
 705         struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 706
 707         dout("ceph_umount_begin - starting forced umount\n");
 708         if (!fsc)
 709                 return;
 710         fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
 711         ceph_mdsc_force_umount(fsc->mdsc);
 712         return;
 713 }
 714
/*
 * Superblock operations, installed on each ceph superblock by
 * ceph_set_super().  The inode callbacks (alloc/destroy/write/drop) are
 * defined outside this part of the file; the remaining ones are the
 * static functions above.
 */
static const struct super_operations ceph_super_ops = {
        .alloc_inode    = ceph_alloc_inode,
        .destroy_inode  = ceph_destroy_inode,
        .write_inode    = ceph_write_inode,
        .drop_inode     = ceph_drop_inode,
        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
        .show_options   = ceph_show_options,
        .statfs         = ceph_statfs,
        .umount_begin   = ceph_umount_begin,
};
 726
 727 /*
 728  * Bootstrap mount by opening the root directory.  Note the mount
 729  * @started time from caller, and time out if this takes too long.
 730  */
 731 static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 732                                        const char *path,
 733                                        unsigned long started)
 734 {
 735         struct ceph_mds_client *mdsc = fsc->mdsc;
 736         struct ceph_mds_request *req = NULL;
 737         int err;
 738         struct dentry *root;
 739
 740         /* open dir */
 741         dout("open_root_inode opening '%s'\n", path);
 742         req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
 743         if (IS_ERR(req))
 744                 return ERR_CAST(req);
 745         req->r_path1 = kstrdup(path, GFP_NOFS);
 746         if (!req->r_path1) {
 747                 root = ERR_PTR(-ENOMEM);
 748                 goto out;
 749         }
 750
 751         req->r_ino1.ino = CEPH_INO_ROOT;
 752         req->r_ino1.snap = CEPH_NOSNAP;
 753         req->r_started = started;
 754         req->r_timeout = fsc->client->options->mount_timeout;
 755         req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 756         req->r_num_caps = 2;
 757         err = ceph_mdsc_do_request(mdsc, NULL, req);
 758         if (err == 0) {
 759                 struct inode *inode = req->r_target_inode;
 760                 req->r_target_inode = NULL;
 761                 dout("open_root_inode success\n");
 762                 if (ceph_ino(inode) == CEPH_INO_ROOT &&
 763                     fsc->sb->s_root == NULL) {
 764                         root = d_make_root(inode);
 765                         if (!root) {
 766                                 root = ERR_PTR(-ENOMEM);
 767                                 goto out;
 768                         }
 769                 } else {
 770                         root = d_obtain_root(inode);
 771                 }
 772                 ceph_init_dentry(root);
 773                 dout("open_root_inode success, root dentry is %p\n", root);
 774         } else {
 775                 root = ERR_PTR(err);
 776         }
 777 out:
 778         ceph_mdsc_put_request(req);
 779         return root;
 780 }
 781
 782
 783
 784
 785 /*
 786  * mount: join the ceph cluster, and open root directory.
 787  */
 788 static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
 789                       const char *path)
 790 {
 791         int err;
 792         unsigned long started = jiffies;  /* note the start time */
 793         struct dentry *root;
 794         int first = 0;   /* first vfsmount for this super_block */
 795
 796         dout("mount start\n");
 797         mutex_lock(&fsc->client->mount_mutex);
 798
 799         err = __ceph_open_session(fsc->client, started);
 800         if (err < 0)
 801                 goto out;
 802
 803         dout("mount opening root\n");
 804         root = open_root_dentry(fsc, "", started);
 805         if (IS_ERR(root)) {
 806                 err = PTR_ERR(root);
 807                 goto out;
 808         }
 809         if (fsc->sb->s_root) {
 810                 dput(root);
 811         } else {
 812                 fsc->sb->s_root = root;
 813                 first = 1;
 814
 815                 err = ceph_fs_debugfs_init(fsc);
 816                 if (err < 0)
 817                         goto fail;
 818         }
 819
 820         if (path[0] == 0) {
 821                 dget(root);
 822         } else {
 823                 dout("mount opening base mountpoint\n");
 824                 root = open_root_dentry(fsc, path, started);
 825                 if (IS_ERR(root)) {
 826                         err = PTR_ERR(root);
 827                         goto fail;
 828                 }
 829         }
 830
 831         fsc->mount_state = CEPH_MOUNT_MOUNTED;
 832         dout("mount success\n");
 833         mutex_unlock(&fsc->client->mount_mutex);
 834         return root;
 835
 836 out:
 837         mutex_unlock(&fsc->client->mount_mutex);
 838         return ERR_PTR(err);
 839
 840 fail:
 841         if (first) {
 842                 dput(fsc->sb->s_root);
 843                 fsc->sb->s_root = NULL;
 844         }
 845         goto out;
 846 }
 847
 848 static int ceph_set_super(struct super_block *s, void *data)
 849 {
 850         struct ceph_fs_client *fsc = data;
 851         int ret;
 852
 853         dout("set_super %p data %p\n", s, data);
 854
 855         s->s_flags = fsc->mount_options->sb_flags;
 856         s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
 857
 858         s->s_xattr = ceph_xattr_handlers;
 859         s->s_fs_info = fsc;
 860         fsc->sb = s;
 861
 862         s->s_op = &ceph_super_ops;
 863         s->s_export_op = &ceph_export_ops;
 864
 865         s->s_time_gran = 1000;  /* 1000 ns == 1 us */
 866
 867         ret = set_anon_super(s, NULL);  /* what is that second arg for? */
 868         if (ret != 0)
 869                 goto fail;
 870
 871         return ret;
 872
 873 fail:
 874         s->s_fs_info = NULL;
 875         fsc->sb = NULL;
 876         return ret;
 877 }
 878
 879 /*
 880  * share superblock if same fs AND options
 881  */
 882 static int ceph_compare_super(struct super_block *sb, void *data)
 883 {
 884         struct ceph_fs_client *new = data;
 885         struct ceph_mount_options *fsopt = new->mount_options;
 886         struct ceph_options *opt = new->client->options;
 887         struct ceph_fs_client *other = ceph_sb_to_client(sb);
 888
 889         dout("ceph_compare_super %p\n", sb);
 890
 891         if (compare_mount_options(fsopt, opt, other)) {
 892                 dout("monitor(s)/mount options don't match\n");
 893                 return 0;
 894         }
 895         if ((opt->flags & CEPH_OPT_FSID) &&
 896             ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
 897                 dout("fsid doesn't match\n");
 898                 return 0;
 899         }
 900         if (fsopt->sb_flags != other->mount_options->sb_flags) {
 901                 dout("flags differ\n");
 902                 return 0;
 903         }
 904         return 1;
 905 }
 906
 907 /*
 908  * construct our own bdi so we can control readahead, etc.
 909  */
 910 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 911
 912 static int ceph_register_bdi(struct super_block *sb,
 913                              struct ceph_fs_client *fsc)
 914 {
 915         int err;
 916
 917         /* set ra_pages based on rasize mount option? */
 918         if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
 919                 fsc->backing_dev_info.ra_pages =
 920                         (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
 921                         >> PAGE_SHIFT;
 922         else
 923                 fsc->backing_dev_info.ra_pages =
 924                         VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
 925
 926         err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
 927                            atomic_long_inc_return(&bdi_seq));
 928         if (!err)
 929                 sb->s_bdi = &fsc->backing_dev_info;
 930         return err;
 931 }
 932
 933 static struct dentry *ceph_mount(struct file_system_type *fs_type,
 934                        int flags, const char *dev_name, void *data)
 935 {
 936         struct super_block *sb;
 937         struct ceph_fs_client *fsc;
 938         struct dentry *res;
 939         int err;
 940         int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 941         const char *path = NULL;
 942         struct ceph_mount_options *fsopt = NULL;
 943         struct ceph_options *opt = NULL;
 944
 945         dout("ceph_mount\n");
 946
 947 #ifdef CONFIG_CEPH_FS_POSIX_ACL
 948         flags |= MS_POSIXACL;
 949 #endif
 950         err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
 951         if (err < 0) {
 952                 res = ERR_PTR(err);
 953                 goto out_final;
 954         }
 955
 956         /* create client (which we may/may not use) */
 957         fsc = create_fs_client(fsopt, opt);
 958         if (IS_ERR(fsc)) {
 959                 res = ERR_CAST(fsc);
 960                 destroy_mount_options(fsopt);
 961                 ceph_destroy_options(opt);
 962                 goto out_final;
 963         }
 964
 965         err = ceph_mdsc_init(fsc);
 966         if (err < 0) {
 967                 res = ERR_PTR(err);
 968                 goto out;
 969         }
 970
 971         if (ceph_test_opt(fsc->client, NOSHARE))
 972                 compare_super = NULL;
 973         sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);
 974         if (IS_ERR(sb)) {
 975                 res = ERR_CAST(sb);
 976                 goto out;
 977         }
 978
 979         if (ceph_sb_to_client(sb) != fsc) {
 980                 ceph_mdsc_destroy(fsc);
 981                 destroy_fs_client(fsc);
 982                 fsc = ceph_sb_to_client(sb);
 983                 dout("get_sb got existing client %p\n", fsc);
 984         } else {
 985                 dout("get_sb using new client %p\n", fsc);
 986                 err = ceph_register_bdi(sb, fsc);
 987                 if (err < 0) {
 988                         res = ERR_PTR(err);
 989                         goto out_splat;
 990                 }
 991         }
 992
 993         res = ceph_real_mount(fsc, path);
 994         if (IS_ERR(res))
 995                 goto out_splat;
 996         dout("root %p inode %p ino %llx.%llx\n", res,
 997              d_inode(res), ceph_vinop(d_inode(res)));
 998         return res;
 999
1000 out_splat:
1001         ceph_mdsc_close_sessions(fsc->mdsc);
1002         deactivate_locked_super(sb);
1003         goto out_final;
1004
1005 out:
1006         ceph_mdsc_destroy(fsc);
1007         destroy_fs_client(fsc);
1008 out_final:
1009         dout("ceph_mount fail %ld\n", PTR_ERR(res));
1010         return res;
1011 }
1012
1013 static void ceph_kill_sb(struct super_block *s)
1014 {
1015         struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1016         dev_t dev = s->s_dev;
1017
1018         dout("kill_sb %p\n", s);
1019
1020         ceph_mdsc_pre_umount(fsc->mdsc);
1021         generic_shutdown_super(s);
1022         ceph_mdsc_destroy(fsc);
1023
1024         destroy_fs_client(fsc);
1025         free_anon_bdev(dev);
1026 }
1027
1028 static struct file_system_type ceph_fs_type = {
1029         .owner          = THIS_MODULE,
1030         .name           = "ceph",
1031         .mount          = ceph_mount,
1032         .kill_sb        = ceph_kill_sb,
1033         .fs_flags       = FS_RENAME_DOES_D_MOVE,
1034 };
1035 MODULE_ALIAS_FS("ceph");
1036
1037 static int __init init_ceph(void)
1038 {
1039         int ret = init_caches();
1040         if (ret)
1041                 goto out;
1042
1043         ceph_flock_init();
1044         ceph_xattr_init();
1045         ret = ceph_snap_init();
1046         if (ret)
1047                 goto out_xattr;
1048         ret = register_filesystem(&ceph_fs_type);
1049         if (ret)
1050                 goto out_snap;
1051
1052         pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1053
1054         return 0;
1055
1056 out_snap:
1057         ceph_snap_exit();
1058 out_xattr:
1059         ceph_xattr_exit();
1060         destroy_caches();
1061 out:
1062         return ret;
1063 }
1064
1065 static void __exit exit_ceph(void)
1066 {
1067         dout("exit_ceph\n");
1068         unregister_filesystem(&ceph_fs_type);
1069         ceph_snap_exit();
1070         ceph_xattr_exit();
1071         destroy_caches();
1072 }
1073
/* module entry and exit points */
module_init(init_ceph);
module_exit(exit_ceph);

/* module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
MODULE_DESCRIPTION("Ceph filesystem for Linux");
MODULE_LICENSE("GPL");