These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / fs / proc / base.c
index 093ca14..b7de324 100644 (file)
@@ -196,22 +196,214 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
        return result;
 }
 
-static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
-                           struct pid *pid, struct task_struct *task)
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+                                    size_t _count, loff_t *pos)
 {
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       char *page; /* one-page bounce buffer between the target mm and buf */
+       unsigned long count = _count;
+       unsigned long arg_start, arg_end, env_start, env_end;
+       unsigned long len1, len2, len; /* len1: ARGV size, len2: ENVP size */
+       unsigned long p; /* current read address inside the target mm */
+       char c; /* last byte of the ARGV area */
+       ssize_t rv; /* bytes copied to buf so far, or a negative errno */
+       /* Read up to _count bytes of the task's command line at *pos into buf. */
+       BUG_ON(*pos < 0);
+
+       tsk = get_proc_task(file_inode(file));
+       if (!tsk)
+               return -ESRCH;
+       mm = get_task_mm(tsk);
+       put_task_struct(tsk); /* only mm is used from here on */
+       if (!mm)
+               return 0;
+       /* env_end still zero: process not spawned far enough to have cmdline. */
+       if (!mm->env_end) {
+               rv = 0;
+               goto out_mmput;
+       }
+
+       page = (char *)__get_free_page(GFP_TEMPORARY);
+       if (!page) {
+               rv = -ENOMEM;
+               goto out_mmput;
+       }
+
+       down_read(&mm->mmap_sem); /* snapshot the four boundaries under the read lock */
+       arg_start = mm->arg_start;
+       arg_end = mm->arg_end;
+       env_start = mm->env_start;
+       env_end = mm->env_end;
+       up_read(&mm->mmap_sem);
+
+       BUG_ON(arg_start > arg_end); /* inverted boundaries are treated as a kernel bug */
+       BUG_ON(env_start > env_end);
+
+       len1 = arg_end - arg_start;
+       len2 = env_end - env_start;
+
+       /* Empty ARGV: report EOF. */
+       if (len1 == 0) {
+               rv = 0;
+               goto out_free_page;
+       }
        /*
-        * Rely on struct seq_operations::show() being called once
-        * per internal buffer allocation. See single_open(), traverse().
+        * Inherently racy -- command line shares address space with code
+        * and data.  The last byte of ARGV selects the layout handled below.
         */
-       BUG_ON(m->size < PAGE_SIZE);
-       m->count += get_cmdline(task, m->buf, PAGE_SIZE);
-       return 0;
+       rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
+       if (rv <= 0) /* marker byte not read: return 0 or the error as is */
+               goto out_free_page;
+
+       rv = 0;
+
+       if (c == '\0') {
+               /* ARGV ends in NUL: command line (set of strings) is the whole ARGV. */
+               if (len1 <= *pos) /* offset at or past the end of ARGV: EOF */
+                       goto out_free_page;
+
+               p = arg_start + *pos;
+               len = len1 - *pos;
+               while (count > 0 && len > 0) {
+                       unsigned int _count; /* NOTE(review): shadows the _count parameter */
+                       int nr_read;
+
+                       _count = min3(count, len, PAGE_SIZE); /* at most one page per round */
+                       nr_read = access_remote_vm(mm, p, page, _count, 0);
+                       if (nr_read < 0)
+                               rv = nr_read; /* NOTE(review): overwrites bytes already counted in rv */
+                       if (nr_read <= 0)
+                               goto out_free_page;
+
+                       if (copy_to_user(buf, page, nr_read)) {
+                               rv = -EFAULT; /* NOTE(review): earlier rounds' progress is not reported */
+                               goto out_free_page;
+                       }
+
+                       p       += nr_read;
+                       len     -= nr_read;
+                       buf     += nr_read;
+                       count   -= nr_read;
+                       rv      += nr_read;
+               }
+       } else {
+               /*
+                * ARGV does not end in NUL: the command line is one string
+                * that occupies ARGV and maybe extends into ENVP.
+                */
+               if (len1 + len2 <= *pos) /* offset past both ARGV and ENVP: EOF */
+                       goto skip_argv_envp;
+               if (len1 <= *pos) /* offset already lies inside ENVP */
+                       goto skip_argv;
+
+               p = arg_start + *pos;
+               len = len1 - *pos;
+               while (count > 0 && len > 0) {
+                       unsigned int _count, l; /* l: length up to the first NUL in this chunk */
+                       int nr_read;
+                       bool final; /* true once a NUL was found in this chunk */
+
+                       _count = min3(count, len, PAGE_SIZE);
+                       nr_read = access_remote_vm(mm, p, page, _count, 0);
+                       if (nr_read < 0)
+                               rv = nr_read;
+                       if (nr_read <= 0)
+                               goto out_free_page;
+
+                       /*
+                        * Command line can be shorter than whole ARGV even if
+                        * the last "marker" byte says otherwise: stop at first NUL.
+                        */
+                       final = false;
+                       l = strnlen(page, nr_read);
+                       if (l < nr_read) {
+                               nr_read = l; /* copy only the bytes before the NUL */
+                               final = true;
+                       }
+
+                       if (copy_to_user(buf, page, nr_read)) {
+                               rv = -EFAULT;
+                               goto out_free_page;
+                       }
+
+                       p       += nr_read;
+                       len     -= nr_read;
+                       buf     += nr_read;
+                       count   -= nr_read;
+                       rv      += nr_read;
+
+                       if (final)
+                               goto out_free_page;
+               }
+skip_argv:
+               /*
+                * Continue the single string in ENVP: resume inside ENVP when
+                * *pos is already past ARGV, otherwise start at env_start.
+                */
+               if (len1 <= *pos) {
+                       p = env_start + *pos - len1;
+                       len = len1 + len2 - *pos;
+               } else {
+                       p = env_start;
+                       len = len2;
+               }
+               while (count > 0 && len > 0) {
+                       unsigned int _count, l;
+                       int nr_read;
+                       bool final;
+
+                       _count = min3(count, len, PAGE_SIZE);
+                       nr_read = access_remote_vm(mm, p, page, _count, 0);
+                       if (nr_read < 0)
+                               rv = nr_read;
+                       if (nr_read <= 0)
+                               goto out_free_page;
+
+                       /* Find EOS: a NUL in this chunk ends the command line. */
+                       final = false;
+                       l = strnlen(page, nr_read);
+                       if (l < nr_read) {
+                               nr_read = l;
+                               final = true;
+                       }
+
+                       if (copy_to_user(buf, page, nr_read)) {
+                               rv = -EFAULT;
+                               goto out_free_page;
+                       }
+
+                       p       += nr_read;
+                       len     -= nr_read;
+                       buf     += nr_read;
+                       count   -= nr_read;
+                       rv      += nr_read;
+
+                       if (final)
+                               goto out_free_page;
+               }
+skip_argv_envp:
+               ; /* empty statement: a label must be followed by a statement */
+       }
+
+out_free_page:
+       free_page((unsigned long)page);
+out_mmput:
+       mmput(mm);
+       if (rv > 0)
+               *pos += rv; /* advance the file offset only when bytes were returned */
+       return rv;
 }
 
+static const struct file_operations proc_pid_cmdline_ops = { /* backs the "cmdline" REG() entries */
+       .read   = proc_pid_cmdline_read,
+       .llseek = generic_file_llseek, /* read honours *pos, so the file is seekable */
+};
+
 static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
                         struct pid *pid, struct task_struct *task)
 {
-       struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
+       struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (mm && !IS_ERR(mm)) {
                unsigned int nwords = 0;
                do {
@@ -238,13 +430,11 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
 
        wchan = get_wchan(task);
 
-       if (lookup_symbol_name(wchan, symname) < 0) {
-               if (!ptrace_may_access(task, PTRACE_MODE_READ))
-                       return 0;
-               seq_printf(m, "%lu", wchan);
-       } else {
+       if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
+                       && !lookup_symbol_name(wchan, symname))
                seq_printf(m, "%s", symname);
-       }
+       else
+               seq_putc(m, '0');
 
        return 0;
 }
@@ -255,7 +445,7 @@ static int lock_trace(struct task_struct *task)
        int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
        if (err)
                return err;
-       if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+       if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
                mutex_unlock(&task->signal->cred_guard_mutex);
                return -EPERM;
        }
@@ -304,14 +494,17 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 }
 #endif
 
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
 /*
  * Provides /proc/PID/schedstat
  */
 static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
                              struct pid *pid, struct task_struct *task)
 {
-       seq_printf(m, "%llu %llu %lu\n",
+       if (unlikely(!sched_info_on()))
+               seq_printf(m, "0 0 0\n");
+       else
+               seq_printf(m, "%llu %llu %lu\n",
                   (unsigned long long)task->se.sum_exec_runtime,
                   (unsigned long long)task->sched_info.run_delay,
                   task->sched_info.pcount);
@@ -505,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode)
         */
        task = get_proc_task(inode);
        if (task) {
-               allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+               allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
                put_task_struct(task);
        }
        return allowed;
@@ -540,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid,
                return true;
        if (in_group_p(pid->pid_gid))
                return true;
-       return ptrace_may_access(task, PTRACE_MODE_READ);
+       return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
 }
 
 
@@ -617,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
        struct mm_struct *mm = ERR_PTR(-ESRCH);
 
        if (task) {
-               mm = mm_access(task, mode);
+               mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
                put_task_struct(task);
 
                if (!IS_ERR_OR_NULL(mm)) {
@@ -840,6 +1033,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
+/*
+ * /proc/pid/oom_adj exists solely for backwards compatibility with previous
+ * kernels.  The effective policy is defined by oom_score_adj, which has a
+ * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
+ * Values written to oom_adj are simply mapped linearly to oom_score_adj.
+ * Processes that become oom disabled via oom_adj will still be oom disabled
+ * with this implementation.
+ *
+ * oom_adj cannot be removed since existing userspace binaries use it.
+ */
 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
 {
@@ -1035,10 +1238,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
 {
        struct inode * inode = file_inode(file);
-       char *page, *tmp;
-       ssize_t length;
        uid_t loginuid;
        kuid_t kloginuid;
+       int rv;
 
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1047,46 +1249,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        }
        rcu_read_unlock();
 
-       if (count >= PAGE_SIZE)
-               count = PAGE_SIZE - 1;
-
        if (*ppos != 0) {
                /* No partial writes. */
                return -EINVAL;
        }
-       page = (char*)__get_free_page(GFP_TEMPORARY);
-       if (!page)
-               return -ENOMEM;
-       length = -EFAULT;
-       if (copy_from_user(page, buf, count))
-               goto out_free_page;
 
-       page[count] = '\0';
-       loginuid = simple_strtoul(page, &tmp, 10);
-       if (tmp == page) {
-               length = -EINVAL;
-               goto out_free_page;
-
-       }
+       rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+       if (rv < 0)
+               return rv;
 
        /* is userspace tring to explicitly UNSET the loginuid? */
        if (loginuid == AUDIT_UID_UNSET) {
                kloginuid = INVALID_UID;
        } else {
                kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
-               if (!uid_valid(kloginuid)) {
-                       length = -EINVAL;
-                       goto out_free_page;
-               }
+               if (!uid_valid(kloginuid))
+                       return -EINVAL;
        }
 
-       length = audit_set_loginuid(kloginuid);
-       if (likely(length == 0))
-               length = count;
-
-out_free_page:
-       free_page((unsigned long) page);
-       return length;
+       rv = audit_set_loginuid(kloginuid);
+       if (rv < 0)
+               return rv;
+       return count;
 }
 
 static const struct file_operations proc_loginuid_operations = {
@@ -1140,8 +1324,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
                        const char __user * buf, size_t count, loff_t *ppos)
 {
        struct task_struct *task;
-       char buffer[PROC_NUMBUF], *end;
+       char buffer[PROC_NUMBUF];
        int make_it_fail;
+       int rv;
 
        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;
@@ -1150,9 +1335,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;
-       make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
-       if (*end)
-               return -EINVAL;
+       rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+       if (rv < 0)
+               return rv;
        if (make_it_fail < 0 || make_it_fail > 1)
                return -EINVAL;
 
@@ -1380,7 +1565,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
                return -ENOENT;
 }
 
-static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
 {
        struct inode *inode = d_inode(dentry);
        struct path path;
@@ -1394,7 +1579,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
        if (error)
                goto out;
 
-       nd_jump_link(nd, &path);
+       nd_jump_link(&path);
        return NULL;
 out:
        return ERR_PTR(error);
@@ -1641,8 +1826,6 @@ end_instantiate:
        return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
 /*
  * dname_to_vma_addr - maps a dentry name into two unsigned longs
  * which represent vma start and end addresses.
@@ -1669,17 +1852,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
        if (flags & LOOKUP_RCU)
                return -ECHILD;
 
-       if (!capable(CAP_SYS_ADMIN)) {
-               status = -EPERM;
-               goto out_notask;
-       }
-
        inode = d_inode(dentry);
        task = get_proc_task(inode);
        if (!task)
                goto out_notask;
 
-       mm = mm_access(task, PTRACE_MODE_READ);
+       mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (IS_ERR_OR_NULL(mm))
                goto out;
 
@@ -1762,6 +1940,29 @@ struct map_files_info {
        unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
 
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.  Returns ERR_PTR(-EPERM) without the capability.
+ */
+static const char *
+proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+       /* NOTE(review): cookie is not forwarded, NULL is passed -- confirm callee ignores it. */
+       return proc_pid_follow_link(dentry, NULL);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link().
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+       .readlink       = proc_pid_readlink,
+       .follow_link    = proc_map_files_follow_link, /* adds the CAP_SYS_ADMIN check */
+       .setattr        = proc_setattr,
+};
+
 static int
 proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
@@ -1777,7 +1978,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
        ei = PROC_I(inode);
        ei->op.proc_get_link = proc_map_files_get_link;
 
-       inode->i_op = &proc_pid_link_inode_operations;
+       inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;
        inode->i_mode = S_IFLNK;
 
@@ -1801,17 +2002,13 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
        int result;
        struct mm_struct *mm;
 
-       result = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        result = -ENOENT;
        task = get_proc_task(dir);
        if (!task)
                goto out;
 
        result = -EACCES;
-       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+       if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
                goto out_put_task;
 
        result = -ENOENT;
@@ -1858,17 +2055,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
        struct map_files_info *p;
        int ret;
 
-       ret = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        ret = -ENOENT;
        task = get_proc_task(file_inode(file));
        if (!task)
                goto out;
 
        ret = -EACCES;
-       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+       if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
                goto out_put_task;
 
        ret = 0;
@@ -2050,7 +2243,6 @@ static const struct file_operations proc_timers_operations = {
        .llseek         = seq_lseek,
        .release        = seq_release_private,
 };
-#endif /* CONFIG_CHECKPOINT_RESTORE */
 
 static int proc_pident_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2286,35 +2478,24 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 {
        struct task_struct *task;
        struct mm_struct *mm;
-       char buffer[PROC_NUMBUF], *end;
        unsigned int val;
        int ret;
        int i;
        unsigned long mask;
 
-       ret = -EFAULT;
-       memset(buffer, 0, sizeof(buffer));
-       if (count > sizeof(buffer) - 1)
-               count = sizeof(buffer) - 1;
-       if (copy_from_user(buffer, buf, count))
-               goto out_no_task;
-
-       ret = -EINVAL;
-       val = (unsigned int)simple_strtoul(buffer, &end, 0);
-       if (*end == '\n')
-               end++;
-       if (end - buffer == 0)
-               goto out_no_task;
+       ret = kstrtouint_from_user(buf, count, 0, &val);
+       if (ret < 0)
+               return ret;
 
        ret = -ESRCH;
        task = get_proc_task(file_inode(file));
        if (!task)
                goto out_no_task;
 
-       ret = end - buffer;
        mm = get_task_mm(task);
        if (!mm)
                goto out_no_mm;
+       ret = 0;
 
        for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
                if (val & mask)
@@ -2327,7 +2508,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
  out_no_mm:
        put_task_struct(task);
  out_no_task:
-       return ret;
+       if (ret < 0)
+               return ret;
+       return count;
 }
 
 static const struct file_operations proc_coredump_filter_operations = {
@@ -2348,7 +2531,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
        if (result)
                return result;
 
-       if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+       if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                result = -EACCES;
                goto out_unlock;
        }
@@ -2549,9 +2732,7 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
@@ -2572,7 +2753,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",    S_IRUSR, proc_pid_syscall),
 #endif
-       ONE("cmdline",    S_IRUGO, proc_pid_cmdline),
+       REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
        ONE("stat",       S_IRUGO, proc_tgid_stat),
        ONE("statm",      S_IRUGO, proc_pid_statm),
        REG("maps",       S_IRUGO, proc_pid_maps_operations),
@@ -2600,7 +2781,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_STACKTRACE
        ONE("stack",      S_IRUSR, proc_pid_stack),
 #endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
        ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
@@ -2918,11 +3099,11 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",   S_IRUSR, proc_pid_syscall),
 #endif
-       ONE("cmdline",   S_IRUGO, proc_pid_cmdline),
+       REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
        ONE("stat",      S_IRUGO, proc_tid_stat),
        ONE("statm",     S_IRUGO, proc_pid_statm),
        REG("maps",      S_IRUGO, proc_tid_maps_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_PROC_CHILDREN
        REG("children",  S_IRUGO, proc_tid_children_operations),
 #endif
 #ifdef CONFIG_NUMA
@@ -2948,7 +3129,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_STACKTRACE
        ONE("stack",      S_IRUSR, proc_pid_stack),
 #endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
        ONE("schedstat", S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP