Merge "Generating a kvmfornfv rpm for Apex installer which includes the patch for...

[kvmfornfv.git] / kernel / kernel / fork.c
diff --git a/kernel/kernel/fork.c b/kernel/kernel/fork.c

index 1b0e656..3929b80 100644 (file)
--- a/kernel/kernel/fork.c
+++ b/kernel/kernel/fork.c
@@ -253,6 +253,7 @@ void __put_task_struct(struct task_struct *tsk)
         WARN_ON(atomic_read(&tsk->usage));
         WARN_ON(tsk == current);
  
+       cgroup_free(tsk);
         task_numa_free(tsk);
         security_task_free(tsk);
         exit_creds(tsk);
@@ -300,6 +301,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
         max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
  }
  
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
  void __init fork_init(void)
  {
  #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -308,7 +314,7 @@ void __init fork_init(void)
  #endif
         /* create a slab on which task_structs can be allocated */
         task_struct_cachep =
-               kmem_cache_create("task_struct", sizeof(struct task_struct),
+               kmem_cache_create("task_struct", arch_task_struct_size,
                         ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
  #endif
  
@@ -387,6 +393,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
  #endif
         tsk->splice_pipe = NULL;
         tsk->task_frag.page = NULL;
+       tsk->wake_q.next = NULL;
  
         account_kernel_stack(ti, 1);
  
@@ -462,8 +469,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                 tmp->vm_mm = mm;
                 if (anon_vma_fork(tmp, mpnt))
                         goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~VM_LOCKED;
+               tmp->vm_flags &=
+                       ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
                 tmp->vm_next = tmp->vm_prev = NULL;
+               tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                 file = tmp->vm_file;
                 if (file) {
                         struct inode *inode = file_inode(file);
@@ -589,7 +598,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
  #endif
  }
  
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
+       struct user_namespace *user_ns)
  {
         mm->mmap = NULL;
         mm->mm_rb = RB_ROOT;
@@ -629,6 +639,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
         if (init_new_context(p, mm))
                 goto fail_nocontext;
  
+       mm->user_ns = get_user_ns(user_ns);
         return mm;
  
  fail_nocontext:
@@ -674,7 +685,7 @@ struct mm_struct *mm_alloc(void)
                 return NULL;
  
         memset(mm, 0, sizeof(*mm));
-       return mm_init(mm, current);
+       return mm_init(mm, current, current_user_ns());
  }
  
  /*
@@ -689,6 +700,7 @@ void __mmdrop(struct mm_struct *mm)
         destroy_context(mm);
         mmu_notifier_mm_destroy(mm);
         check_mm(mm);
+       put_user_ns(mm->user_ns);
         free_mm(mm);
  }
  EXPORT_SYMBOL_GPL(__mmdrop);
@@ -780,6 +792,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
  }
  EXPORT_SYMBOL(get_mm_exe_file);
  
+/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+       struct file *exe_file = NULL;
+       struct mm_struct *mm;
+
+       task_lock(task);
+       mm = task->mm;
+       if (mm) {
+               if (!(task->flags & PF_KTHREAD))
+                       exe_file = get_mm_exe_file(mm);
+       }
+       task_unlock(task);
+       return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
+
  /**
   * get_task_mm - acquire a reference to the task's mm
   *
@@ -895,14 +930,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
         deactivate_mm(tsk, mm);
  
         /*
-        * If we're exiting normally, clear a user-space tid field if
-        * requested.  We leave this alone when dying by signal, to leave
-        * the value intact in a core dump, and to save the unnecessary
-        * trouble, say, a killed vfork parent shouldn't touch this mm.
-        * Userland only wants this done for a sys_exit.
+        * Signal userspace if we're not exiting with a core dump
+        * because we want to leave the value intact for debugging
+        * purposes.
          */
         if (tsk->clear_child_tid) {
-               if (!(tsk->flags & PF_SIGNALED) &&
+               if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
                     atomic_read(&mm->mm_users) > 1) {
                         /*
                          * We don't check the error code - if userspace has
@@ -938,7 +971,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
  
         memcpy(mm, oldmm, sizeof(*mm));
  
-       if (!mm_init(mm, tsk))
+       if (!mm_init(mm, tsk, mm->user_ns))
                 goto fail_nomem;
  
         err = dup_mmap(mm, oldmm);
@@ -1093,6 +1126,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
         rcu_assign_pointer(tsk->sighand, sig);
         if (!sig)
                 return -ENOMEM;
+
         atomic_set(&sig->count, 1);
         memcpy(sig->action, current->sighand->action, sizeof(sig->action));
         return 0;
@@ -1117,13 +1151,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
  {
         unsigned long cpu_limit;
  
-       /* Thread group counters. */
-       thread_group_cputime_init(sig);
-
-       cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+       cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
         if (cpu_limit != RLIM_INFINITY) {
                 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
-               sig->cputimer.running = 1;
+               sig->cputimer.running = true;
         }
  
         /* The timer lists. */
@@ -1157,6 +1188,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         init_sigpending(&sig->shared_pending);
         INIT_LIST_HEAD(&sig->posix_timers);
         seqlock_init(&sig->stats_lock);
+       prev_cputime_init(&sig->prev_cputime);
  
         hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         sig->real_timer.function = it_real_fn;
@@ -1170,10 +1202,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         tty_audit_fork(sig);
         sched_autogroup_fork(sig);
  
-#ifdef CONFIG_CGROUPS
-       init_rwsem(&sig->group_rwsem);
-#endif
-
         sig->oom_score_adj = current->signal->oom_score_adj;
         sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
@@ -1270,10 +1298,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                                         unsigned long stack_size,
                                         int __user *child_tidptr,
                                         struct pid *pid,
-                                       int trace)
+                                       int trace,
+                                       unsigned long tls)
  {
         int retval;
         struct task_struct *p;
+       void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
  
         if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                 return ERR_PTR(-EINVAL);
@@ -1308,10 +1338,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  
         /*
          * If the new process will be in a different pid or user namespace
-        * do not allow it to share a thread group or signal handlers or
-        * parent with the forking task.
+        * do not allow it to share a thread group with the forking task.
          */
-       if (clone_flags & CLONE_SIGHAND) {
+       if (clone_flags & CLONE_THREAD) {
                 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                     (task_active_pid_ns(current) !=
                                 current->nsproxy->pid_ns_for_children))
@@ -1371,14 +1400,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  
         p->utime = p->stime = p->gtime = 0;
         p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+       prev_cputime_init(&p->prev_cputime);
+
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-       raw_spin_lock_init(&p->vtime_lock);
-       seqcount_init(&p->vtime_seq);
+       seqcount_init(&p->vtime_seqcount);
         p->vtime_snap = 0;
-       p->vtime_snap_whence = VTIME_SLEEPING;
+       p->vtime_snap_whence = VTIME_INACTIVE;
  #endif
  
  #if defined(SPLIT_RSS_COUNTING)
@@ -1396,8 +1423,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         p->real_start_time = ktime_get_boot_ns();
         p->io_context = NULL;
         p->audit_context = NULL;
-       if (clone_flags & CLONE_THREAD)
-               threadgroup_change_begin(current);
         cgroup_fork(p);
  #ifdef CONFIG_NUMA
         p->mempolicy = mpol_dup(p->mempolicy);
@@ -1481,7 +1506,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         retval = copy_io(clone_flags, p);
         if (retval)
                 goto bad_fork_cleanup_namespaces;
-       retval = copy_thread(clone_flags, stack_start, stack_size, p);
+       retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
         if (retval)
                 goto bad_fork_cleanup_io;
  
@@ -1549,6 +1574,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         INIT_LIST_HEAD(&p->thread_group);
         p->task_works = NULL;
  
+       threadgroup_change_begin(current);
+       /*
+        * Ensure that the cgroup subsystem policies allow the new process to be
+        * forked. It should be noted that the new process's css_set can be changed
+        * between here and cgroup_post_fork() if an organisation operation is in
+        * progress.
+        */
+       retval = cgroup_can_fork(p, cgrp_ss_priv);
+       if (retval)
+               goto bad_fork_free_pid;
+
         /*
          * Make it visible to the rest of the system, but dont wake it up yet.
          * Need tasklist lock for parent etc handling!
@@ -1585,7 +1621,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 spin_unlock(&current->sighand->siglock);
                 write_unlock_irq(&tasklist_lock);
                 retval = -ERESTARTNOINTR;
-               goto bad_fork_free_pid;
+               goto bad_fork_cancel_cgroup;
         }
  
         if (likely(p->pid)) {
@@ -1627,9 +1663,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         write_unlock_irq(&tasklist_lock);
  
         proc_fork_connector(p);
-       cgroup_post_fork(p);
-       if (clone_flags & CLONE_THREAD)
-               threadgroup_change_end(current);
+       cgroup_post_fork(p, cgrp_ss_priv);
+       threadgroup_change_end(current);
         perf_event_fork(p);
  
         trace_task_newtask(p, clone_flags);
@@ -1637,7 +1672,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  
         return p;
  
+bad_fork_cancel_cgroup:
+       cgroup_cancel_fork(p, cgrp_ss_priv);
  bad_fork_free_pid:
+       threadgroup_change_end(current);
         if (pid != &init_struct_pid)
                 free_pid(pid);
  bad_fork_cleanup_io:
@@ -1668,8 +1706,6 @@ bad_fork_cleanup_policy:
         mpol_put(p->mempolicy);
  bad_fork_cleanup_threadgroup_lock:
  #endif
-       if (clone_flags & CLONE_THREAD)
-               threadgroup_change_end(current);
         delayacct_tsk_free(p);
  bad_fork_cleanup_count:
         atomic_dec(&p->cred->user->processes);
@@ -1693,7 +1729,7 @@ static inline void init_idle_pids(struct pid_link *links)
  struct task_struct *fork_idle(int cpu)
  {
         struct task_struct *task;
-       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
         if (!IS_ERR(task)) {
                 init_idle_pids(task->pids);
                 init_idle(task, cpu);
@@ -1708,11 +1744,12 @@ struct task_struct *fork_idle(int cpu)
   * It copies the process, and if successful kick-starts
   * it and waits for it to finish using the VM if required.
   */
-long do_fork(unsigned long clone_flags,
+long _do_fork(unsigned long clone_flags,
               unsigned long stack_start,
               unsigned long stack_size,
               int __user *parent_tidptr,
-             int __user *child_tidptr)
+             int __user *child_tidptr,
+             unsigned long tls)
  {
         struct task_struct *p;
         int trace = 0;
@@ -1737,7 +1774,7 @@ long do_fork(unsigned long clone_flags,
         }
  
         p = copy_process(clone_flags, stack_start, stack_size,
-                        child_tidptr, NULL, trace);
+                        child_tidptr, NULL, trace, tls);
         /*
          * Do this prior waking up the new thread - the thread pointer
          * might get invalid after that point, if the thread exits quickly.
@@ -1778,20 +1815,34 @@ long do_fork(unsigned long clone_flags,
         return nr;
  }
  
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+/* For compatibility with architectures that call do_fork directly rather than
+ * using the syscall entry points below. */
+long do_fork(unsigned long clone_flags,
+             unsigned long stack_start,
+             unsigned long stack_size,
+             int __user *parent_tidptr,
+             int __user *child_tidptr)
+{
+       return _do_fork(clone_flags, stack_start, stack_size,
+                       parent_tidptr, child_tidptr, 0);
+}
+#endif
+
  /*
   * Create a kernel thread.
   */
  pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
  {
-       return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-               (unsigned long)arg, NULL, NULL);
+       return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+               (unsigned long)arg, NULL, NULL, 0);
  }
  
  #ifdef __ARCH_WANT_SYS_FORK
  SYSCALL_DEFINE0(fork)
  {
  #ifdef CONFIG_MMU
-       return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+       return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
  #else
         /* can not support in nommu mode */
         return -EINVAL;
@@ -1802,8 +1853,8 @@ SYSCALL_DEFINE0(fork)
  #ifdef __ARCH_WANT_SYS_VFORK
  SYSCALL_DEFINE0(vfork)
  {
-       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-                       0, NULL, NULL);
+       return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+                       0, NULL, NULL, 0);
  }
  #endif
  
@@ -1811,27 +1862,27 @@ SYSCALL_DEFINE0(vfork)
  #ifdef CONFIG_CLONE_BACKWARDS
  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                  int __user *, parent_tidptr,
-                int, tls_val,
+                unsigned long, tls,
                  int __user *, child_tidptr)
  #elif defined(CONFIG_CLONE_BACKWARDS2)
  SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
                  int __user *, parent_tidptr,
                  int __user *, child_tidptr,
-                int, tls_val)
+                unsigned long, tls)
  #elif defined(CONFIG_CLONE_BACKWARDS3)
  SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int, stack_size,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
-               int, tls_val)
+               unsigned long, tls)
  #else
  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                  int __user *, parent_tidptr,
                  int __user *, child_tidptr,
-                int, tls_val)
+                unsigned long, tls)
  #endif
  {
-       return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+       return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
  }
  #endif
  
@@ -1966,7 +2017,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
         int err;
  
         /*
-        * If unsharing a user namespace must also unshare the thread.
+        * If unsharing a user namespace must also unshare the thread group
+        * and unshare the filesystem root and working directories.
          */
         if (unshare_flags & CLONE_NEWUSER)
                 unshare_flags |= CLONE_THREAD | CLONE_FS;