These changes are the raw update to linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] / kernel / kernel / fork.c
index 1b0e656..46c1e83 100644 (file)
@@ -253,6 +253,7 @@ void __put_task_struct(struct task_struct *tsk)
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
 
+       cgroup_free(tsk);
        task_numa_free(tsk);
        security_task_free(tsk);
        exit_creds(tsk);
@@ -300,6 +301,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
        max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
 void __init fork_init(void)
 {
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -308,7 +314,7 @@ void __init fork_init(void)
 #endif
        /* create a slab on which task_structs can be allocated */
        task_struct_cachep =
-               kmem_cache_create("task_struct", sizeof(struct task_struct),
+               kmem_cache_create("task_struct", arch_task_struct_size,
                        ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
@@ -387,6 +393,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 #endif
        tsk->splice_pipe = NULL;
        tsk->task_frag.page = NULL;
+       tsk->wake_q.next = NULL;
 
        account_kernel_stack(ti, 1);
 
@@ -462,8 +469,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~VM_LOCKED;
+               tmp->vm_flags &=
+                       ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
                tmp->vm_next = tmp->vm_prev = NULL;
+               tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file_inode(file);
@@ -1093,6 +1102,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
        rcu_assign_pointer(tsk->sighand, sig);
        if (!sig)
                return -ENOMEM;
+
        atomic_set(&sig->count, 1);
        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
        return 0;
@@ -1117,13 +1127,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 {
        unsigned long cpu_limit;
 
-       /* Thread group counters. */
-       thread_group_cputime_init(sig);
-
-       cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+       cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
        if (cpu_limit != RLIM_INFINITY) {
                sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
-               sig->cputimer.running = 1;
+               sig->cputimer.running = true;
        }
 
        /* The timer lists. */
@@ -1157,6 +1164,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        init_sigpending(&sig->shared_pending);
        INIT_LIST_HEAD(&sig->posix_timers);
        seqlock_init(&sig->stats_lock);
+       prev_cputime_init(&sig->prev_cputime);
 
        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        sig->real_timer.function = it_real_fn;
@@ -1170,10 +1178,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        tty_audit_fork(sig);
        sched_autogroup_fork(sig);
 
-#ifdef CONFIG_CGROUPS
-       init_rwsem(&sig->group_rwsem);
-#endif
-
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
@@ -1270,10 +1274,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                                        unsigned long stack_size,
                                        int __user *child_tidptr,
                                        struct pid *pid,
-                                       int trace)
+                                       int trace,
+                                       unsigned long tls)
 {
        int retval;
        struct task_struct *p;
+       void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
 
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);
@@ -1308,10 +1314,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
        /*
         * If the new process will be in a different pid or user namespace
-        * do not allow it to share a thread group or signal handlers or
-        * parent with the forking task.
+        * do not allow it to share a thread group with the forking task.
         */
-       if (clone_flags & CLONE_SIGHAND) {
+       if (clone_flags & CLONE_THREAD) {
                if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                    (task_active_pid_ns(current) !=
                                current->nsproxy->pid_ns_for_children))
@@ -1371,9 +1376,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
        p->utime = p->stime = p->gtime = 0;
        p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+       prev_cputime_init(&p->prev_cputime);
+
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        raw_spin_lock_init(&p->vtime_lock);
        seqcount_init(&p->vtime_seq);
@@ -1396,8 +1400,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->real_start_time = ktime_get_boot_ns();
        p->io_context = NULL;
        p->audit_context = NULL;
-       if (clone_flags & CLONE_THREAD)
-               threadgroup_change_begin(current);
+       threadgroup_change_begin(current);
        cgroup_fork(p);
 #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
@@ -1481,7 +1484,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
-       retval = copy_thread(clone_flags, stack_start, stack_size, p);
+       retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
        if (retval)
                goto bad_fork_cleanup_io;
 
@@ -1549,6 +1552,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
 
+       /*
+        * Ensure that the cgroup subsystem policies allow the new process to be
+        * forked. It should be noted that the new process's css_set can be changed
+        * between here and cgroup_post_fork() if an organisation operation is in
+        * progress.
+        */
+       retval = cgroup_can_fork(p, cgrp_ss_priv);
+       if (retval)
+               goto bad_fork_free_pid;
+
        /*
         * Make it visible to the rest of the system, but dont wake it up yet.
         * Need tasklist lock for parent etc handling!
@@ -1585,7 +1598,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                spin_unlock(&current->sighand->siglock);
                write_unlock_irq(&tasklist_lock);
                retval = -ERESTARTNOINTR;
-               goto bad_fork_free_pid;
+               goto bad_fork_cancel_cgroup;
        }
 
        if (likely(p->pid)) {
@@ -1627,9 +1640,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        write_unlock_irq(&tasklist_lock);
 
        proc_fork_connector(p);
-       cgroup_post_fork(p);
-       if (clone_flags & CLONE_THREAD)
-               threadgroup_change_end(current);
+       cgroup_post_fork(p, cgrp_ss_priv);
+       threadgroup_change_end(current);
        perf_event_fork(p);
 
        trace_task_newtask(p, clone_flags);
@@ -1637,6 +1649,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
        return p;
 
+bad_fork_cancel_cgroup:
+       cgroup_cancel_fork(p, cgrp_ss_priv);
 bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
@@ -1668,8 +1682,7 @@ bad_fork_cleanup_policy:
        mpol_put(p->mempolicy);
 bad_fork_cleanup_threadgroup_lock:
 #endif
-       if (clone_flags & CLONE_THREAD)
-               threadgroup_change_end(current);
+       threadgroup_change_end(current);
        delayacct_tsk_free(p);
 bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
@@ -1693,7 +1706,7 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct *fork_idle(int cpu)
 {
        struct task_struct *task;
-       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
        if (!IS_ERR(task)) {
                init_idle_pids(task->pids);
                init_idle(task, cpu);
@@ -1708,11 +1721,12 @@ struct task_struct *fork_idle(int cpu)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long do_fork(unsigned long clone_flags,
+long _do_fork(unsigned long clone_flags,
              unsigned long stack_start,
              unsigned long stack_size,
              int __user *parent_tidptr,
-             int __user *child_tidptr)
+             int __user *child_tidptr,
+             unsigned long tls)
 {
        struct task_struct *p;
        int trace = 0;
@@ -1737,7 +1751,7 @@ long do_fork(unsigned long clone_flags,
        }
 
        p = copy_process(clone_flags, stack_start, stack_size,
-                        child_tidptr, NULL, trace);
+                        child_tidptr, NULL, trace, tls);
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
@@ -1778,20 +1792,34 @@ long do_fork(unsigned long clone_flags,
        return nr;
 }
 
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+/* For compatibility with architectures that call do_fork directly rather than
+ * using the syscall entry points below. */
+long do_fork(unsigned long clone_flags,
+             unsigned long stack_start,
+             unsigned long stack_size,
+             int __user *parent_tidptr,
+             int __user *child_tidptr)
+{
+       return _do_fork(clone_flags, stack_start, stack_size,
+                       parent_tidptr, child_tidptr, 0);
+}
+#endif
+
 /*
  * Create a kernel thread.
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
-       return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-               (unsigned long)arg, NULL, NULL);
+       return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+               (unsigned long)arg, NULL, NULL, 0);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-       return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+       return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
 #else
        /* can not support in nommu mode */
        return -EINVAL;
@@ -1802,8 +1830,8 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-                       0, NULL, NULL);
+       return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+                       0, NULL, NULL, 0);
 }
 #endif
 
@@ -1811,27 +1839,27 @@ SYSCALL_DEFINE0(vfork)
 #ifdef CONFIG_CLONE_BACKWARDS
 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
-                int, tls_val,
+                unsigned long, tls,
                 int __user *, child_tidptr)
 #elif defined(CONFIG_CLONE_BACKWARDS2)
 SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
-                int, tls_val)
+                unsigned long, tls)
 #elif defined(CONFIG_CLONE_BACKWARDS3)
 SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
                int, stack_size,
                int __user *, parent_tidptr,
                int __user *, child_tidptr,
-               int, tls_val)
+               unsigned long, tls)
 #else
 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
-                int, tls_val)
+                unsigned long, tls)
 #endif
 {
-       return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+       return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
 }
 #endif
 
@@ -1966,7 +1994,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
        int err;
 
        /*
-        * If unsharing a user namespace must also unshare the thread.
+        * If unsharing a user namespace must also unshare the thread group
+        * and unshare the filesystem root and working directories.
         */
        if (unshare_flags & CLONE_NEWUSER)
                unshare_flags |= CLONE_THREAD | CLONE_FS;