These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] / kernel / net / core / sock.c
index 6317c71..9c32342 100644
 #include <linux/ipsec.h>
 #include <net/cls_cgroup.h>
 #include <net/netprio_cgroup.h>
+#include <linux/sock_diag.h>
 
 #include <linux/filter.h>
 
@@ -421,13 +422,23 @@ static void sock_warn_obsolete_bsdism(const char *name)
        }
 }
 
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+static bool sock_needs_netstamp(const struct sock *sk)
+{
+       switch (sk->sk_family) {
+       case AF_UNSPEC:
+       case AF_UNIX:
+               return false;
+       default:
+               return true;
+       }
+}
 
 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 {
        if (sk->sk_flags & flags) {
                sk->sk_flags &= ~flags;
-               if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
+               if (sock_needs_netstamp(sk) &&
+                   !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
                        net_disable_timestamp();
        }
 }
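
Why this helper exists: net_enable_timestamp()/net_disable_timestamp() flip a global static key that costs every packet on the network receive path, while AF_UNIX traffic never takes that path (its timestamps are taken inline). A condensed sketch of the enable-side pairing, with a hypothetical helper name (the real functions in this patch additionally track SK_FLAGS_TIMESTAMP transitions):

static void sock_set_netstamp_flag(struct sock *sk, int flag)
{
        if (!sock_flag(sk, flag)) {
                sock_set_flag(sk, flag);
                /* only families that use the network RX path pay for
                 * the global netstamp static key
                 */
                if (sock_needs_netstamp(sk))
                        net_enable_timestamp();
        }
}
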
@@ -861,7 +872,8 @@ set_rcvbuf:
 
                if (val & SOF_TIMESTAMPING_OPT_ID &&
                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
-                       if (sk->sk_protocol == IPPROTO_TCP) {
+                       if (sk->sk_protocol == IPPROTO_TCP &&
+                           sk->sk_type == SOCK_STREAM) {
                                if (sk->sk_state != TCP_ESTABLISHED) {
                                        ret = -EINVAL;
                                        break;
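
The added sk_type check presumably keeps non-stream sockets (for example a raw socket created with protocol number 6, which stores IPPROTO_TCP in sk_protocol) out of the TCP-only OPT_ID setup. Userspace usage is unchanged; a minimal sketch for a connected TCP socket:

#include <linux/net_tstamp.h>
#include <sys/socket.h>

/* For stream TCP the socket must already be established, otherwise
 * SOF_TIMESTAMPING_OPT_ID is rejected with -EINVAL.
 */
static int enable_tx_timestamp_ids(int fd)
{
        int val = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_OPT_ID;

        return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}
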
@@ -987,6 +999,10 @@ set_rcvbuf:
                                         sk->sk_max_pacing_rate);
                break;
 
+       case SO_INCOMING_CPU:
+               sk->sk_incoming_cpu = val;
+               break;
+
        default:
                ret = -ENOPROTOOPT;
                break;
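
A hedged userspace sketch of the new setter: recording the CPU a thread services lets kernel-side steering (for instance SO_REUSEPORT group selection, an assumption about how this tree uses sk_incoming_cpu) prefer the socket whose stored CPU matches the one the packet arrived on:

#include <sys/socket.h>

static int pin_incoming_cpu(int fd, int cpu)
{
        /* before this change SO_INCOMING_CPU was read-only */
        return setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));
}
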
@@ -1393,9 +1409,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx);
  *     @family: protocol family
  *     @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
  *     @prot: struct proto associated with this new sock instance
+ *     @kern: is this to be a kernel socket?
  */
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
-                     struct proto *prot)
+                     struct proto *prot, int kern)
 {
        struct sock *sk;
 
@@ -1408,7 +1425,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                 */
                sk->sk_prot = sk->sk_prot_creator = prot;
                sock_lock_init(sk);
-               sock_net_set(sk, get_net(net));
+               sk->sk_net_refcnt = kern ? 0 : 1;
+               if (likely(sk->sk_net_refcnt))
+                       get_net(net);
+               sock_net_set(sk, net);
                atomic_set(&sk->sk_wmem_alloc, 1);
 
                sock_update_classid(sk);
@@ -1419,7 +1439,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 }
 EXPORT_SYMBOL(sk_alloc);
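
A hedged caller sketch for the widened signature (example_proto and example_create are hypothetical): kernel-internal sockets pass kern=1, skip get_net(), and so no longer pin a dying network namespace, which is what makes the sk_release_kernel() workaround removed below unnecessary:

static struct proto example_proto;      /* hypothetical */

static struct sock *example_create(struct net *net, int kern)
{
        /* kern=1: sk_net_refcnt stays 0, no netns reference is taken,
         * and sk_destruct() symmetrically skips put_net()
         */
        return sk_alloc(net, AF_INET, GFP_KERNEL, &example_proto, kern);
}
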
 
-static void __sk_free(struct sock *sk)
+void sk_destruct(struct sock *sk)
 {
        struct sk_filter *filter;
 
@@ -1442,10 +1462,19 @@ static void __sk_free(struct sock *sk)
        if (sk->sk_peer_cred)
                put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);
-       put_net(sock_net(sk));
+       if (likely(sk->sk_net_refcnt))
+               put_net(sock_net(sk));
        sk_prot_free(sk->sk_prot_creator, sk);
 }
 
+static void __sk_free(struct sock *sk)
+{
+       if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+               sock_diag_broadcast_destroy(sk);
+       else
+               sk_destruct(sk);
+}
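
The split lets __sk_free() hand the last step of teardown to sock_diag when a destroy listener is subscribed. A sketch of the contract the broadcast side must keep, not the real sock_diag implementation:

static void sock_diag_broadcast_destroy_sketch(struct sock *sk)
{
        /* ...multicast a destroy notification to the listeners... */
        sk_destruct(sk);        /* then finish what __sk_free() deferred */
}
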
+
 void sk_free(struct sock *sk)
 {
        /*
@@ -1458,25 +1487,6 @@ void sk_free(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_free);
 
-/*
- * Last sock_put should drop reference to sk->sk_net. It has already
- * been dropped in sk_change_net. Taking reference to stopping namespace
- * is not an option.
- * Take reference to a socket to remove it from hash _alive_ and after that
- * destroy it in the context of init_net.
- */
-void sk_release_kernel(struct sock *sk)
-{
-       if (sk == NULL || sk->sk_socket == NULL)
-               return;
-
-       sock_hold(sk);
-       sock_release(sk->sk_socket);
-       sock_net_set(sk, get_net(&init_net));
-       sock_put(sk);
-}
-EXPORT_SYMBOL(sk_release_kernel);
-
 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
 {
        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
@@ -1502,7 +1512,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                sock_copy(newsk, sk);
 
                /* SANITY */
-               get_net(sock_net(newsk));
+               if (likely(newsk->sk_net_refcnt))
+                       get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
@@ -1518,7 +1529,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
 
-               spin_lock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
@@ -1541,7 +1551,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                         */
                        is_charged = sk_filter_charge(newsk, filter);
 
-               if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+               if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
@@ -1582,7 +1592,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                if (newsk->sk_prot->sockets_allocated)
                        sk_sockets_allocated_inc(newsk);
 
-               if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+               if (sock_needs_netstamp(sk) &&
+                   newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                        net_enable_timestamp();
        }
 out:
@@ -1592,7 +1603,9 @@ EXPORT_SYMBOL_GPL(sk_clone_lock);
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
-       __sk_dst_set(sk, dst);
+       u32 max_segs = 1;
+
+       sk_dst_set(sk, dst);
        sk->sk_route_caps = dst->dev->features;
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
@@ -1603,9 +1616,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
                } else {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = dst->dev->gso_max_size;
-                       sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+                       max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
                }
        }
+       sk->sk_gso_max_segs = max_segs;
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
@@ -1640,6 +1654,28 @@ void sock_wfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_wfree);
 
+void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+{
+       skb_orphan(skb);
+       skb->sk = sk;
+#ifdef CONFIG_INET
+       if (unlikely(!sk_fullsock(sk))) {
+               skb->destructor = sock_edemux;
+               sock_hold(sk);
+               return;
+       }
+#endif
+       skb->destructor = sock_wfree;
+       skb_set_hash_from_sk(skb, sk);
+       /*
+        * We used to take a refcount on sk, but the following operation
+        * is enough to guarantee sk_free() won't free this sock until
+        * all in-flight packets are completed
+        */
+       atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+}
+EXPORT_SYMBOL(skb_set_owner_w);
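
Usage sketch for the newly exported helper (charge_clone is hypothetical): skb_set_owner_w() charges the skb to the socket's write budget, and sock_wfree() later releases the charge; sk_wmem_alloc, not a refcount, is what keeps the socket alive while packets are in flight:

static struct sk_buff *charge_clone(struct sk_buff *skb, struct sock *sk)
{
        struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

        if (nskb)
                skb_set_owner_w(nskb, sk);  /* freed via sock_wfree() */
        return nskb;
}
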
+
 void skb_orphan_partial(struct sk_buff *skb)
 {
        /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
@@ -1777,7 +1813,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 {
        DEFINE_WAIT(wait);
 
-       clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+       sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        for (;;) {
                if (!timeo)
                        break;
@@ -1823,7 +1859,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
                        break;
 
-               set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+               sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
@@ -1853,6 +1889,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
+int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
+                  struct sockcm_cookie *sockc)
+{
+       struct cmsghdr *cmsg;
+
+       for_each_cmsghdr(cmsg, msg) {
+               if (!CMSG_OK(msg, cmsg))
+                       return -EINVAL;
+               if (cmsg->cmsg_level != SOL_SOCKET)
+                       continue;
+               switch (cmsg->cmsg_type) {
+               case SO_MARK:
+                       if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+                               return -EPERM;
+                       if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+                               return -EINVAL;
+                       sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+                       break;
+               default:
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+EXPORT_SYMBOL(sock_cmsg_send);
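
From userspace this parser is reached through sendmsg() control data: non-SOL_SOCKET levels are skipped here, other SOL_SOCKET types fail with -EINVAL, and SO_MARK still requires CAP_NET_ADMIN. An illustrative sender:

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t send_with_mark(int fd, const void *buf, size_t len,
                              uint32_t mark)
{
        char cbuf[CMSG_SPACE(sizeof(uint32_t))];
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SO_MARK;      /* needs CAP_NET_ADMIN */
        cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
        memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));

        return sendmsg(fd, &msg, 0);
}
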
+
 /* On 32bit arches, an skb frag is limited to 2^15 */
 #define SKB_FRAG_PAGE_ORDER    get_order(32768)
 
@@ -1880,8 +1942,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 
        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER) {
-               pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
-                                         __GFP_NOWARN | __GFP_NORETRY,
+               /* Avoid direct reclaim but allow kswapd to wake */
+               pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+                                         __GFP_COMP | __GFP_NOWARN |
+                                         __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
@@ -1969,21 +2033,22 @@ static void __release_sock(struct sock *sk)
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk:    sock to wait on
  * @timeo: for how long
+ * @skb:   last skb seen on sk_receive_queue
  *
  * Now socket state including sk->sk_err is changed only under lock,
  * hence we may omit checks after joining wait queue.
  * We check receive queue before schedule() only as optimization;
  * it is very likely that release_sock() added new data.
  */
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
 {
        int rc;
        DEFINE_WAIT(wait);
 
        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-       set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-       rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
-       clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+       sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+       rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
+       sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        finish_wait(sk_sleep(sk), &wait);
        return rc;
 }
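
Callers now pass the last skb they have already seen: with MSG_PEEK the receive queue can be non-empty yet hold nothing new, and the old !skb_queue_empty() condition returned immediately in a busy loop. A condensed receive-side sketch (wait_for_more is hypothetical):

static int wait_for_more(struct sock *sk, long *timeo)
{
        const struct sk_buff *last = skb_peek_tail(&sk->sk_receive_queue);

        /* wakes up only when the tail actually changes, or on timeout */
        return sk_wait_data(sk, timeo, last);
}
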
@@ -2078,14 +2143,15 @@ suppress_allocation:
 EXPORT_SYMBOL(__sk_mem_schedule);
 
 /**
- *     __sk_reclaim - reclaim memory_allocated
+ *     __sk_mem_reclaim - reclaim memory_allocated
  *     @sk: socket
+ *     @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
  */
-void __sk_mem_reclaim(struct sock *sk)
+void __sk_mem_reclaim(struct sock *sk, int amount)
 {
-       sk_memory_allocated_sub(sk,
-                               sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
-       sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+       amount >>= SK_MEM_QUANTUM_SHIFT;
+       sk_memory_allocated_sub(sk, amount);
+       sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
 
        if (sk_under_memory_pressure(sk) &&
            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
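
The reclaim entry point now takes an explicit byte count. The previous behaviour, releasing everything above the partial quantum, is recovered by passing sk_forward_alloc, roughly what the sk_mem_reclaim() wrapper in net/sock.h does (a sketch, not the exact inline):

static void sk_mem_reclaim_sketch(struct sock *sk)
{
        if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
                __sk_mem_reclaim(sk, sk->sk_forward_alloc);
}
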
@@ -2270,7 +2336,6 @@ static void sock_def_write_space(struct sock *sk)
 
 static void sock_def_destruct(struct sock *sk)
 {
-       kfree(sk->sk_protinfo);
 }
 
 void sk_send_sigurg(struct sock *sk)
@@ -2321,7 +2386,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        } else
                sk->sk_wq       =       NULL;
 
-       spin_lock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);
        lockdep_set_class_and_name(&sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
@@ -2353,6 +2417,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
+       sk->sk_incoming_cpu = -1;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
@@ -2478,7 +2543,8 @@ void sock_enable_timestamp(struct sock *sk, int flag)
                 * time stamping, but time stamping might have been on
                 * already because of the other one
                 */
-               if (!(previous_flags & SK_FLAGS_TIMESTAMP))
+               if (sock_needs_netstamp(sk) &&
+                   !(previous_flags & SK_FLAGS_TIMESTAMP))
                        net_enable_timestamp();
        }
 }
@@ -2739,10 +2805,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
                return;
        kfree(rsk_prot->slab_name);
        rsk_prot->slab_name = NULL;
-       if (rsk_prot->slab) {
-               kmem_cache_destroy(rsk_prot->slab);
-               rsk_prot->slab = NULL;
-       }
+       kmem_cache_destroy(rsk_prot->slab);
+       rsk_prot->slab = NULL;
 }
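
This cleanup, and the matching one in proto_unregister() below, rely on kmem_cache_destroy() accepting a NULL cache (a no-op since v4.3), so the open-coded guard was redundant. The same pattern in miniature:

static void destroy_slab_cache(struct kmem_cache **cachep)
{
        kmem_cache_destroy(*cachep);    /* no-op when *cachep is NULL */
        *cachep = NULL;
}
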
 
 static int req_prot_init(const struct proto *prot)
@@ -2759,7 +2823,7 @@ static int req_prot_init(const struct proto *prot)
 
        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
-                                          0, NULL);
+                                          prot->slab_flags, NULL);
 
        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
@@ -2827,10 +2891,8 @@ void proto_unregister(struct proto *prot)
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);
 
-       if (prot->slab != NULL) {
-               kmem_cache_destroy(prot->slab);
-               prot->slab = NULL;
-       }
+       kmem_cache_destroy(prot->slab);
+       prot->slab = NULL;
 
        req_prot_cleanup(prot->rsk_prot);