These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] / kernel / net / core / sock.c
index 6317c71..9c32342 100644
 #include <linux/ipsec.h>
 #include <net/cls_cgroup.h>
 #include <net/netprio_cgroup.h>
+#include <linux/sock_diag.h>
 
 #include <linux/filter.h>
 
@@ -421,13 +422,23 @@ static void sock_warn_obsolete_bsdism(const char *name)
        }
 }
 
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+static bool sock_needs_netstamp(const struct sock *sk)
+{
+       switch (sk->sk_family) {
+       case AF_UNSPEC:
+       case AF_UNIX:
+               return false;
+       default:
+               return true;
+       }
+}
 
 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 {
        if (sk->sk_flags & flags) {
                sk->sk_flags &= ~flags;
-               if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
+               if (sock_needs_netstamp(sk) &&
+                   !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
                        net_disable_timestamp();
        }
 }
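
Why this helper exists: net_enable_timestamp()/net_disable_timestamp() flip a global static key that costs every packet on the network receive path, while AF_UNIX traffic never takes that path (its timestamps are taken inline). A condensed sketch of the enable-side pairing, with a hypothetical helper name (the real functions in this patch additionally track SK_FLAGS_TIMESTAMP transitions):

static void sock_set_netstamp_flag(struct sock *sk, int flag)
{
        if (!sock_flag(sk, flag)) {
                sock_set_flag(sk, flag);
                /* only families that use the network RX path pay for
                 * the global netstamp static key
                 */
                if (sock_needs_netstamp(sk))
                        net_enable_timestamp();
        }
}
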
@@ -861,7 +872,8 @@ set_rcvbuf:
 
                if (val & SOF_TIMESTAMPING_OPT_ID &&
                    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
-                       if (sk->sk_protocol == IPPROTO_TCP) {
+                       if (sk->sk_protocol == IPPROTO_TCP &&
+                           sk->sk_type == SOCK_STREAM) {
                                if (sk->sk_state != TCP_ESTABLISHED) {
                                        ret = -EINVAL;
                                        break;
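
The added sk_type check presumably keeps non-stream sockets (for example a raw socket created with protocol number 6, which stores IPPROTO_TCP in sk_protocol) out of the TCP-only OPT_ID setup. Userspace usage is unchanged; a minimal sketch for a connected TCP socket:

#include <linux/net_tstamp.h>
#include <sys/socket.h>

/* For stream TCP the socket must already be established, otherwise
 * SOF_TIMESTAMPING_OPT_ID is rejected with -EINVAL.
 */
static int enable_tx_timestamp_ids(int fd)
{
        int val = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_OPT_ID;

        return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}
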
@@ -987,6 +999,10 @@ set_rcvbuf:
                                         sk->sk_max_pacing_rate);
                break;
 
+       case SO_INCOMING_CPU:
+               sk->sk_incoming_cpu = val;
+               break;
+
        default:
                ret = -ENOPROTOOPT;
                break;
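
A hedged userspace sketch of the new setter: recording the CPU a thread services lets kernel-side steering (for instance SO_REUSEPORT group selection, an assumption about how this tree uses sk_incoming_cpu) prefer the socket whose stored CPU matches the one the packet arrived on:

#include <sys/socket.h>

static int pin_incoming_cpu(int fd, int cpu)
{
        /* before this change SO_INCOMING_CPU was read-only */
        return setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));
}
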
@@ -1393,9 +1409,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx);
  *     @family: protocol family
  *     @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
  *     @prot: struct proto associated with this new sock instance
+ *     @kern: is this to be a kernel socket?
  */
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
-                     struct proto *prot)
+                     struct proto *prot, int kern)
 {
        struct sock *sk;
 
@@ -1408,7 +1425,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                 */
                sk->sk_prot = sk->sk_prot_creator = prot;
                sock_lock_init(sk);
-               sock_net_set(sk, get_net(net));
+               sk->sk_net_refcnt = kern ? 0 : 1;
+               if (likely(sk->sk_net_refcnt))
+                       get_net(net);
+               sock_net_set(sk, net);
                atomic_set(&sk->sk_wmem_alloc, 1);
 
                sock_update_classid(sk);
@@ -1419,7 +1439,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 }
 EXPORT_SYMBOL(sk_alloc);
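
A hedged caller sketch for the widened signature (example_proto and example_create are hypothetical): kernel-internal sockets pass kern=1, skip get_net(), and so no longer pin a dying network namespace, which is what makes the sk_release_kernel() workaround removed below unnecessary:

static struct proto example_proto;      /* hypothetical */

static struct sock *example_create(struct net *net, int kern)
{
        /* kern=1: sk_net_refcnt stays 0, no netns reference is taken,
         * and sk_destruct() symmetrically skips put_net()
         */
        return sk_alloc(net, AF_INET, GFP_KERNEL, &example_proto, kern);
}
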
 
-static void __sk_free(struct sock *sk)
+void sk_destruct(struct sock *sk)
 {
        struct sk_filter *filter;
 
@@ -1442,10 +1462,19 @@ static void __sk_free(struct sock *sk)
        if (sk->sk_peer_cred)
                put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);
-       put_net(sock_net(sk));
+       if (likely(sk->sk_net_refcnt))
+               put_net(sock_net(sk));
        sk_prot_free(sk->sk_prot_creator, sk);
 }
 
+static void __sk_free(struct sock *sk)
+{
+       if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+               sock_diag_broadcast_destroy(sk);
+       else
+               sk_destruct(sk);
+}
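
The split lets __sk_free() hand the last step of teardown to sock_diag when a destroy listener is subscribed. A sketch of the contract the broadcast side must keep, not the real sock_diag implementation:

static void sock_diag_broadcast_destroy_sketch(struct sock *sk)
{
        /* ...multicast a destroy notification to the listeners... */
        sk_destruct(sk);        /* then finish what __sk_free() deferred */
}
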
+
 void sk_free(struct sock *sk)
 {
        /*
@@ -1458,25 +1487,6 @@ void sk_free(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_free);
 
-/*
- * Last sock_put should drop reference to sk->sk_net. It has already
- * been dropped in sk_change_net. Taking reference to stopping namespace
- * is not an option.
- * Take reference to a socket to remove it from hash _alive_ and after that
- * destroy it in the context of init_net.
- */
-void sk_release_kernel(struct sock *sk)
-{
-       if (sk == NULL || sk->sk_socket == NULL)
-               return;
-
-       sock_hold(sk);
-       sock_release(sk->sk_socket);
-       sock_net_set(sk, get_net(&init_net));
-       sock_put(sk);
-}
-EXPORT_SYMBOL(sk_release_kernel);
-
 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
 {
        if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
@@ -1502,7 +1512,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                sock_copy(newsk, sk);
 
                /* SANITY */
-               get_net(sock_net(newsk));
+               if (likely(newsk->sk_net_refcnt))
+                       get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
@@ -1518,7 +1529,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                skb_queue_head_init(&newsk->sk_receive_queue);
                skb_queue_head_init(&newsk->sk_write_queue);
 
-               spin_lock_init(&newsk->sk_dst_lock);
                rwlock_init(&newsk->sk_callback_lock);
                lockdep_set_class_and_name(&newsk->sk_callback_lock,
                                af_callback_keys + newsk->sk_family,
@@ -1541,7 +1551,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                         */
                        is_charged = sk_filter_charge(newsk, filter);
 
-               if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+               if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
                        /* It is still raw copy of parent, so invalidate
                         * destructor and make plain sk_free() */
                        newsk->sk_destruct = NULL;
@@ -1582,7 +1592,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                if (newsk->sk_prot->sockets_allocated)
                        sk_sockets_allocated_inc(newsk);
 
-               if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+               if (sock_needs_netstamp(sk) &&
+                   newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                        net_enable_timestamp();
        }
 out:
@@ -1592,7 +1603,9 @@ EXPORT_SYMBOL_GPL(sk_clone_lock);
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
-       __sk_dst_set(sk, dst);
+       u32 max_segs = 1;
+
+       sk_dst_set(sk, dst);
        sk->sk_route_caps = dst->dev->features;
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
@@ -1603,9 +1616,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
                } else {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = dst->dev->gso_max_size;
-                       sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+                       max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
                }
        }
+       sk->sk_gso_max_segs = max_segs;
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
@@ -1640,6 +1654,28 @@ void sock_wfree(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(sock_wfree);
 
+void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
+{
+       skb_orphan(skb);
+       skb->sk = sk;
+#ifdef CONFIG_INET
+       if (unlikely(!sk_fullsock(sk))) {
+               skb->destructor = sock_edemux;
+               sock_hold(sk);
+               return;
+       }
+#endif
+       skb->destructor = sock_wfree;
+       skb_set_hash_from_sk(skb, sk);
+       /*
+        * We used to take a refcount on sk, but the following operation
+        * is enough to guarantee sk_free() won't free this sock until
+        * all in-flight packets are completed
+        */
+       atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+}
+EXPORT_SYMBOL(skb_set_owner_w);
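
Usage sketch for the newly exported helper (charge_clone is hypothetical): skb_set_owner_w() charges the skb to the socket's write budget, and sock_wfree() later releases the charge; sk_wmem_alloc, not a refcount, is what keeps the socket alive while packets are in flight:

static struct sk_buff *charge_clone(struct sk_buff *skb, struct sock *sk)
{
        struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

        if (nskb)
                skb_set_owner_w(nskb, sk);  /* freed via sock_wfree() */
        return nskb;
}
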
+
 void skb_orphan_partial(struct sk_buff *skb)
 {
        /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
@@ -1777,7 +1813,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 {
        DEFINE_WAIT(wait);
 
-       clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+       sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        for (;;) {
                if (!timeo)
                        break;
@@ -1823,7 +1859,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
                        break;
 
-               set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+               sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
@@ -1853,6 +1889,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
+int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
+                  struct sockcm_cookie *sockc)
+{
+       struct cmsghdr *cmsg;
+
+       for_each_cmsghdr(cmsg, msg) {
+               if (!CMSG_OK(msg, cmsg))
+                       return -EINVAL;
+               if (cmsg->cmsg_level != SOL_SOCKET)
+                       continue;
+               switch (cmsg->cmsg_type) {
+               case SO_MARK:
+                       if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+                               return -EPERM;
+                       if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+                               return -EINVAL;
+                       sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+                       break;
+               default:
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+EXPORT_SYMBOL(sock_cmsg_send);
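
From userspace this parser is reached through sendmsg() control data: non-SOL_SOCKET levels are skipped here, other SOL_SOCKET types fail with -EINVAL, and SO_MARK still requires CAP_NET_ADMIN. An illustrative sender:

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t send_with_mark(int fd, const void *buf, size_t len,
                              uint32_t mark)
{
        char cbuf[CMSG_SPACE(sizeof(uint32_t))];
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SO_MARK;      /* needs CAP_NET_ADMIN */
        cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
        memcpy(CMSG_DATA(cmsg), &mark, sizeof(mark));

        return sendmsg(fd, &msg, 0);
}
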
+
 /* On 32bit arches, an skb frag is limited to 2^15 */
 #define SKB_FRAG_PAGE_ORDER    get_order(32768)
 
@@ -1880,8 +1942,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 
        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER) {
-               pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
-                                         __GFP_NOWARN | __GFP_NORETRY,
+               /* Avoid direct reclaim but allow kswapd to wake */
+               pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+                                         __GFP_COMP | __GFP_NOWARN |
+                                         __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
@@ -1969,21 +2033,22 @@ static void __release_sock(struct sock *sk)
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk:    sock to wait on
  * @timeo: for how long
+ * @skb:   last skb seen on sk_receive_queue
  *
  * Now socket state including sk->sk_err is changed only under lock,
  * hence we may omit checks after joining wait queue.
  * We check receive queue before schedule() only as optimization;
  * it is very likely that release_sock() added new data.
  */
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
 {
        int rc;
        DEFINE_WAIT(wait);
 
        prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-       set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-       rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
-       clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+       sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+       rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
+       sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        finish_wait(sk_sleep(sk), &wait);
        return rc;
 }
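
Callers now pass the last skb they have already seen: with MSG_PEEK the receive queue can be non-empty yet hold nothing new, and the old !skb_queue_empty() condition returned immediately in a busy loop. A condensed receive-side sketch (wait_for_more is hypothetical):

static int wait_for_more(struct sock *sk, long *timeo)
{
        const struct sk_buff *last = skb_peek_tail(&sk->sk_receive_queue);

        /* wakes up only when the tail actually changes, or on timeout */
        return sk_wait_data(sk, timeo, last);
}
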
@@ -2078,14 +2143,15 @@ suppress_allocation:
 EXPORT_SYMBOL(__sk_mem_schedule);
 
 /**
- *     __sk_reclaim - reclaim memory_allocated
+ *     __sk_mem_reclaim - reclaim memory_allocated
  *     @sk: socket
+ *     @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
  */
-void __sk_mem_reclaim(struct sock *sk)
+void __sk_mem_reclaim(struct sock *sk, int amount)
 {
-       sk_memory_allocated_sub(sk,
-                               sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
-       sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+       amount >>= SK_MEM_QUANTUM_SHIFT;
+       sk_memory_allocated_sub(sk, amount);
+       sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
 
        if (sk_under_memory_pressure(sk) &&
            (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
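
The reclaim entry point now takes an explicit byte count. The previous behaviour, releasing everything above the partial quantum, is recovered by passing sk_forward_alloc, roughly what the sk_mem_reclaim() wrapper in net/sock.h does (a sketch, not the exact inline):

static void sk_mem_reclaim_sketch(struct sock *sk)
{
        if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
                __sk_mem_reclaim(sk, sk->sk_forward_alloc);
}
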
@@ -2270,7 +2336,6 @@ static void sock_def_write_space(struct sock *sk)
 
 static void sock_def_destruct(struct sock *sk)
 {
-       kfree(sk->sk_protinfo);
 }
 
 void sk_send_sigurg(struct sock *sk)
@@ -2321,7 +2386,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        } else
                sk->sk_wq       =       NULL;
 
-       spin_lock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);
        lockdep_set_class_and_name(&sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
@@ -2353,6 +2417,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
+       sk->sk_incoming_cpu = -1;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
@@ -2478,7 +2543,8 @@ void sock_enable_timestamp(struct sock *sk, int flag)
                 * time stamping, but time stamping might have been on
                 * already because of the other one
                 */
-               if (!(previous_flags & SK_FLAGS_TIMESTAMP))
+               if (sock_needs_netstamp(sk) &&
+                   !(previous_flags & SK_FLAGS_TIMESTAMP))
                        net_enable_timestamp();
        }
 }
@@ -2739,10 +2805,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
                return;
        kfree(rsk_prot->slab_name);
        rsk_prot->slab_name = NULL;
-       if (rsk_prot->slab) {
-               kmem_cache_destroy(rsk_prot->slab);
-               rsk_prot->slab = NULL;
-       }
+       kmem_cache_destroy(rsk_prot->slab);
+       rsk_prot->slab = NULL;
 }
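
This cleanup, and the matching one in proto_unregister() below, rely on kmem_cache_destroy() accepting a NULL cache (a no-op since v4.3), so the open-coded guard was redundant. The same pattern in miniature:

static void destroy_slab_cache(struct kmem_cache **cachep)
{
        kmem_cache_destroy(*cachep);    /* no-op when *cachep is NULL */
        *cachep = NULL;
}
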
 
 static int req_prot_init(const struct proto *prot)
@@ -2759,7 +2823,7 @@ static int req_prot_init(const struct proto *prot)
 
        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
-                                          0, NULL);
+                                          prot->slab_flags, NULL);
 
        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
@@ -2827,10 +2891,8 @@ void proto_unregister(struct proto *prot)
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);
 
-       if (prot->slab != NULL) {
-               kmem_cache_destroy(prot->slab);
-               prot->slab = NULL;
-       }
+       kmem_cache_destroy(prot->slab);
+       prot->slab = NULL;
 
        req_prot_cleanup(prot->rsk_prot);