These changes are the raw update to linux-4.4.6-rt14. Kernel sources

[kvmfornfv.git] / kernel / net / core / skbuff.c
diff --git a/kernel/net/core/skbuff.c b/kernel/net/core/skbuff.c

index fc09e8f..12780dc 100644 (file)
--- a/kernel/net/core/skbuff.c
+++ b/kernel/net/core/skbuff.c
@@ -80,6 +80,8 @@
  
  struct kmem_cache *skbuff_head_cache __read_mostly;
  static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_max_skb_frags);
  
  /**
   *     skb_panic - private function for out-of-line support
@@ -348,95 +350,20 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
  }
  EXPORT_SYMBOL(build_skb);
  
-struct netdev_alloc_cache {
-       struct page_frag        frag;
-       /* we maintain a pagecount bias, so that we dont dirty cache line
-        * containing page->_count every time we allocate a fragment.
-        */
-       unsigned int            pagecnt_bias;
-};
-static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
  static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
-
-static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
-                                      gfp_t gfp_mask)
-{
-       const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
-       struct page *page = NULL;
-       gfp_t gfp = gfp_mask;
-
-       if (order) {
-               gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-                           __GFP_NOMEMALLOC;
-               page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
-               nc->frag.size = PAGE_SIZE << (page ? order : 0);
-       }
-
-       if (unlikely(!page))
-               page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
-       nc->frag.page = page;
-
-       return page;
-}
-
-static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
-                              unsigned int fragsz, gfp_t gfp_mask)
-{
-       struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
-       struct page *page = nc->frag.page;
-       unsigned int size;
-       int offset;
-
-       if (unlikely(!page)) {
-refill:
-               page = __page_frag_refill(nc, gfp_mask);
-               if (!page)
-                       return NULL;
-
-               /* if size can vary use frag.size else just use PAGE_SIZE */
-               size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
-               /* Even if we own the page, we do not use atomic_set().
-                * This would break get_page_unless_zero() users.
-                */
-               atomic_add(size - 1, &page->_count);
-
-               /* reset page count bias and offset to start of new frag */
-               nc->pagecnt_bias = size;
-               nc->frag.offset = size;
-       }
-
-       offset = nc->frag.offset - fragsz;
-       if (unlikely(offset < 0)) {
-               if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
-                       goto refill;
-
-               /* if size can vary use frag.size else just use PAGE_SIZE */
-               size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
-               /* OK, page count is 0, we can safely set it */
-               atomic_set(&page->_count, size);
-
-               /* reset page count bias and offset to start of new frag */
-               nc->pagecnt_bias = size;
-               offset = size - fragsz;
-       }
-
-       nc->pagecnt_bias--;
-       nc->frag.offset = offset;
-
-       return page_address(page) + offset;
-}
+static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
  
  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
  {
+       struct page_frag_cache *nc;
         unsigned long flags;
         void *data;
  
         local_lock_irqsave(netdev_alloc_lock, flags);
-       data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
+       nc = this_cpu_ptr(&netdev_alloc_cache);
+       data = __alloc_page_frag(nc, fragsz, gfp_mask);
         local_unlock_irqrestore(netdev_alloc_lock, flags);
         return data;
  }
@@ -456,7 +383,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
  
  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
  {
-       return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+       struct page_frag_cache *nc;
+       void *data;
+
+       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+       data = __alloc_page_frag(nc, fragsz, gfp_mask);
+       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+       return data;
  }
  
  void *napi_alloc_frag(unsigned int fragsz)
@@ -466,76 +399,70 @@ void *napi_alloc_frag(unsigned int fragsz)
  EXPORT_SYMBOL(napi_alloc_frag);
  
  /**
- *     __alloc_rx_skb - allocate an skbuff for rx
- *     @length: length to allocate
+ *     __netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *     @dev: network device to receive on
+ *     @len: length to allocate
   *     @gfp_mask: get_free_pages mask, passed to alloc_skb
- *     @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
- *             allocations in case we have to fallback to __alloc_skb()
- *             If SKB_ALLOC_NAPI is set, page fragment will be allocated
- *             from napi_cache instead of netdev_cache.
   *
   *     Allocate a new &sk_buff and assign it a usage count of one. The
- *     buffer has unspecified headroom built in. Users should allocate
+ *     buffer has NET_SKB_PAD headroom built in. Users should allocate
   *     the headroom they think they need without accounting for the
   *     built in space. The built in space is used for optimisations.
   *
   *     %NULL is returned if there is no free memory.
   */
-static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
-                                     int flags)
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+                                  gfp_t gfp_mask)
  {
-       struct sk_buff *skb = NULL;
-       unsigned int fragsz = SKB_DATA_ALIGN(length) +
-                             SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       struct page_frag_cache *nc;
+       unsigned long flags;
+       struct sk_buff *skb;
+       bool pfmemalloc;
+       void *data;
  
-       if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
-               void *data;
+       len += NET_SKB_PAD;
  
-               if (sk_memalloc_socks())
-                       gfp_mask |= __GFP_MEMALLOC;
+       if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+           (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+               skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+               if (!skb)
+                       goto skb_fail;
+               goto skb_success;
+       }
  
-               data = (flags & SKB_ALLOC_NAPI) ?
-                       __napi_alloc_frag(fragsz, gfp_mask) :
-                       __netdev_alloc_frag(fragsz, gfp_mask);
+       len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       len = SKB_DATA_ALIGN(len);
  
-               if (likely(data)) {
-                       skb = build_skb(data, fragsz);
-                       if (unlikely(!skb))
-                               put_page(virt_to_head_page(data));
-               }
-       } else {
-               skb = __alloc_skb(length, gfp_mask,
-                                 SKB_ALLOC_RX, NUMA_NO_NODE);
-       }
-       return skb;
-}
+       if (sk_memalloc_socks())
+               gfp_mask |= __GFP_MEMALLOC;
  
-/**
- *     __netdev_alloc_skb - allocate an skbuff for rx on a specific device
- *     @dev: network device to receive on
- *     @length: length to allocate
- *     @gfp_mask: get_free_pages mask, passed to alloc_skb
- *
- *     Allocate a new &sk_buff and assign it a usage count of one. The
- *     buffer has NET_SKB_PAD headroom built in. Users should allocate
- *     the headroom they think they need without accounting for the
- *     built in space. The built in space is used for optimisations.
- *
- *     %NULL is returned if there is no free memory.
- */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
-                                  unsigned int length, gfp_t gfp_mask)
-{
-       struct sk_buff *skb;
+       local_lock_irqsave(netdev_alloc_lock, flags);
+
+       nc = this_cpu_ptr(&netdev_alloc_cache);
+       data = __alloc_page_frag(nc, len, gfp_mask);
+       pfmemalloc = nc->pfmemalloc;
  
-       length += NET_SKB_PAD;
-       skb = __alloc_rx_skb(length, gfp_mask, 0);
+       local_unlock_irqrestore(netdev_alloc_lock, flags);
  
-       if (likely(skb)) {
-               skb_reserve(skb, NET_SKB_PAD);
-               skb->dev = dev;
+       if (unlikely(!data))
+               return NULL;
+
+       skb = __build_skb(data, len);
+       if (unlikely(!skb)) {
+               skb_free_frag(data);
+               return NULL;
         }
  
+       /* set-only (no plain assignment) so an already-set flag is kept */
+       if (pfmemalloc)
+               skb->pfmemalloc = 1;
+       skb->head_frag = 1;
+
+skb_success:
+       skb_reserve(skb, NET_SKB_PAD);
+       skb->dev = dev;
+
+skb_fail:
         return skb;
  }
  EXPORT_SYMBOL(__netdev_alloc_skb);
@@ -543,7 +470,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
  /**
   *     __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
   *     @napi: napi instance this buffer was allocated for
- *     @length: length to allocate
+ *     @len: length to allocate
   *     @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
   *
   *     Allocate a new sk_buff for use in NAPI receive.  This buffer will
@@ -553,19 +480,54 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
   *
   *     %NULL is returned if there is no free memory.
   */
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
-                                unsigned int length, gfp_t gfp_mask)
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
+                                gfp_t gfp_mask)
  {
+       struct page_frag_cache *nc;
         struct sk_buff *skb;
+       void *data;
+       bool pfmemalloc;
  
-       length += NET_SKB_PAD + NET_IP_ALIGN;
-       skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+       len += NET_SKB_PAD + NET_IP_ALIGN;
  
-       if (likely(skb)) {
-               skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
-               skb->dev = napi->dev;
+       if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+           (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+               skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+               if (!skb)
+                       goto skb_fail;
+               goto skb_success;
         }
  
+       len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       len = SKB_DATA_ALIGN(len);
+
+       if (sk_memalloc_socks())
+               gfp_mask |= __GFP_MEMALLOC;
+
+       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+       data = __alloc_page_frag(nc, len, gfp_mask);
+       pfmemalloc = nc->pfmemalloc;
+       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
+
+       if (unlikely(!data))
+               return NULL;
+
+       skb = __build_skb(data, len);
+       if (unlikely(!skb)) {
+               skb_free_frag(data);
+               return NULL;
+       }
+
+       /* set-only (no plain assignment) so an already-set flag is kept */
+       if (pfmemalloc)
+               skb->pfmemalloc = 1;
+       skb->head_frag = 1;
+
+skb_success:
+       skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+       skb->dev = napi->dev;
+
+skb_fail:
         return skb;
  }
  EXPORT_SYMBOL(__napi_alloc_skb);
@@ -613,10 +575,12 @@ static void skb_clone_fraglist(struct sk_buff *skb)
  
  static void skb_free_head(struct sk_buff *skb)
  {
+       unsigned char *head = skb->head;
+
         if (skb->head_frag)
-               put_page(virt_to_head_page(skb->head));
+               skb_free_frag(head);
         else
-               kfree(skb->head);
+               kfree(head);
  }
  
  static void skb_release_data(struct sk_buff *skb)
@@ -1920,15 +1884,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
         return false;
  }
  
+ssize_t skb_socket_splice(struct sock *sk,
+                         struct pipe_inode_info *pipe,
+                         struct splice_pipe_desc *spd)
+{
+       int ret;
+
+       /* Drop the socket lock, otherwise we have reverse
+        * locking dependencies between sk_lock and i_mutex
+        * here as compared to sendfile(). We enter here
+        * with the socket lock held, and splice_to_pipe() will
+        * grab the pipe inode lock. For sendfile() emulation,
+        * we call into ->sendpage() with the i_mutex lock held
+        * and networking will grab the socket lock.
+        */
+       release_sock(sk);
+       ret = splice_to_pipe(pipe, spd);
+       lock_sock(sk);
+
+       return ret;
+}
+
  /*
   * Map data from the skb to a pipe. Should handle both the linear part,
   * the fragments, and the frag list. It does NOT handle frag lists within
   * the frag list, if such a thing exists. We'd probably need to recurse to
   * handle that cleanly.
   */
-int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                     struct pipe_inode_info *pipe, unsigned int tlen,
-                   unsigned int flags)
+                   unsigned int flags,
+                   ssize_t (*splice_cb)(struct sock *,
+                                        struct pipe_inode_info *,
+                                        struct splice_pipe_desc *))
  {
         struct partial_page partial[MAX_SKB_FRAGS];
         struct page *pages[MAX_SKB_FRAGS];
@@ -1941,7 +1929,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
                 .spd_release = sock_spd_release,
         };
         struct sk_buff *frag_iter;
-       struct sock *sk = skb->sk;
         int ret = 0;
  
         /*
@@ -1964,23 +1951,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
         }
  
  done:
-       if (spd.nr_pages) {
-               /*
-                * Drop the socket lock, otherwise we have reverse
-                * locking dependencies between sk_lock and i_mutex
-                * here as compared to sendfile(). We enter here
-                * with the socket lock held, and splice_to_pipe() will
-                * grab the pipe inode lock. For sendfile() emulation,
-                * we call into ->sendpage() with the i_mutex lock held
-                * and networking will grab the socket lock.
-                */
-               release_sock(sk);
-               ret = splice_to_pipe(pipe, &spd);
-               lock_sock(sk);
-       }
+       if (spd.nr_pages)
+               ret = splice_cb(sk, pipe, &spd);
  
         return ret;
  }
+EXPORT_SYMBOL_GPL(skb_splice_bits);
  
  /**
   *     skb_store_bits - store bits from kernel buffer to skb
@@ -2965,6 +2941,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
  }
  EXPORT_SYMBOL(skb_append_datato_frags);
  
+int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
+                        int offset, size_t size)
+{
+       int i = skb_shinfo(skb)->nr_frags; /* index of the next frag slot */
+
+       if (skb_can_coalesce(skb, i, page, offset)) { /* grow the last frag */
+               skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+       } else if (i < MAX_SKB_FRAGS) { /* room left: fill a new frag slot */
+               get_page(page);
+               skb_fill_page_desc(skb, i, page, offset, size);
+       } else { /* nr_frags has reached MAX_SKB_FRAGS */
+               return -EMSGSIZE;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(skb_append_pagefrags);
+
  /**
   *     skb_pull_rcsum - pull skb and update receive checksum
   *     @skb: buffer to update
@@ -2978,11 +2972,12 @@ EXPORT_SYMBOL(skb_append_datato_frags);
   */
  unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
  {
+       unsigned char *data = skb->data;
+
         BUG_ON(len > skb->len);
-       skb->len -= len;
-       BUG_ON(skb->len < skb->data_len);
-       skb_postpull_rcsum(skb, skb->data, len);
-       return skb->data += len;
+       __skb_pull(skb, len);
+       skb_postpull_rcsum(skb, data, len);
+       return skb->data;
  }
  EXPORT_SYMBOL_GPL(skb_pull_rcsum);
  
@@ -3662,7 +3657,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
         serr->ee.ee_info = tstype;
         if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
                 serr->ee.ee_data = skb_shinfo(skb)->tskey;
-               if (sk->sk_protocol == IPPROTO_TCP)
+               if (sk->sk_protocol == IPPROTO_TCP &&
+                   sk->sk_type == SOCK_STREAM)
                         serr->ee.ee_data -= sk->sk_tskey;
         }
  
@@ -4032,6 +4028,92 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
  }
  EXPORT_SYMBOL(skb_checksum_setup);
  
+/**
+ * skb_checksum_maybe_trim - maybe trims the given skb
+ * @skb: the skb to check
+ * @transport_len: the data length starting at the transport header
+ *
+ * Checks whether the given skb has data beyond the given transport length.
+ * If so, returns a cloned skb trimmed to transport offset + transport_len.
+ * Otherwise returns the provided skb. Returns NULL in error cases
+ * (the skb is shorter than that length, or the clone or trim fails).
+ *
+ * Caller must set the skb transport header before calling, and must free
+ * any returned skb that differs from the provided skb.
+ */
+static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
+                                              unsigned int transport_len)
+{
+       struct sk_buff *skb_chk;
+       unsigned int len = skb_transport_offset(skb) + transport_len;
+       int ret;
+
+       if (skb->len < len)
+               return NULL;
+       else if (skb->len == len)
+               return skb;
+
+       skb_chk = skb_clone(skb, GFP_ATOMIC);
+       if (!skb_chk)
+               return NULL;
+
+       ret = pskb_trim_rcsum(skb_chk, len);
+       if (ret) {
+               kfree_skb(skb_chk);
+               return NULL;
+       }
+
+       return skb_chk;
+}
+
+/**
+ * skb_checksum_trimmed - validate checksum of an skb
+ * @skb: the skb to check
+ * @transport_len: the data length starting at the transport header
+ * @skb_chkf: checksum function to use; a non-zero result means failure
+ *
+ * Applies skb_chkf to the skb with its data pulled to the transport header.
+ * Returns a checked, maybe trimmed skb; NULL on error or non-zero skb_chkf.
+ *
+ * If the skb has data beyond the given transport length, then a
+ * trimmed & cloned skb is checked and returned.
+ *
+ * Caller must set the skb transport header before calling, and must free
+ * any returned skb that differs from the provided skb.
+ */
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+                                    unsigned int transport_len,
+                                    __sum16(*skb_chkf)(struct sk_buff *skb))
+{
+       struct sk_buff *skb_chk;
+       unsigned int offset = skb_transport_offset(skb);
+       __sum16 ret;
+
+       skb_chk = skb_checksum_maybe_trim(skb, transport_len);
+       if (!skb_chk)
+               goto err;
+
+       if (!pskb_may_pull(skb_chk, offset))
+               goto err;
+
+       __skb_pull(skb_chk, offset);
+       ret = skb_chkf(skb_chk);
+       __skb_push(skb_chk, offset);
+
+       if (ret)
+               goto err;
+
+       return skb_chk;
+
+err:
+       if (skb_chk && skb_chk != skb)
+               kfree_skb(skb_chk);
+
+       return NULL;
+
+}
+EXPORT_SYMBOL(skb_checksum_trimmed);
+
  void __skb_warn_lro_forwarding(const struct sk_buff *skb)
  {
         net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
@@ -4201,7 +4283,8 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
                 return NULL;
         }
  
-       memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+       memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
+               2 * ETH_ALEN);
         skb->mac_header += VLAN_HLEN;
         return skb;
  }
@@ -4385,7 +4468,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                 return NULL;
  
         gfp_head = gfp_mask;
-       if (gfp_head & __GFP_WAIT)
+       if (gfp_head & __GFP_DIRECT_RECLAIM)
                 gfp_head |= __GFP_REPEAT;
  
         *errcode = -ENOBUFS;
@@ -4400,7 +4483,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
  
                 while (order) {
                         if (npages >= 1 << order) {
-                               page = alloc_pages((gfp_mask & ~__GFP_WAIT) |
+                               page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
                                                    __GFP_COMP |
                                                    __GFP_NOWARN |
                                                    __GFP_NORETRY,