These changes are the raw update of the kernel sources to linux-4.4.6-rt14.
[kvmfornfv.git] / kernel / net / packet / af_packet.c
index f9f2592..2c23bae 100644 (file)
@@ -93,6 +93,7 @@
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
+#include <linux/bpf.h>
 
 #include "internal.h"
 
@@ -230,6 +231,8 @@ struct packet_skb_cb {
        } sa;
 };
 
+#define vio_le() virtio_legacy_is_little_endian()
+
 #define PACKET_SKB_CB(__skb)   ((struct packet_skb_cb *)((__skb)->cb))
 
 #define GET_PBDQC_FROM_RB(x)   ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
@@ -519,13 +522,11 @@ static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 }
 
 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
-               int tx_ring,
                struct sk_buff_head *rb_queue)
 {
        struct tpacket_kbdq_core *pkc;
 
-       pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
-                       GET_PBDQC_FROM_RB(&po->rx_ring);
+       pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
 
        spin_lock_bh(&rb_queue->lock);
        pkc->delete_blk_timer = 1;
@@ -544,15 +545,11 @@ static void prb_init_blk_timer(struct packet_sock *po,
        pkc->retire_blk_timer.expires = jiffies;
 }
 
-static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
+static void prb_setup_retire_blk_timer(struct packet_sock *po)
 {
        struct tpacket_kbdq_core *pkc;
 
-       if (tx_ring)
-               BUG();
-
-       pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
-                       GET_PBDQC_FROM_RB(&po->rx_ring);
+       pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
        prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
 }
 
@@ -608,7 +605,7 @@ static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
 static void init_prb_bdqc(struct packet_sock *po,
                        struct packet_ring_buffer *rb,
                        struct pgv *pg_vec,
-                       union tpacket_req_u *req_u, int tx_ring)
+                       union tpacket_req_u *req_u)
 {
        struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
        struct tpacket_block_desc *pbd;
@@ -635,7 +632,7 @@ static void init_prb_bdqc(struct packet_sock *po,
 
        p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
        prb_init_ft_ops(p1, req_u);
-       prb_setup_retire_blk_timer(po, tx_ring);
+       prb_setup_retire_blk_timer(po);
        prb_open_block(p1, pbd);
 }
 
@@ -1235,27 +1232,81 @@ static void packet_free_pending(struct packet_sock *po)
        free_percpu(po->tx_ring.pending_refcnt);
 }
 
-static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+#define ROOM_POW_OFF   2
+#define ROOM_NONE      0x0
+#define ROOM_LOW       0x1
+#define ROOM_NORMAL    0x2
+
+static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+{
+       int idx, len;
+
+       len = po->rx_ring.frame_max + 1;
+       idx = po->rx_ring.head;
+       if (pow_off)
+               idx += len >> pow_off;
+       if (idx >= len)
+               idx -= len;
+       return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+{
+       int idx, len;
+
+       len = po->rx_ring.prb_bdqc.knum_blocks;
+       idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+       if (pow_off)
+               idx += len >> pow_off;
+       if (idx >= len)
+               idx -= len;
+       return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
        struct sock *sk = &po->sk;
-       bool has_room;
+       int ret = ROOM_NONE;
+
+       if (po->prot_hook.func != tpacket_rcv) {
+               int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+                                         - (skb ? skb->truesize : 0);
+               if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+                       return ROOM_NORMAL;
+               else if (avail > 0)
+                       return ROOM_LOW;
+               else
+                       return ROOM_NONE;
+       }
 
-       if (po->prot_hook.func != tpacket_rcv)
-               return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
-                       <= sk->sk_rcvbuf;
+       if (po->tp_version == TPACKET_V3) {
+               if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
+                       ret = ROOM_NORMAL;
+               else if (__tpacket_v3_has_room(po, 0))
+                       ret = ROOM_LOW;
+       } else {
+               if (__tpacket_has_room(po, ROOM_POW_OFF))
+                       ret = ROOM_NORMAL;
+               else if (__tpacket_has_room(po, 0))
+                       ret = ROOM_LOW;
+       }
 
-       spin_lock(&sk->sk_receive_queue.lock);
-       if (po->tp_version == TPACKET_V3)
-               has_room = prb_lookup_block(po, &po->rx_ring,
-                                           po->rx_ring.prb_bdqc.kactive_blk_num,
-                                           TP_STATUS_KERNEL);
-       else
-               has_room = packet_lookup_frame(po, &po->rx_ring,
-                                              po->rx_ring.head,
-                                              TP_STATUS_KERNEL);
-       spin_unlock(&sk->sk_receive_queue.lock);
+       return ret;
+}
 
-       return has_room;
+static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+       int ret;
+       bool has_room;
+
+       spin_lock_bh(&po->sk.sk_receive_queue.lock);
+       ret = __packet_rcv_has_room(po, skb);
+       has_room = ret == ROOM_NORMAL;
+       if (po->pressure == has_room)
+               po->pressure = !has_room;
+       spin_unlock_bh(&po->sk.sk_receive_queue.lock);
+
+       return ret;
 }
 
 static void packet_sock_destruct(struct sock *sk)
@@ -1273,6 +1324,20 @@ static void packet_sock_destruct(struct sock *sk)
        sk_refcnt_debug_dec(sk);
 }
 
+static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
+{
+       u32 rxhash;
+       int i, count = 0;
+
+       rxhash = skb_get_hash(skb);
+       for (i = 0; i < ROLLOVER_HLEN; i++)
+               if (po->rollover->history[i] == rxhash)
+                       count++;
+
+       po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
+       return count > (ROLLOVER_HLEN >> 1);
+}
+
 static unsigned int fanout_demux_hash(struct packet_fanout *f,
                                      struct sk_buff *skb,
                                      unsigned int num)
@@ -1305,22 +1370,40 @@ static unsigned int fanout_demux_rnd(struct packet_fanout *f,
 
 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
                                          struct sk_buff *skb,
-                                         unsigned int idx, unsigned int skip,
+                                         unsigned int idx, bool try_self,
                                          unsigned int num)
 {
-       unsigned int i, j;
+       struct packet_sock *po, *po_next, *po_skip = NULL;
+       unsigned int i, j, room = ROOM_NONE;
 
-       i = j = min_t(int, f->next[idx], num - 1);
+       po = pkt_sk(f->arr[idx]);
+
+       if (try_self) {
+               room = packet_rcv_has_room(po, skb);
+               if (room == ROOM_NORMAL ||
+                   (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
+                       return idx;
+               po_skip = po;
+       }
+
+       i = j = min_t(int, po->rollover->sock, num - 1);
        do {
-               if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+               po_next = pkt_sk(f->arr[i]);
+               if (po_next != po_skip && !po_next->pressure &&
+                   packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
                        if (i != j)
-                               f->next[idx] = i;
+                               po->rollover->sock = i;
+                       atomic_long_inc(&po->rollover->num);
+                       if (room == ROOM_LOW)
+                               atomic_long_inc(&po->rollover->num_huge);
                        return i;
                }
+
                if (++i == num)
                        i = 0;
        } while (i != j);
 
+       atomic_long_inc(&po->rollover->num_failed);
        return idx;
 }
 
@@ -1331,6 +1414,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f,
        return skb_get_queue_mapping(skb) % num;
 }
 
+static unsigned int fanout_demux_bpf(struct packet_fanout *f,
+                                    struct sk_buff *skb,
+                                    unsigned int num)
+{
+       struct bpf_prog *prog;
+       unsigned int ret = 0;
+
+       rcu_read_lock();
+       prog = rcu_dereference(f->bpf_prog);
+       if (prog)
+               ret = bpf_prog_run_clear_cb(prog, skb) % num;
+       rcu_read_unlock();
+
+       return ret;
+}
+
 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
 {
        return f->flags & (flag >> 8);
@@ -1341,17 +1440,17 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 {
        struct packet_fanout *f = pt->af_packet_priv;
        unsigned int num = READ_ONCE(f->num_members);
+       struct net *net = read_pnet(&f->net);
        struct packet_sock *po;
        unsigned int idx;
 
-       if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
-           !num) {
+       if (!net_eq(dev_net(dev), net) || !num) {
                kfree_skb(skb);
                return 0;
        }
 
        if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
-               skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
+               skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
                if (!skb)
                        return 0;
        }
@@ -1373,17 +1472,18 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
                idx = fanout_demux_qm(f, skb, num);
                break;
        case PACKET_FANOUT_ROLLOVER:
-               idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
+               idx = fanout_demux_rollover(f, skb, 0, false, num);
+               break;
+       case PACKET_FANOUT_CBPF:
+       case PACKET_FANOUT_EBPF:
+               idx = fanout_demux_bpf(f, skb, num);
                break;
        }
 
-       po = pkt_sk(f->arr[idx]);
-       if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
-           unlikely(!packet_rcv_has_room(po, skb))) {
-               idx = fanout_demux_rollover(f, skb, idx, idx, num);
-               po = pkt_sk(f->arr[idx]);
-       }
+       if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
+               idx = fanout_demux_rollover(f, skb, idx, true, num);
 
+       po = pkt_sk(f->arr[idx]);
        return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
 }
 
@@ -1420,10 +1520,107 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
 
 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
 {
-       if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
-               return true;
+       if (sk->sk_family != PF_PACKET)
+               return false;
 
-       return false;
+       return ptype->af_packet_priv == pkt_sk(sk)->fanout;
+}
+
+static void fanout_init_data(struct packet_fanout *f)
+{
+       switch (f->type) {
+       case PACKET_FANOUT_LB:
+               atomic_set(&f->rr_cur, 0);
+               break;
+       case PACKET_FANOUT_CBPF:
+       case PACKET_FANOUT_EBPF:
+               RCU_INIT_POINTER(f->bpf_prog, NULL);
+               break;
+       }
+}
+
+static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
+{
+       struct bpf_prog *old;
+
+       spin_lock(&f->lock);
+       old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
+       rcu_assign_pointer(f->bpf_prog, new);
+       spin_unlock(&f->lock);
+
+       if (old) {
+               synchronize_net();
+               bpf_prog_destroy(old);
+       }
+}
+
+static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
+                               unsigned int len)
+{
+       struct bpf_prog *new;
+       struct sock_fprog fprog;
+       int ret;
+
+       if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
+               return -EPERM;
+       if (len != sizeof(fprog))
+               return -EINVAL;
+       if (copy_from_user(&fprog, data, len))
+               return -EFAULT;
+
+       ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
+       if (ret)
+               return ret;
+
+       __fanout_set_data_bpf(po->fanout, new);
+       return 0;
+}
+
+static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
+                               unsigned int len)
+{
+       struct bpf_prog *new;
+       u32 fd;
+
+       if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
+               return -EPERM;
+       if (len != sizeof(fd))
+               return -EINVAL;
+       if (copy_from_user(&fd, data, len))
+               return -EFAULT;
+
+       new = bpf_prog_get(fd);
+       if (IS_ERR(new))
+               return PTR_ERR(new);
+       if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+               bpf_prog_put(new);
+               return -EINVAL;
+       }
+
+       __fanout_set_data_bpf(po->fanout, new);
+       return 0;
+}
+
+static int fanout_set_data(struct packet_sock *po, char __user *data,
+                          unsigned int len)
+{
+       switch (po->fanout->type) {
+       case PACKET_FANOUT_CBPF:
+               return fanout_set_data_cbpf(po, data, len);
+       case PACKET_FANOUT_EBPF:
+               return fanout_set_data_ebpf(po, data, len);
+       default:
+               return -EINVAL;
+       };
+}
+
+static void fanout_release_data(struct packet_fanout *f)
+{
+       switch (f->type) {
+       case PACKET_FANOUT_CBPF:
+       case PACKET_FANOUT_EBPF:
+               __fanout_set_data_bpf(f, NULL);
+       };
 }
 
 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
@@ -1443,6 +1640,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
        case PACKET_FANOUT_CPU:
        case PACKET_FANOUT_RND:
        case PACKET_FANOUT_QM:
+       case PACKET_FANOUT_CBPF:
+       case PACKET_FANOUT_EBPF:
                break;
        default:
                return -EINVAL;
@@ -1454,6 +1653,16 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
        if (po->fanout)
                return -EALREADY;
 
+       if (type == PACKET_FANOUT_ROLLOVER ||
+           (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
+               po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
+               if (!po->rollover)
+                       return -ENOMEM;
+               atomic_long_set(&po->rollover->num, 0);
+               atomic_long_set(&po->rollover->num_huge, 0);
+               atomic_long_set(&po->rollover->num_failed, 0);
+       }
+
        mutex_lock(&fanout_mutex);
        match = NULL;
        list_for_each_entry(f, &fanout_list, list) {
@@ -1475,10 +1684,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
                match->id = id;
                match->type = type;
                match->flags = flags;
-               atomic_set(&match->rr_cur, 0);
                INIT_LIST_HEAD(&match->list);
                spin_lock_init(&match->lock);
                atomic_set(&match->sk_ref, 0);
+               fanout_init_data(match);
                match->prot_hook.type = po->prot_hook.type;
                match->prot_hook.dev = po->prot_hook.dev;
                match->prot_hook.func = packet_rcv_fanout;
@@ -1502,6 +1711,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
        }
 out:
        mutex_unlock(&fanout_mutex);
+       if (err) {
+               kfree(po->rollover);
+               po->rollover = NULL;
+       }
        return err;
 }
 
@@ -1520,9 +1733,27 @@ static void fanout_release(struct sock *sk)
        if (atomic_dec_and_test(&f->sk_ref)) {
                list_del(&f->list);
                dev_remove_pack(&f->prot_hook);
+               fanout_release_data(f);
                kfree(f);
        }
        mutex_unlock(&fanout_mutex);
+
+       if (po->rollover)
+               kfree_rcu(po->rollover, rcu);
+}
+
+static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
+                                         struct sk_buff *skb)
+{
+       /* Earlier code assumed this would be a VLAN pkt, double-check
+        * this now that we have the actual packet in hand. We can only
+        * do this check on Ethernet devices.
+        */
+       if (unlikely(dev->type != ARPHRD_ETHER))
+               return false;
+
+       skb_reset_mac_header(skb);
+       return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
 }
 
 static const struct proto_ops packet_ops;
@@ -1686,18 +1917,10 @@ retry:
                goto retry;
        }
 
-       if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
-               /* Earlier code assumed this would be a VLAN pkt,
-                * double-check this now that we have the actual
-                * packet in hand.
-                */
-               struct ethhdr *ehdr;
-               skb_reset_mac_header(skb);
-               ehdr = eth_hdr(skb);
-               if (ehdr->h_proto != htons(ETH_P_8021Q)) {
-                       err = -EMSGSIZE;
-                       goto out_unlock;
-               }
+       if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
+           !packet_extra_vlan_len_allowed(dev, skb)) {
+               err = -EMSGSIZE;
+               goto out_unlock;
        }
 
        skb->protocol = proto;
@@ -1723,16 +1946,16 @@ out_free:
        return err;
 }
 
-static unsigned int run_filter(const struct sk_buff *skb,
-                                     const struct sock *sk,
-                                     unsigned int res)
+static unsigned int run_filter(struct sk_buff *skb,
+                              const struct sock *sk,
+                              unsigned int res)
 {
        struct sk_filter *filter;
 
        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
-               res = SK_RUN_FILTER(filter, skb);
+               res = bpf_prog_run_clear_cb(filter->prog, skb);
        rcu_read_unlock();
 
        return res;
@@ -2107,8 +2330,8 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 static bool ll_header_truncated(const struct net_device *dev, int len)
 {
        /* net device doesn't like empty head */
-       if (unlikely(len <= dev->hard_header_len)) {
-               net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
+       if (unlikely(len < dev->hard_header_len)) {
+               net_warn_ratelimited("%s: packet size is too short (%d < %d)\n",
                                     current->comm, len, dev->hard_header_len);
                return true;
        }
@@ -2116,6 +2339,15 @@ static bool ll_header_truncated(const struct net_device *dev, int len)
        return false;
 }
 
+static void tpacket_set_protocol(const struct net_device *dev,
+                                struct sk_buff *skb)
+{
+       if (dev->type == ARPHRD_ETHER) {
+               skb_reset_mac_header(skb);
+               skb->protocol = eth_hdr(skb)->h_proto;
+       }
+}
+
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, int size_max,
                __be16 proto, unsigned char *addr, int hlen)
@@ -2152,8 +2384,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
        skb_reserve(skb, hlen);
        skb_reset_network_header(skb);
 
-       if (!packet_use_direct_xmit(po))
-               skb_probe_transport_header(skb, 0);
        if (unlikely(po->tp_tx_has_off)) {
                int off_min, off_max, off;
                off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
@@ -2199,6 +2429,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                                dev->hard_header_len);
                if (unlikely(err))
                        return err;
+               if (!skb->protocol)
+                       tpacket_set_protocol(dev, skb);
 
                data += dev->hard_header_len;
                to_write -= dev->hard_header_len;
@@ -2233,6 +2465,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                len = ((to_write > len_max) ? len_max : to_write);
        }
 
+       skb_probe_transport_header(skb, 0);
+
        return tp_len;
 }
 
@@ -2277,12 +2511,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;
 
-       reserve = dev->hard_header_len + VLAN_HLEN;
+       if (po->sk.sk_socket->type == SOCK_RAW)
+               reserve = dev->hard_header_len;
        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
-       if (size_max > dev->mtu + reserve)
-               size_max = dev->mtu + reserve;
+       if (size_max > dev->mtu + reserve + VLAN_HLEN)
+               size_max = dev->mtu + reserve + VLAN_HLEN;
 
        do {
                ph = packet_current_frame(po, &po->tx_ring,
@@ -2309,18 +2544,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                          addr, hlen);
                if (likely(tp_len >= 0) &&
-                   tp_len > dev->mtu + dev->hard_header_len) {
-                       struct ethhdr *ehdr;
-                       /* Earlier code assumed this would be a VLAN pkt,
-                        * double-check this now that we have the actual
-                        * packet in hand.
-                        */
+                   tp_len > dev->mtu + reserve &&
+                   !packet_extra_vlan_len_allowed(dev, skb))
+                       tp_len = -EMSGSIZE;
 
-                       skb_reset_mac_header(skb);
-                       ehdr = eth_hdr(skb);
-                       if (ehdr->h_proto != htons(ETH_P_8021Q))
-                               tp_len = -EMSGSIZE;
-               }
                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                __packet_set_status(po, ph,
@@ -2414,6 +2641,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
        __be16 proto;
        unsigned char *addr;
        int err, reserve = 0;
+       struct sockcm_cookie sockc;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int offset = 0;
        int vnet_hdr_len;
@@ -2449,6 +2677,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_unlock;
 
+       sockc.mark = sk->sk_mark;
+       if (msg->msg_controllen) {
+               err = sock_cmsg_send(sk, msg, &sockc);
+               if (unlikely(err))
+                       goto out_unlock;
+       }
+
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;
        if (po->has_vnet_hdr) {
@@ -2466,15 +2701,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
                        goto out_unlock;
 
                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
-                   (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
-                    __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
-                     __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
-                       vnet_hdr.hdr_len = __cpu_to_virtio16(false,
-                                __virtio16_to_cpu(false, vnet_hdr.csum_start) +
-                               __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
+                   (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) +
+                    __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 >
+                     __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len)))
+                       vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(),
+                                __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) +
+                               __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2);
 
                err = -EINVAL;
-               if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
+               if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len)
                        goto out_unlock;
 
                if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
@@ -2517,7 +2752,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
        hlen = LL_RESERVED_SPACE(dev);
        tlen = dev->needed_tailroom;
        skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
-                              __virtio16_to_cpu(false, vnet_hdr.hdr_len),
+                              __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),
                               msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;
@@ -2541,31 +2776,23 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
        sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 
-       if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
-               /* Earlier code assumed this would be a VLAN pkt,
-                * double-check this now that we have the actual
-                * packet in hand.
-                */
-               struct ethhdr *ehdr;
-               skb_reset_mac_header(skb);
-               ehdr = eth_hdr(skb);
-               if (ehdr->h_proto != htons(ETH_P_8021Q)) {
-                       err = -EMSGSIZE;
-                       goto out_free;
-               }
+       if (!gso_type && (len > dev->mtu + reserve + extra_len) &&
+           !packet_extra_vlan_len_allowed(dev, skb)) {
+               err = -EMSGSIZE;
+               goto out_free;
        }
 
        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
-       skb->mark = sk->sk_mark;
+       skb->mark = sockc.mark;
 
        packet_pick_tx_queue(dev, skb);
 
        if (po->has_vnet_hdr) {
                if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
-                       u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
-                       u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
+                       u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start);
+                       u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset);
                        if (!skb_partial_csum_set(skb, s, o)) {
                                err = -EINVAL;
                                goto out_free;
@@ -2573,7 +2800,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
                }
 
                skb_shinfo(skb)->gso_size =
-                       __virtio16_to_cpu(false, vnet_hdr.gso_size);
+                       __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size);
                skb_shinfo(skb)->gso_type = gso_type;
 
                /* Header must be checked, and gso_segs computed. */
@@ -2583,8 +2810,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
                len += vnet_hdr_len;
        }
 
-       if (!packet_use_direct_xmit(po))
-               skb_probe_transport_header(skb, reserve);
+       skb_probe_transport_header(skb, reserve);
+
        if (unlikely(extra_len == 4))
                skb->no_fcs = 1;
 
@@ -2687,22 +2914,40 @@ static int packet_release(struct socket *sock)
  *     Attach a packet hook.
  */
 
-static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
+static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
+                         __be16 proto)
 {
        struct packet_sock *po = pkt_sk(sk);
        struct net_device *dev_curr;
        __be16 proto_curr;
        bool need_rehook;
+       struct net_device *dev = NULL;
+       int ret = 0;
+       bool unlisted = false;
 
-       if (po->fanout) {
-               if (dev)
-                       dev_put(dev);
-
+       if (po->fanout)
                return -EINVAL;
-       }
 
        lock_sock(sk);
        spin_lock(&po->bind_lock);
+       rcu_read_lock();
+
+       if (name) {
+               dev = dev_get_by_name_rcu(sock_net(sk), name);
+               if (!dev) {
+                       ret = -ENODEV;
+                       goto out_unlock;
+               }
+       } else if (ifindex) {
+               dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
+               if (!dev) {
+                       ret = -ENODEV;
+                       goto out_unlock;
+               }
+       }
+
+       if (dev)
+               dev_hold(dev);
 
        proto_curr = po->prot_hook.type;
        dev_curr = po->prot_hook.dev;
@@ -2710,14 +2955,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
        need_rehook = proto_curr != proto || dev_curr != dev;
 
        if (need_rehook) {
-               unregister_prot_hook(sk, true);
+               if (po->running) {
+                       rcu_read_unlock();
+                       __unregister_prot_hook(sk, true);
+                       rcu_read_lock();
+                       dev_curr = po->prot_hook.dev;
+                       if (dev)
+                               unlisted = !dev_get_by_index_rcu(sock_net(sk),
+                                                                dev->ifindex);
+               }
 
                po->num = proto;
                po->prot_hook.type = proto;
-               po->prot_hook.dev = dev;
 
-               po->ifindex = dev ? dev->ifindex : 0;
-               packet_cached_dev_assign(po, dev);
+               if (unlikely(unlisted)) {
+                       dev_put(dev);
+                       po->prot_hook.dev = NULL;
+                       po->ifindex = -1;
+                       packet_cached_dev_reset(po);
+               } else {
+                       po->prot_hook.dev = dev;
+                       po->ifindex = dev ? dev->ifindex : 0;
+                       packet_cached_dev_assign(po, dev);
+               }
        }
        if (dev_curr)
                dev_put(dev_curr);
@@ -2725,7 +2985,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
        if (proto == 0 || !need_rehook)
                goto out_unlock;
 
-       if (!dev || (dev->flags & IFF_UP)) {
+       if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
                register_prot_hook(sk);
        } else {
                sk->sk_err = ENETDOWN;
@@ -2734,9 +2994,10 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
        }
 
 out_unlock:
+       rcu_read_unlock();
        spin_unlock(&po->bind_lock);
        release_sock(sk);
-       return 0;
+       return ret;
 }
 
 /*
@@ -2748,8 +3009,6 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
 {
        struct sock *sk = sock->sk;
        char name[15];
-       struct net_device *dev;
-       int err = -ENODEV;
 
        /*
         *      Check legality
@@ -2759,19 +3018,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));
 
-       dev = dev_get_by_name(sock_net(sk), name);
-       if (dev)
-               err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
-       return err;
+       return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
 }
 
 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
-       struct net_device *dev = NULL;
-       int err;
-
 
        /*
         *      Check legality
@@ -2782,16 +3035,8 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;
 
-       if (sll->sll_ifindex) {
-               err = -ENODEV;
-               dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
-               if (dev == NULL)
-                       goto out;
-       }
-       err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
-
-out:
-       return err;
+       return packet_do_bind(sk, NULL, sll->sll_ifindex,
+                             sll->sll_protocol ? : pkt_sk(sk)->num);
 }
 
 static struct proto packet_proto = {
@@ -2821,7 +3066,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
        sock->state = SS_UNCONNECTED;
 
        err = -ENOBUFS;
-       sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
+       sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
        if (sk == NULL)
                goto out;
 
@@ -2851,6 +3096,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 
        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
+       po->rollover = NULL;
        po->prot_hook.func = packet_rcv;
 
        if (sock->type == SOCK_PACKET)
@@ -2928,6 +3174,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
        if (skb == NULL)
                goto out;
 
+       if (pkt_sk(sk)->pressure)
+               packet_rcv_has_room(pkt_sk(sk), NULL);
+
        if (pkt_sk(sk)->has_vnet_hdr) {
                struct virtio_net_hdr vnet_hdr = { 0 };
 
@@ -2943,9 +3192,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
                        /* This is a hint as to how much should be linear. */
                        vnet_hdr.hdr_len =
-                               __cpu_to_virtio16(false, skb_headlen(skb));
+                               __cpu_to_virtio16(vio_le(), skb_headlen(skb));
                        vnet_hdr.gso_size =
-                               __cpu_to_virtio16(false, sinfo->gso_size);
+                               __cpu_to_virtio16(vio_le(), sinfo->gso_size);
                        if (sinfo->gso_type & SKB_GSO_TCPV4)
                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                        else if (sinfo->gso_type & SKB_GSO_TCPV6)
@@ -2963,9 +3212,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-                       vnet_hdr.csum_start = __cpu_to_virtio16(false,
+                       vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(),
                                          skb_checksum_start_offset(skb));
-                       vnet_hdr.csum_offset = __cpu_to_virtio16(false,
+                       vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(),
                                                         skb->csum_offset);
                } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
                        vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
@@ -3432,6 +3681,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
                return fanout_add(sk, val & 0xffff, val >> 16);
        }
+       case PACKET_FANOUT_DATA:
+       {
+               if (!po->fanout)
+                       return -EINVAL;
+
+               return fanout_set_data(po, optval, optlen);
+       }
        case PACKET_TX_HAS_OFF:
        {
                unsigned int val;
@@ -3471,6 +3727,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
        struct packet_sock *po = pkt_sk(sk);
        void *data = &val;
        union tpacket_stats_u st;
+       struct tpacket_rollover_stats rstats;
 
        if (level != SOL_PACKET)
                return -ENOPROTOOPT;
@@ -3546,6 +3803,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
                        ((u32)po->fanout->flags << 24)) :
                       0);
                break;
+       case PACKET_ROLLOVER_STATS:
+               if (!po->rollover)
+                       return -EINVAL;
+               rstats.tp_all = atomic_long_read(&po->rollover->num);
+               rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
+               rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
+               data = &rstats;
+               lv = sizeof(rstats);
+               break;
        case PACKET_TX_HAS_OFF:
                val = po->tp_tx_has_off;
                break;
@@ -3683,6 +3949,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
                        TP_STATUS_KERNEL))
                        mask |= POLLIN | POLLRDNORM;
        }
+       if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+               po->pressure = 0;
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (po->tx_ring.pg_vec) {
@@ -3842,7 +4110,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                err = -EINVAL;
                if (unlikely((int)req->tp_block_size <= 0))
                        goto out;
-               if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
+               if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
                        goto out;
                if (po->tp_version >= TPACKET_V3 &&
                    (int)(req->tp_block_size -
@@ -3854,8 +4122,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
                        goto out;
 
-               rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
-               if (unlikely(rb->frames_per_block <= 0))
+               rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
+               if (unlikely(rb->frames_per_block == 0))
                        goto out;
                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                                        req->tp_frame_nr))
@@ -3872,7 +4140,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                 * it above but just being paranoid
                 */
                        if (!tx_ring)
-                               init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
+                               init_prb_bdqc(po, rb, pg_vec, req_u);
                        break;
                default:
                        break;
@@ -3932,7 +4200,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
        if (closing && (po->tp_version > TPACKET_V2)) {
                /* Because we don't support block-based V3 on tx-ring */
                if (!tx_ring)
-                       prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
+                       prb_shutdown_retire_blk_timer(po, rb_queue);
        }
        release_sock(sk);