These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/net/ipv4/inet_hashtables.c
index c6fb80b..ccc5980 100644
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
+#include <linux/vmalloc.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
@@ -90,10 +91,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
                    const unsigned short snum)
 {
-       struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-
-       atomic_inc(&hashinfo->bsockets);
-
        inet_sk(sk)->inet_num = snum;
        sk_add_bind_node(sk, &tb->owners);
        tb->num_owners++;
@@ -111,8 +108,6 @@ static void __inet_put_port(struct sock *sk)
        struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
        struct inet_bind_bucket *tb;
 
-       atomic_dec(&hashinfo->bsockets);
-
        spin_lock(&head->lock);
        tb = inet_csk(sk)->icsk_bind_hash;
        __sk_del_bind_node(sk);
@@ -131,7 +126,7 @@ void inet_put_port(struct sock *sk)
 }
 EXPORT_SYMBOL(inet_put_port);
 
-int __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(const struct sock *sk, struct sock *child)
 {
        struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
        unsigned short port = inet_sk(child)->inet_num;
@@ -142,6 +137,10 @@ int __inet_inherit_port(struct sock *sk, struct sock *child)
 
        spin_lock(&head->lock);
        tb = inet_csk(sk)->icsk_bind_hash;
+       if (unlikely(!tb)) {
+               spin_unlock(&head->lock);
+               return -ENOENT;
+       }
        if (tb->port != port) {
                /* NOTE: using tproxy and redirecting skbs to a proxy
                 * on a different listener port breaks the assumption
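
The new guard above covers the case where icsk_bind_hash has already gone away by the time the child tries to inherit the port: the pointer is only meaningful while the bucket lock is held, and the error path now drops that lock before returning -ENOENT instead of dereferencing NULL. A generic userspace sketch of that check-under-lock / unlock-on-error shape (plain pthreads, not kernel code):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct bucket {
        pthread_mutex_t lock;
        void *binding;                  /* may be NULL if the owner went away */
};

/* Check a pointer that is only stable while the bucket lock is held,
 * and make sure the error path releases the lock before returning.
 */
static int inherit(struct bucket *b)
{
        pthread_mutex_lock(&b->lock);
        if (!b->binding) {
                pthread_mutex_unlock(&b->lock);
                return -ENOENT;
        }
        /* ... use b->binding while still holding the lock ... */
        pthread_mutex_unlock(&b->lock);
        return 0;
}

int main(void)
{
        static struct bucket b = { PTHREAD_MUTEX_INITIALIZER, NULL };

        printf("inherit() = %d\n", inherit(&b));
        return 0;
}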
@@ -190,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
                                return -1;
                        score += 4;
                }
+               if (sk->sk_incoming_cpu == raw_smp_processor_id())
+                       score++;
        }
        return score;
 }
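
The two added lines give the lookup a tiebreak on CPU locality: among otherwise equally good listening sockets (for example in a SO_REUSEPORT group), the one whose recorded sk_incoming_cpu matches the CPU handling this packet now scores one point higher. A minimal userspace sketch of the scoring idea; the weights and field names are illustrative, not the kernel's compute_score():

#include <stdio.h>

struct listener {
        int bound_to_exact_addr;        /* scored higher, like the +4 for an exact rcv_saddr match */
        int incoming_cpu;               /* CPU that last delivered traffic to this socket */
};

/* Toy score: exact-address match dominates, CPU locality breaks ties. */
static int score(const struct listener *l, int current_cpu)
{
        int s = 1;

        if (l->bound_to_exact_addr)
                s += 4;
        if (l->incoming_cpu == current_cpu)
                s++;                    /* the tiebreak added in this patch */
        return s;
}

int main(void)
{
        struct listener a = { 1, 0 }, b = { 1, 3 };
        int cpu = 3;

        /* b wins only because its incoming_cpu matches the current CPU */
        printf("a=%d b=%d\n", score(&a, cpu), score(&b, cpu));
        return 0;
}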
@@ -348,7 +349,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
        struct sock *sk2;
        const struct hlist_nulls_node *node;
        struct inet_timewait_sock *tw = NULL;
-       int twrefcnt = 0;
 
        spin_lock(lock);
 
@@ -376,21 +376,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
        WARN_ON(!sk_unhashed(sk));
        __sk_nulls_add_node_rcu(sk, &head->chain);
        if (tw) {
-               twrefcnt = inet_twsk_unhash(tw);
+               sk_nulls_del_node_init_rcu((struct sock *)tw);
                NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
        }
        spin_unlock(lock);
-       if (twrefcnt)
-               inet_twsk_put(tw);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 
        if (twp) {
                *twp = tw;
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
-               inet_twsk_deschedule(tw);
-
-               inet_twsk_put(tw);
+               inet_twsk_deschedule_put(tw);
        }
        return 0;
 
@@ -399,23 +395,27 @@ not_unique:
        return -EADDRNOTAVAIL;
 }
 
-static inline u32 inet_sk_port_offset(const struct sock *sk)
+static u32 inet_sk_port_offset(const struct sock *sk)
 {
        const struct inet_sock *inet = inet_sk(sk);
+
        return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
                                          inet->inet_daddr,
                                          inet->inet_dport);
 }
 
-int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
+/* Insert a socket into ehash, and possibly remove another one
+ * (the other one can be a SYN_RECV or TIMEWAIT socket).
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk)
 {
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
        struct hlist_nulls_head *list;
        struct inet_ehash_bucket *head;
        spinlock_t *lock;
-       int twrefcnt = 0;
+       bool ret = true;
 
-       WARN_ON(!sk_unhashed(sk));
+       WARN_ON_ONCE(!sk_unhashed(sk));
 
        sk->sk_hash = sk_ehashfn(sk);
        head = inet_ehash_bucket(hashinfo, sk->sk_hash);
@@ -423,25 +423,41 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
        lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
        spin_lock(lock);
-       __sk_nulls_add_node_rcu(sk, list);
-       if (tw) {
-               WARN_ON(sk->sk_hash != tw->tw_hash);
-               twrefcnt = inet_twsk_unhash(tw);
+       if (osk) {
+               WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+               ret = sk_nulls_del_node_init_rcu(osk);
        }
+       if (ret)
+               __sk_nulls_add_node_rcu(sk, list);
        spin_unlock(lock);
-       sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-       return twrefcnt;
+       return ret;
 }
-EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 
-int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
+{
+       bool ok = inet_ehash_insert(sk, osk);
+
+       if (ok) {
+               sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+       } else {
+               percpu_counter_inc(sk->sk_prot->orphan_count);
+               sk->sk_state = TCP_CLOSE;
+               sock_set_flag(sk, SOCK_DEAD);
+               inet_csk_destroy_sock(sk);
+       }
+       return ok;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
+
+void __inet_hash(struct sock *sk, struct sock *osk)
 {
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
        struct inet_listen_hashbucket *ilb;
 
-       if (sk->sk_state != TCP_LISTEN)
-               return __inet_hash_nolisten(sk, tw);
-
+       if (sk->sk_state != TCP_LISTEN) {
+               inet_ehash_nolisten(sk, osk);
+               return;
+       }
        WARN_ON(!sk_unhashed(sk));
        ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
@@ -449,7 +465,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw)
        __sk_nulls_add_node_rcu(sk, &ilb->head);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        spin_unlock(&ilb->lock);
-       return 0;
 }
 EXPORT_SYMBOL(__inet_hash);
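
Taken together, the new helpers split the old __inet_hash_nolisten() logic: inet_ehash_insert() unlinks the old entry osk (a SYN_RECV or TIMEWAIT socket occupying the same slot) and links the new socket only if that unlink succeeded, inet_ehash_nolisten() either accounts the successfully hashed socket or marks the loser dead and destroys it, and __inet_hash() keeps only the listener path. A simplified userspace analogy of the replace-or-fail step, with a mutex-protected singly linked list standing in for the bucket spinlock and RCU nulls chain:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static struct node *bucket;             /* one hash chain */
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

/* Unlink old (if given); link new only when the unlink succeeded. */
static bool replace_or_fail(struct node *new, struct node *old)
{
        bool ret = true;

        pthread_mutex_lock(&bucket_lock);
        if (old) {
                struct node **pp = &bucket;

                while (*pp && *pp != old)
                        pp = &(*pp)->next;
                if (*pp)
                        *pp = old->next;        /* unlinked the stale entry */
                else
                        ret = false;            /* someone else removed it first */
        }
        if (ret) {
                new->next = bucket;
                bucket = new;
        }
        pthread_mutex_unlock(&bucket_lock);
        return ret;
}

int main(void)
{
        struct node tw = { NULL, 1 }, est = { NULL, 2 };

        bucket = &tw;
        printf("inserted: %d\n", replace_or_fail(&est, &tw));
        return 0;
}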
 
@@ -496,7 +511,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        struct inet_bind_bucket *tb;
        int ret;
        struct net *net = sock_net(sk);
-       int twrefcnt = 1;
 
        if (!snum) {
                int i, remaining, low, high, port;
@@ -507,8 +521,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                inet_get_local_port_range(net, &low, &high);
                remaining = (high - low) + 1;
 
+               /* By starting with offset being an even number,
+                * we tend to leave about 50% of ports for other uses,
+                * like bind(0).
+                */
+               offset &= ~1;
+
                local_bh_disable();
-               for (i = 1; i <= remaining; i++) {
+               for (i = 0; i < remaining; i++) {
                        port = low + (i + offset) % remaining;
                        if (inet_is_local_reserved_port(net, port))
                                continue;
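
Forcing the starting offset even here, combined with the hint += (i + 2) & ~1 advance in the next hunk, keeps successive autobind searches starting at even displacements into the port range; as long as the first probe is usually free, connect() consumes every other port and leaves the rest for bind(0), as the comment says. A small standalone demo of that arithmetic, assuming every first probe succeeds (the port range and per-destination offset are made-up values, and the shared hint/offset bookkeeping is simplified):

#include <stdio.h>

int main(void)
{
        unsigned int hint = 0, port_offset = 12345;     /* per-destination offset */
        unsigned int low = 32768, high = 60999;
        unsigned int remaining = high - low + 1;
        int n;

        /* Assume the first candidate is always free: i == 0 on every attempt. */
        for (n = 0; n < 5; n++) {
                unsigned int offset = (hint + port_offset) & ~1u;       /* start even */
                unsigned int i = 0;                                     /* first probe succeeds */
                unsigned int port = low + (i + offset) % remaining;

                printf("connect %d -> port %u\n", n, port);
                hint += (i + 2) & ~1u;  /* advance by 2, keep the next start even */
        }
        return 0;
}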
@@ -552,25 +572,20 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                return -EADDRNOTAVAIL;
 
 ok:
-               hint += i;
+               hint += (i + 2) & ~1;
 
                /* Head lock still held and bh's disabled */
                inet_bind_hash(sk, tb, port);
                if (sk_unhashed(sk)) {
                        inet_sk(sk)->inet_sport = htons(port);
-                       twrefcnt += __inet_hash_nolisten(sk, tw);
+                       inet_ehash_nolisten(sk, (struct sock *)tw);
                }
                if (tw)
-                       twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
+                       inet_twsk_bind_unhash(tw, hinfo);
                spin_unlock(&head->lock);
 
-               if (tw) {
-                       inet_twsk_deschedule(tw);
-                       while (twrefcnt) {
-                               twrefcnt--;
-                               inet_twsk_put(tw);
-                       }
-               }
+               if (tw)
+                       inet_twsk_deschedule_put(tw);
 
                ret = 0;
                goto out;
@@ -580,7 +595,7 @@ ok:
        tb  = inet_csk(sk)->icsk_bind_hash;
        spin_lock_bh(&head->lock);
        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-               __inet_hash_nolisten(sk, NULL);
+               inet_ehash_nolisten(sk, NULL);
                spin_unlock_bh(&head->lock);
                return 0;
        } else {
@@ -599,7 +614,11 @@ out:
 int inet_hash_connect(struct inet_timewait_death_row *death_row,
                      struct sock *sk)
 {
-       return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
+       u32 port_offset = 0;
+
+       if (!inet_sk(sk)->inet_num)
+               port_offset = inet_sk_port_offset(sk);
+       return __inet_hash_connect(death_row, sk, port_offset,
                                   __inet_check_established);
 }
 EXPORT_SYMBOL_GPL(inet_hash_connect);
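
inet_hash_connect() now bothers computing a port offset only when the socket has no local port yet; if inet_num is already set, __inet_hash_connect() takes the fixed-port path and the offset is never used. inet_sk_port_offset() itself (made out-of-line above) derives that starting offset from a keyed hash of source address, destination address and destination port, so each destination begins its ephemeral-port walk at a different place. A rough userspace illustration of that idea, with a toy mixing function standing in for the kernel's secure_ipv4_port_ephemeral():

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for a keyed hash; the kernel uses a cryptographic
 * hash seeded with a boot-time secret (secure_ipv4_port_ephemeral()).
 */
static uint32_t toy_mix(uint32_t a, uint32_t b, uint32_t c, uint32_t key)
{
        uint32_t h = key;

        h ^= a; h *= 0x9e3779b1u;
        h ^= b; h *= 0x85ebca77u;
        h ^= c; h *= 0xc2b2ae3du;
        return h ^ (h >> 16);
}

int main(void)
{
        uint32_t saddr = 0x0a000001;            /* 10.0.0.1 */
        uint32_t daddr = 0x0a000002;            /* 10.0.0.2 */
        uint32_t dport = 443;
        uint32_t key = 0x12345678;              /* per-boot secret in the kernel */
        uint32_t low = 32768, count = 28232;    /* a typical ip_local_port_range */
        uint32_t offset = toy_mix(saddr, daddr, dport, key);

        /* First candidate ephemeral port for this particular destination */
        printf("start at port %u\n", (unsigned int)(low + (offset % count)));
        return 0;
}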
@@ -608,7 +627,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 {
        int i;
 
-       atomic_set(&h->bsockets, 0);
        for (i = 0; i < INET_LHTABLE_SIZE; i++) {
                spin_lock_init(&h->listening_hash[i].lock);
                INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
@@ -616,3 +634,32 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
                }
 }
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
+
+int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
+{
+       unsigned int locksz = sizeof(spinlock_t);
+       unsigned int i, nblocks = 1;
+
+       if (locksz != 0) {
+               /* allocate 2 cache lines or at least one spinlock per cpu */
+               nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
+               nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+
+               /* no more locks than number of hash buckets */
+               nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+               hashinfo->ehash_locks = kmalloc_array(nblocks, locksz,
+                                                     GFP_KERNEL | __GFP_NOWARN);
+               if (!hashinfo->ehash_locks)
+                       hashinfo->ehash_locks = vmalloc(nblocks * locksz);
+
+               if (!hashinfo->ehash_locks)
+                       return -ENOMEM;
+
+               for (i = 0; i < nblocks; i++)
+                       spin_lock_init(&hashinfo->ehash_locks[i]);
+       }
+       hashinfo->ehash_locks_mask = nblocks - 1;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
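
The new inet_ehash_locks_alloc() sizes the ehash lock array to roughly two cache lines of spinlocks per possible CPU, rounds that up to a power of two, caps it at the number of ehash buckets, and falls back to vmalloc() when the kmalloc_array() attempt fails. The sizing arithmetic is easy to sanity-check in isolation; the cache-line size, spinlock size, CPU count and bucket count below are example values:

#include <stdio.h>

/* Round v up to the next power of two (v > 0). */
static unsigned int roundup_pow_of_two(unsigned int v)
{
        v--;
        v |= v >> 1; v |= v >> 2; v |= v >> 4;
        v |= v >> 8; v |= v >> 16;
        return v + 1;
}

int main(void)
{
        unsigned int l1_cache_bytes = 64;       /* example L1_CACHE_BYTES */
        unsigned int locksz = 4;                /* example sizeof(spinlock_t), no debug options */
        unsigned int cpus = 8;                  /* example num_possible_cpus() */
        unsigned int ehash_buckets = 65536;     /* example ehash_mask + 1 */
        unsigned int nblocks;

        /* two cache lines of locks per CPU, at least one lock per CPU */
        nblocks = 2 * l1_cache_bytes / locksz;
        if (nblocks < 1)
                nblocks = 1;
        nblocks = roundup_pow_of_two(nblocks * cpus);

        /* never more locks than hash buckets */
        if (nblocks > ehash_buckets)
                nblocks = ehash_buckets;

        printf("nblocks = %u, mask = 0x%x\n", nblocks, nblocks - 1);
        return 0;
}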