These changes are the raw update of the kernel sources to linux-4.4.6-rt14.
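For orientation, several hunks below stop reading sk->sk_state directly and instead go through the sk_state_load()/sk_state_store() helpers, so that lockless readers such as tcp_poll() and tcp_get_info() see a consistent socket state. A minimal sketch of that pattern follows; it assumes the helpers wrap smp_load_acquire()/smp_store_release() as in include/net/sock.h of this kernel series, and is illustrative only:

    /* Sketch only -- assumed acquire/release pairing for sk->sk_state,
     * matching the sk_state_load()/sk_state_store() helpers used below.
     */
    static inline int sk_state_load(const struct sock *sk)
    {
            /* lockless readers, e.g. tcp_poll(), tcp_get_info() */
            return smp_load_acquire(&sk->sk_state);
    }

    static inline void sk_state_store(struct sock *sk, int newstate)
    {
            /* writer side, e.g. tcp_set_state() */
            smp_store_release(&sk->sk_state, newstate);
    }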
[kvmfornfv.git] kernel/net/ipv4/tcp.c
index bb2ce74..036a76b 100644
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <asm/unaligned.h>
 #include <net/busy_poll.h>
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
@@ -388,6 +389,7 @@ void tcp_init_sock(struct sock *sk)
 
        icsk->icsk_rto = TCP_TIMEOUT_INIT;
        tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+       tp->rtt_min[0].rtt = ~0U;
 
        /* So many TCP implementations out there (incorrectly) count the
         * initial SYN frame in their delayed-ACK and congestion control
@@ -450,11 +452,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
        unsigned int mask;
        struct sock *sk = sock->sk;
        const struct tcp_sock *tp = tcp_sk(sk);
+       int state;
 
        sock_rps_record_flow(sk);
 
        sock_poll_wait(file, sk_sleep(sk), wait);
-       if (sk->sk_state == TCP_LISTEN)
+
+       state = sk_state_load(sk);
+       if (state == TCP_LISTEN)
                return inet_csk_listen_poll(sk);
 
        /* Socket is not locked. We are protected from async events
@@ -491,14 +496,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
         * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
         * blocking on fresh not-connected or disconnected socket. --ANK
         */
-       if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+       if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
                mask |= POLLHUP;
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 
        /* Connected or passive Fast Open socket? */
-       if (sk->sk_state != TCP_SYN_SENT &&
-           (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+       if (state != TCP_SYN_SENT &&
+           (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
                int target = sock_rcvlowat(sk, 0, INT_MAX);
 
                if (tp->urg_seq == tp->copied_seq &&
@@ -506,9 +511,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
                    tp->urg_data)
                        target++;
 
-               /* Potential race condition. If read of tp below will
-                * escape above sk->sk_state, we can be illegally awaken
-                * in SYN_* states. */
                if (tp->rcv_nxt - tp->copied_seq >= target)
                        mask |= POLLIN | POLLRDNORM;
 
@@ -516,8 +518,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
                        if (sk_stream_is_writeable(sk)) {
                                mask |= POLLOUT | POLLWRNORM;
                        } else {  /* send SIGIO later */
-                               set_bit(SOCK_ASYNC_NOSPACE,
-                                       &sk->sk_socket->flags);
+                               sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 
                                /* Race breaker. If space is freed after
@@ -627,6 +628,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb)
        sk_mem_charge(sk, skb->truesize);
        if (tp->nonagle & TCP_NAGLE_PUSH)
                tp->nonagle &= ~TCP_NAGLE_PUSH;
+
+       tcp_slow_start_after_idle_check(sk);
 }
 
 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
@@ -695,8 +698,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
        struct tcp_splice_state *tss = rd_desc->arg.data;
        int ret;
 
-       ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
-                             tss->flags);
+       ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
+                             min(rd_desc->count, len), tss->flags,
+                             skb_socket_splice);
        if (ret > 0)
                rd_desc->count -= ret;
        return ret;
@@ -779,7 +783,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
                                ret = -EAGAIN;
                                break;
                        }
-                       sk_wait_data(sk, &timeo);
+                       sk_wait_data(sk, &timeo, NULL);
                        if (signal_pending(current)) {
                                ret = sock_intr_errno(timeo);
                                break;
@@ -809,16 +813,28 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 }
 EXPORT_SYMBOL(tcp_splice_read);
 
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
+struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+                                   bool force_schedule)
 {
        struct sk_buff *skb;
 
        /* The TCP header must be at least 32-bit aligned.  */
        size = ALIGN(size, 4);
 
+       if (unlikely(tcp_under_memory_pressure(sk)))
+               sk_mem_reclaim_partial(sk);
+
        skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
-       if (skb) {
-               if (sk_wmem_schedule(sk, skb->truesize)) {
+       if (likely(skb)) {
+               bool mem_scheduled;
+
+               if (force_schedule) {
+                       mem_scheduled = true;
+                       sk_forced_mem_schedule(sk, skb->truesize);
+               } else {
+                       mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
+               }
+               if (likely(mem_scheduled)) {
                        skb_reserve(skb, sk->sk_prot->max_header);
                        /*
                         * Make sure that we have exactly size bytes
@@ -885,11 +901,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
         */
        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
            !tcp_passive_fastopen(sk)) {
-               if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+               err = sk_stream_wait_connect(sk, &timeo);
+               if (err != 0)
                        goto out_err;
        }
 
-       clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+       sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
        mss_now = tcp_send_mss(sk, &size_goal, flags);
        copied = 0;
@@ -908,7 +925,8 @@ new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_sndbuf;
 
-                       skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+                       skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+                                                 skb_queue_empty(&sk->sk_write_queue));
                        if (!skb)
                                goto wait_for_memory;
 
@@ -921,7 +939,7 @@ new_segment:
 
                i = skb_shinfo(skb)->nr_frags;
                can_coalesce = skb_can_coalesce(skb, i, page, offset);
-               if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+               if (!can_coalesce && i >= sysctl_max_skb_frags) {
                        tcp_mark_push(tp, skb);
                        goto new_segment;
                }
@@ -951,7 +969,8 @@ new_segment:
 
                copied += copy;
                offset += copy;
-               if (!(size -= copy)) {
+               size -= copy;
+               if (!size) {
                        tcp_tx_timestamp(sk, skb);
                        goto out;
                }
@@ -972,7 +991,8 @@ wait_for_memory:
                tcp_push(sk, flags & ~MSG_MORE, mss_now,
                         TCP_NAGLE_PUSH, size_goal);
 
-               if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+               err = sk_stream_wait_memory(sk, &timeo);
+               if (err != 0)
                        goto do_error;
 
                mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -987,6 +1007,9 @@ do_error:
        if (copied)
                goto out;
 out_err:
+       /* make sure we wake any epoll edge trigger waiter */
+       if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+               sk->sk_write_space(sk);
        return sk_stream_error(sk, flags, err);
 }
 
@@ -1092,7 +1115,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
         */
        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
            !tcp_passive_fastopen(sk)) {
-               if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+               err = sk_stream_wait_connect(sk, &timeo);
+               if (err != 0)
                        goto do_error;
        }
 
@@ -1110,7 +1134,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
        }
 
        /* This should be in poll */
-       clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+       sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
        mss_now = tcp_send_mss(sk, &size_goal, flags);
 
@@ -1144,7 +1168,8 @@ new_segment:
 
                        skb = sk_stream_alloc_skb(sk,
                                                  select_size(sk, sg),
-                                                 sk->sk_allocation);
+                                                 sk->sk_allocation,
+                                                 skb_queue_empty(&sk->sk_write_queue));
                        if (!skb)
                                goto wait_for_memory;
 
@@ -1187,7 +1212,7 @@ new_segment:
 
                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
-                               if (i == MAX_SKB_FRAGS || !sg) {
+                               if (i == sysctl_max_skb_frags || !sg) {
                                        tcp_mark_push(tp, skb);
                                        goto new_segment;
                                }
@@ -1247,7 +1272,8 @@ wait_for_memory:
                        tcp_push(sk, flags & ~MSG_MORE, mss_now,
                                 TCP_NAGLE_PUSH, size_goal);
 
-               if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+               err = sk_stream_wait_memory(sk, &timeo);
+               if (err != 0)
                        goto do_error;
 
                mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1275,6 +1301,9 @@ do_error:
                goto out;
 out_err:
        err = sk_stream_error(sk, flags, err);
+       /* make sure we wake any epoll edge trigger waiter */
+       if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+               sk->sk_write_space(sk);
        release_sock(sk);
        return err;
 }
@@ -1554,7 +1583,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
        int target;             /* Read at least this many bytes */
        long timeo;
        struct task_struct *user_recv = NULL;
-       struct sk_buff *skb;
+       struct sk_buff *skb, *last;
        u32 urg_hole = 0;
 
        if (unlikely(flags & MSG_ERRQUEUE))
@@ -1614,7 +1643,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
                /* Next get a buffer. */
 
+               last = skb_peek_tail(&sk->sk_receive_queue);
                skb_queue_walk(&sk->sk_receive_queue, skb) {
+                       last = skb;
                        /* Now that we have two receive queues this
                         * shouldn't happen.
                         */
@@ -1733,15 +1764,17 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                        /* Do not sleep, just process backlog. */
                        release_sock(sk);
                        lock_sock(sk);
-               } else
-                       sk_wait_data(sk, &timeo);
+               } else {
+                       sk_wait_data(sk, &timeo, last);
+               }
 
                if (user_recv) {
                        int chunk;
 
                        /* __ Restore normal policy in scheduler __ */
 
-                       if ((chunk = len - tp->ucopy.len) != 0) {
+                       chunk = len - tp->ucopy.len;
+                       if (chunk != 0) {
                                NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
                                len -= chunk;
                                copied += chunk;
@@ -1752,7 +1785,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 do_prequeue:
                                tcp_prequeue_process(sk);
 
-                               if ((chunk = len - tp->ucopy.len) != 0) {
+                               chunk = len - tp->ucopy.len;
+                               if (chunk != 0) {
                                        NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
                                        len -= chunk;
                                        copied += chunk;
@@ -1900,7 +1934,7 @@ void tcp_set_state(struct sock *sk, int state)
        /* Change state AFTER socket is unhashed to avoid closed
         * socket sitting in hash tables.
         */
-       sk->sk_state = state;
+       sk_state_store(sk, state);
 
 #ifdef STATE_TRACE
        SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
@@ -2204,7 +2238,8 @@ int tcp_disconnect(struct sock *sk, int flags)
        sk->sk_shutdown = 0;
        sock_reset_flag(sk, SOCK_DONE);
        tp->srtt_us = 0;
-       if ((tp->write_seq += tp->max_window + 2) == 0)
+       tp->write_seq += tp->max_window + 2;
+       if (tp->write_seq == 0)
                tp->write_seq = 1;
        icsk->icsk_backoff = 0;
        tp->snd_cwnd = 2;
@@ -2227,13 +2262,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 }
 EXPORT_SYMBOL(tcp_disconnect);
 
-void tcp_sock_destruct(struct sock *sk)
-{
-       inet_sock_destruct(sk);
-
-       kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
-}
-
 static inline bool tcp_can_repair_sock(const struct sock *sk)
 {
        return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
@@ -2483,6 +2511,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                        icsk->icsk_syn_retries = val;
                break;
 
+       case TCP_SAVE_SYN:
+               if (val < 0 || val > 1)
+                       err = -EINVAL;
+               else
+                       tp->save_syn = val;
+               break;
+
        case TCP_LINGER2:
                if (val < 0)
                        tp->linger2 = -1;
@@ -2548,7 +2583,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                    TCPF_LISTEN))) {
                        tcp_fastopen_init_key_once(true);
 
-                       err = fastopen_init_queue(sk, val);
+                       fastopen_queue_tune(sk, val);
                } else {
                        err = -EINVAL;
                }
@@ -2599,15 +2634,19 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
 /* Return information about state of tcp endpoint in API format. */
 void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
-       const struct tcp_sock *tp = tcp_sk(sk);
+       const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
        const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 now = tcp_time_stamp;
        unsigned int start;
+       u64 rate64;
        u32 rate;
 
        memset(info, 0, sizeof(*info));
+       if (sk->sk_type != SOCK_STREAM)
+               return;
+
+       info->tcpi_state = sk_state_load(sk);
 
-       info->tcpi_state = sk->sk_state;
        info->tcpi_ca_state = icsk->icsk_ca_state;
        info->tcpi_retransmits = icsk->icsk_retransmits;
        info->tcpi_probes = icsk->icsk_probes_out;
@@ -2635,7 +2674,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
        info->tcpi_snd_mss = tp->mss_cache;
        info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
 
-       if (sk->sk_state == TCP_LISTEN) {
+       if (info->tcpi_state == TCP_LISTEN) {
                info->tcpi_unacked = sk->sk_ack_backlog;
                info->tcpi_sacked = sk->sk_max_ack_backlog;
        } else {
@@ -2665,16 +2704,20 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
        info->tcpi_total_retrans = tp->total_retrans;
 
        rate = READ_ONCE(sk->sk_pacing_rate);
-       info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+       rate64 = rate != ~0U ? rate : ~0ULL;
+       put_unaligned(rate64, &info->tcpi_pacing_rate);
 
        rate = READ_ONCE(sk->sk_max_pacing_rate);
-       info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+       rate64 = rate != ~0U ? rate : ~0ULL;
+       put_unaligned(rate64, &info->tcpi_max_pacing_rate);
 
        do {
                start = u64_stats_fetch_begin_irq(&tp->syncp);
-               info->tcpi_bytes_acked = tp->bytes_acked;
-               info->tcpi_bytes_received = tp->bytes_received;
+               put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+               put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
        } while (u64_stats_fetch_retry_irq(&tp->syncp, start));
+       info->tcpi_segs_out = tp->segs_out;
+       info->tcpi_segs_in = tp->segs_in;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
@@ -2812,10 +2855,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                break;
 
        case TCP_FASTOPEN:
-               if (icsk->icsk_accept_queue.fastopenq)
-                       val = icsk->icsk_accept_queue.fastopenq->max_qlen;
-               else
-                       val = 0;
+               val = icsk->icsk_accept_queue.fastopenq.max_qlen;
                break;
 
        case TCP_TIMESTAMP:
@@ -2824,6 +2864,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
        case TCP_NOTSENT_LOWAT:
                val = tp->notsent_lowat;
                break;
+       case TCP_SAVE_SYN:
+               val = tp->save_syn;
+               break;
+       case TCP_SAVED_SYN: {
+               if (get_user(len, optlen))
+                       return -EFAULT;
+
+               lock_sock(sk);
+               if (tp->saved_syn) {
+                       if (len < tp->saved_syn[0]) {
+                               if (put_user(tp->saved_syn[0], optlen)) {
+                                       release_sock(sk);
+                                       return -EFAULT;
+                               }
+                               release_sock(sk);
+                               return -EINVAL;
+                       }
+                       len = tp->saved_syn[0];
+                       if (put_user(len, optlen)) {
+                               release_sock(sk);
+                               return -EFAULT;
+                       }
+                       if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+                               release_sock(sk);
+                               return -EFAULT;
+                       }
+                       tcp_saved_syn_free(tp);
+                       release_sock(sk);
+               } else {
+                       release_sock(sk);
+                       len = 0;
+                       if (put_user(len, optlen))
+                               return -EFAULT;
+               }
+               return 0;
+       }
        default:
                return -ENOPROTOOPT;
        }
@@ -3028,11 +3104,12 @@ __setup("thash_entries=", set_thash_entries);
 
 static void __init tcp_init_mem(void)
 {
-       unsigned long limit = nr_free_buffer_pages() / 8;
+       unsigned long limit = nr_free_buffer_pages() / 16;
+
        limit = max(limit, 128UL);
-       sysctl_tcp_mem[0] = limit / 4 * 3;
-       sysctl_tcp_mem[1] = limit;
-       sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+       sysctl_tcp_mem[0] = limit / 4 * 3;              /* 4.68 % */
+       sysctl_tcp_mem[1] = limit;                      /* 6.25 % */
+       sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;      /* 9.37 % */
 }
 
 void __init tcp_init(void)