Upgrade to 4.4.50-rt62
[kvmfornfv.git] / kernel / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 #include <linux/locallock.h>
66
67 #include <net/net_namespace.h>
68 #include <net/icmp.h>
69 #include <net/inet_hashtables.h>
70 #include <net/tcp.h>
71 #include <net/transp_v6.h>
72 #include <net/ipv6.h>
73 #include <net/inet_common.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78 #include <net/busy_poll.h>
79
80 #include <linux/inet.h>
81 #include <linux/ipv6.h>
82 #include <linux/stddef.h>
83 #include <linux/proc_fs.h>
84 #include <linux/seq_file.h>
85
86 #include <linux/crypto.h>
87 #include <linux/scatterlist.h>
88
89 int sysctl_tcp_tw_reuse __read_mostly;
90 int sysctl_tcp_low_latency __read_mostly;
91 EXPORT_SYMBOL(sysctl_tcp_low_latency);
92
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100
101 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104                                           ip_hdr(skb)->saddr,
105                                           tcp_hdr(skb)->dest,
106                                           tcp_hdr(skb)->source);
107 }
108
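/*
 * Note on the reuse check below: a TIME-WAIT socket may be recycled for a
 * new outgoing connection to the same address/port pair when it still has a
 * recent timestamp and either the caller passed twp == NULL or
 * sysctl_tcp_tw_reuse is set and the last timestamp is more than one second
 * old.  The new write_seq is placed beyond anything the old connection could
 * still have in flight; e.g. (hypothetical value) if tw_snd_nxt was 1000,
 * the new connection starts at 1000 + 65535 + 2 = 66537, so stray old
 * segments cannot alias new data.
 */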
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112         struct tcp_sock *tp = tcp_sk(sk);
113
114         /* With PAWS, it is safe from the viewpoint
115            of data integrity. Even without PAWS it is safe provided the sequence
116            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
117
118            Actually, the idea is close to VJ's one, only the timestamp cache is
119            held not per host but per port pair, and the TW bucket is used as the
120            state holder.
121
122            If the TW bucket has already been destroyed we fall back to VJ's
123            scheme and use the initial timestamp retrieved from the peer table.
124          */
125         if (tcptw->tw_ts_recent_stamp &&
126             (!twp || (sysctl_tcp_tw_reuse &&
127                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129                 if (tp->write_seq == 0)
130                         tp->write_seq = 1;
131                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
132                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133                 sock_hold(sktw);
134                 return 1;
135         }
136
137         return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140
141 /* This will initiate an outgoing connection. */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145         struct inet_sock *inet = inet_sk(sk);
146         struct tcp_sock *tp = tcp_sk(sk);
147         __be16 orig_sport, orig_dport;
148         __be32 daddr, nexthop;
149         struct flowi4 *fl4;
150         struct rtable *rt;
151         int err;
152         struct ip_options_rcu *inet_opt;
153
154         if (addr_len < sizeof(struct sockaddr_in))
155                 return -EINVAL;
156
157         if (usin->sin_family != AF_INET)
158                 return -EAFNOSUPPORT;
159
160         nexthop = daddr = usin->sin_addr.s_addr;
161         inet_opt = rcu_dereference_protected(inet->inet_opt,
162                                              sock_owned_by_user(sk));
163         if (inet_opt && inet_opt->opt.srr) {
164                 if (!daddr)
165                         return -EINVAL;
166                 nexthop = inet_opt->opt.faddr;
167         }
168
169         orig_sport = inet->inet_sport;
170         orig_dport = usin->sin_port;
171         fl4 = &inet->cork.fl.u.ip4;
172         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174                               IPPROTO_TCP,
175                               orig_sport, orig_dport, sk);
176         if (IS_ERR(rt)) {
177                 err = PTR_ERR(rt);
178                 if (err == -ENETUNREACH)
179                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180                 return err;
181         }
182
183         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184                 ip_rt_put(rt);
185                 return -ENETUNREACH;
186         }
187
188         if (!inet_opt || !inet_opt->opt.srr)
189                 daddr = fl4->daddr;
190
191         if (!inet->inet_saddr)
192                 inet->inet_saddr = fl4->saddr;
193         sk_rcv_saddr_set(sk, inet->inet_saddr);
194
195         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196                 /* Reset inherited state */
197                 tp->rx_opt.ts_recent       = 0;
198                 tp->rx_opt.ts_recent_stamp = 0;
199                 if (likely(!tp->repair))
200                         tp->write_seq      = 0;
201         }
202
203         if (tcp_death_row.sysctl_tw_recycle &&
204             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
205                 tcp_fetch_timewait_stamp(sk, &rt->dst);
206
207         inet->inet_dport = usin->sin_port;
208         sk_daddr_set(sk, daddr);
209
210         inet_csk(sk)->icsk_ext_hdr_len = 0;
211         if (inet_opt)
212                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
213
214         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
215
216         /* Socket identity is still unknown (sport may be zero).
217          * However, we set the state to SYN-SENT and, without releasing the
218          * socket lock, select a source port, enter ourselves into the hash
219          * tables and complete initialization after this.
220          */
221         tcp_set_state(sk, TCP_SYN_SENT);
222         err = inet_hash_connect(&tcp_death_row, sk);
223         if (err)
224                 goto failure;
225
226         sk_set_txhash(sk);
227
228         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
229                                inet->inet_sport, inet->inet_dport, sk);
230         if (IS_ERR(rt)) {
231                 err = PTR_ERR(rt);
232                 rt = NULL;
233                 goto failure;
234         }
235         /* OK, now commit destination to socket.  */
236         sk->sk_gso_type = SKB_GSO_TCPV4;
237         sk_setup_caps(sk, &rt->dst);
238
239         if (!tp->write_seq && likely(!tp->repair))
240                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
241                                                            inet->inet_daddr,
242                                                            inet->inet_sport,
243                                                            usin->sin_port);
244
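        /*
         * Seed the IPv4 Identification counter for this connection from the
         * initial sequence number and jiffies, so the ID field of the first
         * segments does not start at a trivially predictable value.
         */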
245         inet->inet_id = tp->write_seq ^ jiffies;
246
247         err = tcp_connect(sk);
248
249         rt = NULL;
250         if (err)
251                 goto failure;
252
253         return 0;
254
255 failure:
256         /*
257          * This unhashes the socket and releases the local port,
258          * if necessary.
259          */
260         tcp_set_state(sk, TCP_CLOSE);
261         ip_rt_put(rt);
262         sk->sk_route_caps = 0;
263         inet->inet_dport = 0;
264         return err;
265 }
266 EXPORT_SYMBOL(tcp_v4_connect);
267
268 /*
269  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
270  * It can be called through tcp_release_cb() if socket was owned by user
271  * at the time tcp_v4_err() was called to handle ICMP message.
272  */
273 void tcp_v4_mtu_reduced(struct sock *sk)
274 {
275         struct dst_entry *dst;
276         struct inet_sock *inet = inet_sk(sk);
277         u32 mtu = tcp_sk(sk)->mtu_info;
278
279         dst = inet_csk_update_pmtu(sk, mtu);
280         if (!dst)
281                 return;
282
283         /* Something is about to go wrong... Remember the soft error
284          * in case this connection will not be able to recover.
285          */
286         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
287                 sk->sk_err_soft = EMSGSIZE;
288
289         mtu = dst_mtu(dst);
290
291         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
292             ip_sk_accept_pmtu(sk) &&
293             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
294                 tcp_sync_mss(sk, mtu);
295
296                 /* Resend the TCP packet because it's
297                  * clear that the old packet has been
298                  * dropped. This is the new "fast" path mtu
299                  * discovery.
300                  */
301                 tcp_simple_retransmit(sk);
302         } /* else let the usual retransmit timer handle it */
303 }
304 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
305
306 static void do_redirect(struct sk_buff *skb, struct sock *sk)
307 {
308         struct dst_entry *dst = __sk_dst_check(sk, 0);
309
310         if (dst)
311                 dst->ops->redirect(dst, sk, skb);
312 }
313
314
315 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
316 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
317 {
318         struct request_sock *req = inet_reqsk(sk);
319         struct net *net = sock_net(sk);
320
321         /* ICMPs are not backlogged, hence we cannot get
322          * an established socket here.
323          */
324         if (seq != tcp_rsk(req)->snt_isn) {
325                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
326         } else if (abort) {
327                 /*
328                  * Still in SYN_RECV, just remove it silently.
329                  * There is no good way to pass the error to the newly
330                  * created socket, and POSIX does not want network
331                  * errors returned from accept().
332                  */
333                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
334                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
335         }
336         reqsk_put(req);
337 }
338 EXPORT_SYMBOL(tcp_req_err);
339
340 /*
341  * This routine is called by the ICMP module when it gets some
342  * sort of error condition.  If err < 0 then the socket should
343  * be closed and the error returned to the user.  If err > 0
344  * it's just the icmp type << 8 | icmp code.  After adjustment
345  * header points to the first 8 bytes of the tcp header.  We need
346  * to find the appropriate port.
347  *
348  * The locking strategy used here is very "optimistic". When
349  * someone else accesses the socket the ICMP is just dropped
350  * and for some paths there is no check at all.
351  * A more general error queue to queue errors for later handling
352  * is probably better.
353  *
354  */
355
356 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
357 {
358         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
359         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
360         struct inet_connection_sock *icsk;
361         struct tcp_sock *tp;
362         struct inet_sock *inet;
363         const int type = icmp_hdr(icmp_skb)->type;
364         const int code = icmp_hdr(icmp_skb)->code;
365         struct sock *sk;
366         struct sk_buff *skb;
367         struct request_sock *fastopen;
368         __u32 seq, snd_una;
369         __u32 remaining;
370         int err;
371         struct net *net = dev_net(icmp_skb->dev);
372
373         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
374                                        th->dest, iph->saddr, ntohs(th->source),
375                                        inet_iif(icmp_skb));
376         if (!sk) {
377                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
378                 return;
379         }
380         if (sk->sk_state == TCP_TIME_WAIT) {
381                 inet_twsk_put(inet_twsk(sk));
382                 return;
383         }
384         seq = ntohl(th->seq);
385         if (sk->sk_state == TCP_NEW_SYN_RECV)
386                 return tcp_req_err(sk, seq,
387                                   type == ICMP_PARAMETERPROB ||
388                                   type == ICMP_TIME_EXCEEDED ||
389                                   (type == ICMP_DEST_UNREACH &&
390                                    (code == ICMP_NET_UNREACH ||
391                                     code == ICMP_HOST_UNREACH)));
392
393         bh_lock_sock(sk);
394         /* If too many ICMPs get dropped on busy
395          * servers this needs to be solved differently.
396          * We do take care of the PMTU discovery (RFC1191) special case:
397          * we can receive locally generated ICMP messages while socket is held.
398          */
399         if (sock_owned_by_user(sk)) {
400                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
401                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
402         }
403         if (sk->sk_state == TCP_CLOSE)
404                 goto out;
405
406         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
407                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
408                 goto out;
409         }
410
411         icsk = inet_csk(sk);
412         tp = tcp_sk(sk);
413         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
414         fastopen = tp->fastopen_rsk;
415         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
416         if (sk->sk_state != TCP_LISTEN &&
417             !between(seq, snd_una, tp->snd_nxt)) {
418                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
419                 goto out;
420         }
421
422         switch (type) {
423         case ICMP_REDIRECT:
424                 do_redirect(icmp_skb, sk);
425                 goto out;
426         case ICMP_SOURCE_QUENCH:
427                 /* Just silently ignore these. */
428                 goto out;
429         case ICMP_PARAMETERPROB:
430                 err = EPROTO;
431                 break;
432         case ICMP_DEST_UNREACH:
433                 if (code > NR_ICMP_UNREACH)
434                         goto out;
435
436                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
437                         /* We are not interested in TCP_LISTEN and open_requests
438                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
439                          * they should go through unfragmented).
440                          */
441                         if (sk->sk_state == TCP_LISTEN)
442                                 goto out;
443
444                         tp->mtu_info = info;
445                         if (!sock_owned_by_user(sk)) {
446                                 tcp_v4_mtu_reduced(sk);
447                         } else {
448                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
449                                         sock_hold(sk);
450                         }
451                         goto out;
452                 }
453
454                 err = icmp_err_convert[code].errno;
455                 /* check if icmp_skb allows revert of backoff
456                  * (see draft-zimmermann-tcp-lcd) */
457                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
458                         break;
459                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
460                     !icsk->icsk_backoff || fastopen)
461                         break;
462
463                 if (sock_owned_by_user(sk))
464                         break;
465
466                 icsk->icsk_backoff--;
467                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
468                                                TCP_TIMEOUT_INIT;
469                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
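                /*
                 * Worked example (hypothetical values): with a base RTO of
                 * 300 ms recomputed from srtt and a backoff just reduced from
                 * 3 to 2, inet_csk_rto_backoff() yields
                 * min(300 ms << 2, TCP_RTO_MAX) = 1200 ms.
                 */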
470
471                 skb = tcp_write_queue_head(sk);
472                 BUG_ON(!skb);
473
474                 remaining = icsk->icsk_rto -
475                             min(icsk->icsk_rto,
476                                 tcp_time_stamp - tcp_skb_timestamp(skb));
477
478                 if (remaining) {
479                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
480                                                   remaining, TCP_RTO_MAX);
481                 } else {
482                         /* RTO revert clocked out retransmission.
483                          * Will retransmit now */
484                         tcp_retransmit_timer(sk);
485                 }
486
487                 break;
488         case ICMP_TIME_EXCEEDED:
489                 err = EHOSTUNREACH;
490                 break;
491         default:
492                 goto out;
493         }
494
495         switch (sk->sk_state) {
496         case TCP_SYN_SENT:
497         case TCP_SYN_RECV:
498                 /* Only in fast or simultaneous open. If a fast open socket
499                  * is already accepted, it is treated as a connected one below.
500                  */
501                 if (fastopen && !fastopen->sk)
502                         break;
503
504                 if (!sock_owned_by_user(sk)) {
505                         sk->sk_err = err;
506
507                         sk->sk_error_report(sk);
508
509                         tcp_done(sk);
510                 } else {
511                         sk->sk_err_soft = err;
512                 }
513                 goto out;
514         }
515
516         /* If we've already connected we will keep trying
517          * until we time out, or the user gives up.
518          *
519          * rfc1122 4.2.3.9 allows us to consider as hard errors
520          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
521          * but it is obsoleted by pmtu discovery).
522          *
523          * Note that in the modern internet, where routing is unreliable
524          * and broken firewalls sit in every dark corner sending random
525          * errors ordered by their masters, even these two messages finally
526          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
527          *
528          * Now we are in compliance with RFCs.
529          *                                                      --ANK (980905)
530          */
531
532         inet = inet_sk(sk);
533         if (!sock_owned_by_user(sk) && inet->recverr) {
534                 sk->sk_err = err;
535                 sk->sk_error_report(sk);
536         } else  { /* Only an error on timeout */
537                 sk->sk_err_soft = err;
538         }
539
540 out:
541         bh_unlock_sock(sk);
542         sock_put(sk);
543 }
544
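/*
 * Compute the TCP checksum for an outgoing IPv4 segment.  With
 * CHECKSUM_PARTIAL the device (or a software fallback further down the
 * stack) finishes the job, so only the pseudo-header sum is stored in
 * th->check and csum_start/csum_offset tell the lower layers where to
 * complete it; otherwise the full checksum over header and payload is
 * computed here in software.
 */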
545 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
546 {
547         struct tcphdr *th = tcp_hdr(skb);
548
549         if (skb->ip_summed == CHECKSUM_PARTIAL) {
550                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
551                 skb->csum_start = skb_transport_header(skb) - skb->head;
552                 skb->csum_offset = offsetof(struct tcphdr, check);
553         } else {
554                 th->check = tcp_v4_check(skb->len, saddr, daddr,
555                                          csum_partial(th,
556                                                       th->doff << 2,
557                                                       skb->csum));
558         }
559 }
560
561 /* This routine computes an IPv4 TCP checksum. */
562 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
563 {
564         const struct inet_sock *inet = inet_sk(sk);
565
566         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
567 }
568 EXPORT_SYMBOL(tcp_v4_send_check);
569
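/*
 * tcp_sk_lock comes from the PREEMPT_RT patch set (see the
 * linux/locallock.h include above): it appears in the -rt series to
 * serialise use of the per-CPU net->ipv4.tcp_sk reply sockets in
 * tcp_v4_send_reset() and tcp_v4_send_ack(), which on -rt can no longer
 * rely on softirq processing being implicitly non-preemptible.
 */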
570 static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
571 /*
572  *      This routine will send an RST to the other tcp.
573  *
574  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
575  *                    for the reset?
576  *      Answer: if a packet caused an RST, it is not for a socket
577  *              existing in our system; if it is matched to a socket,
578  *              it is just a duplicate segment or a bug in the other side's TCP.
579  *              So we build the reply based only on the parameters that
580  *              arrived with the segment.
581  *      Exception: precedence violation. We do not implement it in any case.
582  */
583
584 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
585 {
586         const struct tcphdr *th = tcp_hdr(skb);
587         struct {
588                 struct tcphdr th;
589 #ifdef CONFIG_TCP_MD5SIG
590                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
591 #endif
592         } rep;
593         struct ip_reply_arg arg;
594 #ifdef CONFIG_TCP_MD5SIG
595         struct tcp_md5sig_key *key;
596         const __u8 *hash_location = NULL;
597         unsigned char newhash[16];
598         int genhash;
599         struct sock *sk1 = NULL;
600 #endif
601         struct net *net;
602
603         /* Never send a reset in response to a reset. */
604         if (th->rst)
605                 return;
606
607         /* If sk is not NULL, it means we did a successful lookup and the
608          * incoming route had to be correct. prequeue might have dropped our dst.
609          */
610         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
611                 return;
612
613         /* Swap the send and the receive. */
614         memset(&rep, 0, sizeof(rep));
615         rep.th.dest   = th->source;
616         rep.th.source = th->dest;
617         rep.th.doff   = sizeof(struct tcphdr) / 4;
618         rep.th.rst    = 1;
619
620         if (th->ack) {
621                 rep.th.seq = th->ack_seq;
622         } else {
623                 rep.th.ack = 1;
624                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
625                                        skb->len - (th->doff << 2));
626         }
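        /*
         * Example with hypothetical numbers: for an incoming segment with
         * seq = 100, no SYN/FIN and 20 bytes of payload, the RST built above
         * carries ack_seq = 100 + 0 + 0 + 20 = 120, i.e. it acknowledges
         * exactly what the peer sent, as RFC 793 requires for resetting a
         * segment that did not carry an ACK.
         */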
627
628         memset(&arg, 0, sizeof(arg));
629         arg.iov[0].iov_base = (unsigned char *)&rep;
630         arg.iov[0].iov_len  = sizeof(rep.th);
631
632         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
633 #ifdef CONFIG_TCP_MD5SIG
634         hash_location = tcp_parse_md5sig_option(th);
635         if (!sk && hash_location) {
636                 /*
637                  * The active side is lost. Try to find the listening socket
638                  * through the source port, and then find the md5 key through
639                  * the listening socket. We do not lose any security here:
640                  * the incoming packet is checked with the md5 hash of the found
641                  * key; no RST is generated if the md5 hash doesn't match.
642                  */
643                 sk1 = __inet_lookup_listener(net,
644                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
645                                              th->source, ip_hdr(skb)->daddr,
646                                              ntohs(th->source), inet_iif(skb));
647                 /* don't send an RST if we can't find the key */
648                 if (!sk1)
649                         return;
650                 rcu_read_lock();
651                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
652                                         &ip_hdr(skb)->saddr, AF_INET);
653                 if (!key)
654                         goto release_sk1;
655
656                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
657                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
658                         goto release_sk1;
659         } else {
660                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
661                                              &ip_hdr(skb)->saddr,
662                                              AF_INET) : NULL;
663         }
664
665         if (key) {
666                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
667                                    (TCPOPT_NOP << 16) |
668                                    (TCPOPT_MD5SIG << 8) |
669                                    TCPOLEN_MD5SIG);
670                 /* Update length and the length the header thinks exists */
671                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
672                 rep.th.doff = arg.iov[0].iov_len / 4;
673
674                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
675                                      key, ip_hdr(skb)->saddr,
676                                      ip_hdr(skb)->daddr, &rep.th);
677         }
678 #endif
679         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
680                                       ip_hdr(skb)->saddr, /* XXX */
681                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
682         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
683         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
684         /* When the socket is gone, all binding information is lost;
685          * routing might fail in this case. No choice here: if we choose to force
686          * the input interface, we will misroute in the case of an asymmetric route.
687          */
688         if (sk)
689                 arg.bound_dev_if = sk->sk_bound_dev_if;
690
691         arg.tos = ip_hdr(skb)->tos;
692
693         local_lock(tcp_sk_lock);
694         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
695                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
696                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
697                               &arg, arg.iov[0].iov_len);
698         local_unlock(tcp_sk_lock);
699
700         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
701         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
702
703 #ifdef CONFIG_TCP_MD5SIG
704 release_sk1:
705         if (sk1) {
706                 rcu_read_unlock();
707                 sock_put(sk1);
708         }
709 #endif
710 }
711
712 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
713    outside of socket context, is certainly ugly. What can I do?
714  */
715
716 static void tcp_v4_send_ack(struct net *net,
717                             struct sk_buff *skb, u32 seq, u32 ack,
718                             u32 win, u32 tsval, u32 tsecr, int oif,
719                             struct tcp_md5sig_key *key,
720                             int reply_flags, u8 tos)
721 {
722         const struct tcphdr *th = tcp_hdr(skb);
723         struct {
724                 struct tcphdr th;
725                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
726 #ifdef CONFIG_TCP_MD5SIG
727                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
728 #endif
729                         ];
730         } rep;
731         struct ip_reply_arg arg;
732
733         memset(&rep.th, 0, sizeof(struct tcphdr));
734         memset(&arg, 0, sizeof(arg));
735
736         arg.iov[0].iov_base = (unsigned char *)&rep;
737         arg.iov[0].iov_len  = sizeof(rep.th);
738         if (tsecr) {
739                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
740                                    (TCPOPT_TIMESTAMP << 8) |
741                                    TCPOLEN_TIMESTAMP);
742                 rep.opt[1] = htonl(tsval);
743                 rep.opt[2] = htonl(tsecr);
744                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
745         }
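        /*
         * The first option word above packs NOP, NOP, TIMESTAMP kind and
         * length into one 32-bit value: with TCPOPT_NOP = 1,
         * TCPOPT_TIMESTAMP = 8 and TCPOLEN_TIMESTAMP = 10 this is the
         * familiar 0x0101080a seen in packet captures, followed by the
         * TSval and TSecr words.
         */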
746
747         /* Swap the send and the receive. */
748         rep.th.dest    = th->source;
749         rep.th.source  = th->dest;
750         rep.th.doff    = arg.iov[0].iov_len / 4;
751         rep.th.seq     = htonl(seq);
752         rep.th.ack_seq = htonl(ack);
753         rep.th.ack     = 1;
754         rep.th.window  = htons(win);
755
756 #ifdef CONFIG_TCP_MD5SIG
757         if (key) {
758                 int offset = (tsecr) ? 3 : 0;
759
760                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
761                                           (TCPOPT_NOP << 16) |
762                                           (TCPOPT_MD5SIG << 8) |
763                                           TCPOLEN_MD5SIG);
764                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
765                 rep.th.doff = arg.iov[0].iov_len/4;
766
767                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
768                                     key, ip_hdr(skb)->saddr,
769                                     ip_hdr(skb)->daddr, &rep.th);
770         }
771 #endif
772         arg.flags = reply_flags;
773         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774                                       ip_hdr(skb)->saddr, /* XXX */
775                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
776         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777         if (oif)
778                 arg.bound_dev_if = oif;
779         arg.tos = tos;
780         local_lock(tcp_sk_lock);
781         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
782                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
783                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
784                               &arg, arg.iov[0].iov_len);
785         local_unlock(tcp_sk_lock);
786
787         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
788 }
789
790 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
791 {
792         struct inet_timewait_sock *tw = inet_twsk(sk);
793         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
794
795         tcp_v4_send_ack(sock_net(sk), skb,
796                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
797                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
798                         tcp_time_stamp + tcptw->tw_ts_offset,
799                         tcptw->tw_ts_recent,
800                         tw->tw_bound_dev_if,
801                         tcp_twsk_md5_key(tcptw),
802                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
803                         tw->tw_tos
804                         );
805
806         inet_twsk_put(tw);
807 }
808
809 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
810                                   struct request_sock *req)
811 {
812         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
813          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
814          */
815         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
816                                              tcp_sk(sk)->snd_nxt;
817
818         /* RFC 7323 2.3
819          * The window field (SEG.WND) of every outgoing segment, with the
820          * exception of <SYN> segments, MUST be right-shifted by
821          * Rcv.Wind.Shift bits:
822          */
823         tcp_v4_send_ack(sock_net(sk), skb, seq,
824                         tcp_rsk(req)->rcv_nxt,
825                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
826                         tcp_time_stamp,
827                         req->ts_recent,
828                         0,
829                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
830                                           AF_INET),
831                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
832                         ip_hdr(skb)->tos);
833 }
834
835 /*
836  *      Send a SYN-ACK after having received a SYN.
837  *      This still operates on a request_sock only, not on a big
838  *      socket.
839  */
840 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
841                               struct flowi *fl,
842                               struct request_sock *req,
843                               struct tcp_fastopen_cookie *foc,
844                                   bool attach_req)
845 {
846         const struct inet_request_sock *ireq = inet_rsk(req);
847         struct flowi4 fl4;
848         int err = -1;
849         struct sk_buff *skb;
850
851         /* First, grab a route. */
852         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
853                 return -1;
854
855         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
856
857         if (skb) {
858                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
859
860                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
861                                             ireq->ir_rmt_addr,
862                                             ireq->opt);
863                 err = net_xmit_eval(err);
864         }
865
866         return err;
867 }
868
869 /*
870  *      IPv4 request_sock destructor.
871  */
872 static void tcp_v4_reqsk_destructor(struct request_sock *req)
873 {
874         kfree(inet_rsk(req)->opt);
875 }
876
877
878 #ifdef CONFIG_TCP_MD5SIG
879 /*
880  * RFC2385 MD5 checksumming requires a mapping of
881  * IP address->MD5 Key.
882  * We need to maintain these in the sk structure.
883  */
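/*
 * Keys are installed from user space with setsockopt(IPPROTO_TCP,
 * TCP_MD5SIG); tcp_v4_parse_md5_keys() below translates that request into
 * tcp_md5_do_add()/tcp_md5_do_del() calls on this per-socket list.
 */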
884
885 /* Find the Key structure for an address.  */
886 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
887                                          const union tcp_md5_addr *addr,
888                                          int family)
889 {
890         const struct tcp_sock *tp = tcp_sk(sk);
891         struct tcp_md5sig_key *key;
892         unsigned int size = sizeof(struct in_addr);
893         const struct tcp_md5sig_info *md5sig;
894
895         /* caller either holds rcu_read_lock() or socket lock */
896         md5sig = rcu_dereference_check(tp->md5sig_info,
897                                        sock_owned_by_user(sk) ||
898                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
899         if (!md5sig)
900                 return NULL;
901 #if IS_ENABLED(CONFIG_IPV6)
902         if (family == AF_INET6)
903                 size = sizeof(struct in6_addr);
904 #endif
905         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
906                 if (key->family != family)
907                         continue;
908                 if (!memcmp(&key->addr, addr, size))
909                         return key;
910         }
911         return NULL;
912 }
913 EXPORT_SYMBOL(tcp_md5_do_lookup);
914
915 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
916                                          const struct sock *addr_sk)
917 {
918         const union tcp_md5_addr *addr;
919
920         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
921         return tcp_md5_do_lookup(sk, addr, AF_INET);
922 }
923 EXPORT_SYMBOL(tcp_v4_md5_lookup);
924
925 /* This can be called on a newly created socket, from other files */
926 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
927                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
928 {
929         /* Add Key to the list */
930         struct tcp_md5sig_key *key;
931         struct tcp_sock *tp = tcp_sk(sk);
932         struct tcp_md5sig_info *md5sig;
933
934         key = tcp_md5_do_lookup(sk, addr, family);
935         if (key) {
936                 /* Pre-existing entry - just update that one. */
937                 memcpy(key->key, newkey, newkeylen);
938                 key->keylen = newkeylen;
939                 return 0;
940         }
941
942         md5sig = rcu_dereference_protected(tp->md5sig_info,
943                                            sock_owned_by_user(sk) ||
944                                            lockdep_is_held(&sk->sk_lock.slock));
945         if (!md5sig) {
946                 md5sig = kmalloc(sizeof(*md5sig), gfp);
947                 if (!md5sig)
948                         return -ENOMEM;
949
950                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
951                 INIT_HLIST_HEAD(&md5sig->head);
952                 rcu_assign_pointer(tp->md5sig_info, md5sig);
953         }
954
955         key = sock_kmalloc(sk, sizeof(*key), gfp);
956         if (!key)
957                 return -ENOMEM;
958         if (!tcp_alloc_md5sig_pool()) {
959                 sock_kfree_s(sk, key, sizeof(*key));
960                 return -ENOMEM;
961         }
962
963         memcpy(key->key, newkey, newkeylen);
964         key->keylen = newkeylen;
965         key->family = family;
966         memcpy(&key->addr, addr,
967                (family == AF_INET6) ? sizeof(struct in6_addr) :
968                                       sizeof(struct in_addr));
969         hlist_add_head_rcu(&key->node, &md5sig->head);
970         return 0;
971 }
972 EXPORT_SYMBOL(tcp_md5_do_add);
973
974 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
975 {
976         struct tcp_md5sig_key *key;
977
978         key = tcp_md5_do_lookup(sk, addr, family);
979         if (!key)
980                 return -ENOENT;
981         hlist_del_rcu(&key->node);
982         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
983         kfree_rcu(key, rcu);
984         return 0;
985 }
986 EXPORT_SYMBOL(tcp_md5_do_del);
987
988 static void tcp_clear_md5_list(struct sock *sk)
989 {
990         struct tcp_sock *tp = tcp_sk(sk);
991         struct tcp_md5sig_key *key;
992         struct hlist_node *n;
993         struct tcp_md5sig_info *md5sig;
994
995         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
996
997         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
998                 hlist_del_rcu(&key->node);
999                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1000                 kfree_rcu(key, rcu);
1001         }
1002 }
1003
1004 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1005                                  int optlen)
1006 {
1007         struct tcp_md5sig cmd;
1008         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1009
1010         if (optlen < sizeof(cmd))
1011                 return -EINVAL;
1012
1013         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1014                 return -EFAULT;
1015
1016         if (sin->sin_family != AF_INET)
1017                 return -EINVAL;
1018
1019         if (!cmd.tcpm_keylen)
1020                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1021                                       AF_INET);
1022
1023         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1024                 return -EINVAL;
1025
1026         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1027                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1028                               GFP_KERNEL);
1029 }
1030
1031 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1032                                         __be32 daddr, __be32 saddr, int nbytes)
1033 {
1034         struct tcp4_pseudohdr *bp;
1035         struct scatterlist sg;
1036
1037         bp = &hp->md5_blk.ip4;
1038
1039         /*
1040          * 1. the TCP pseudo-header (in the order: source IP address,
1041          * destination IP address, zero-padded protocol number, and
1042          * segment length)
1043          */
1044         bp->saddr = saddr;
1045         bp->daddr = daddr;
1046         bp->pad = 0;
1047         bp->protocol = IPPROTO_TCP;
1048         bp->len = cpu_to_be16(nbytes);
1049
1050         sg_init_one(&sg, bp, sizeof(*bp));
1051         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1052 }
1053
1054 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1055                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1056 {
1057         struct tcp_md5sig_pool *hp;
1058         struct hash_desc *desc;
1059
1060         hp = tcp_get_md5sig_pool();
1061         if (!hp)
1062                 goto clear_hash_noput;
1063         desc = &hp->md5_desc;
1064
1065         if (crypto_hash_init(desc))
1066                 goto clear_hash;
1067         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1068                 goto clear_hash;
1069         if (tcp_md5_hash_header(hp, th))
1070                 goto clear_hash;
1071         if (tcp_md5_hash_key(hp, key))
1072                 goto clear_hash;
1073         if (crypto_hash_final(desc, md5_hash))
1074                 goto clear_hash;
1075
1076         tcp_put_md5sig_pool();
1077         return 0;
1078
1079 clear_hash:
1080         tcp_put_md5sig_pool();
1081 clear_hash_noput:
1082         memset(md5_hash, 0, 16);
1083         return 1;
1084 }
1085
1086 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1087                         const struct sock *sk,
1088                         const struct sk_buff *skb)
1089 {
1090         struct tcp_md5sig_pool *hp;
1091         struct hash_desc *desc;
1092         const struct tcphdr *th = tcp_hdr(skb);
1093         __be32 saddr, daddr;
1094
1095         if (sk) { /* valid for establish/request sockets */
1096                 saddr = sk->sk_rcv_saddr;
1097                 daddr = sk->sk_daddr;
1098         } else {
1099                 const struct iphdr *iph = ip_hdr(skb);
1100                 saddr = iph->saddr;
1101                 daddr = iph->daddr;
1102         }
1103
1104         hp = tcp_get_md5sig_pool();
1105         if (!hp)
1106                 goto clear_hash_noput;
1107         desc = &hp->md5_desc;
1108
1109         if (crypto_hash_init(desc))
1110                 goto clear_hash;
1111
1112         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1113                 goto clear_hash;
1114         if (tcp_md5_hash_header(hp, th))
1115                 goto clear_hash;
1116         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1117                 goto clear_hash;
1118         if (tcp_md5_hash_key(hp, key))
1119                 goto clear_hash;
1120         if (crypto_hash_final(desc, md5_hash))
1121                 goto clear_hash;
1122
1123         tcp_put_md5sig_pool();
1124         return 0;
1125
1126 clear_hash:
1127         tcp_put_md5sig_pool();
1128 clear_hash_noput:
1129         memset(md5_hash, 0, 16);
1130         return 1;
1131 }
1132 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1133
1134 #endif
1135
1136 /* Called with rcu_read_lock() */
1137 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1138                                     const struct sk_buff *skb)
1139 {
1140 #ifdef CONFIG_TCP_MD5SIG
1141         /*
1142          * This gets called for each TCP segment that arrives
1143          * so we want to be efficient.
1144          * We have 3 drop cases:
1145          * o No MD5 hash and one expected.
1146          * o MD5 hash and we're not expecting one.
1147          * o MD5 hash and it's wrong.
1148          */
1149         const __u8 *hash_location = NULL;
1150         struct tcp_md5sig_key *hash_expected;
1151         const struct iphdr *iph = ip_hdr(skb);
1152         const struct tcphdr *th = tcp_hdr(skb);
1153         int genhash;
1154         unsigned char newhash[16];
1155
1156         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1157                                           AF_INET);
1158         hash_location = tcp_parse_md5sig_option(th);
1159
1160         /* We've parsed the options - do we have a hash? */
1161         if (!hash_expected && !hash_location)
1162                 return false;
1163
1164         if (hash_expected && !hash_location) {
1165                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1166                 return true;
1167         }
1168
1169         if (!hash_expected && hash_location) {
1170                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1171                 return true;
1172         }
1173
1174         /* Okay, so this is hash_expected and hash_location -
1175          * so we need to calculate the checksum.
1176          */
1177         genhash = tcp_v4_md5_hash_skb(newhash,
1178                                       hash_expected,
1179                                       NULL, skb);
1180
1181         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1182                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1183                                      &iph->saddr, ntohs(th->source),
1184                                      &iph->daddr, ntohs(th->dest),
1185                                      genhash ? " tcp_v4_calc_md5_hash failed"
1186                                      : "");
1187                 return true;
1188         }
1189         return false;
1190 #endif
1191         return false;
1192 }
1193
1194 static void tcp_v4_init_req(struct request_sock *req,
1195                             const struct sock *sk_listener,
1196                             struct sk_buff *skb)
1197 {
1198         struct inet_request_sock *ireq = inet_rsk(req);
1199
1200         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1201         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1202         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1203         ireq->opt = tcp_v4_save_options(skb);
1204 }
1205
1206 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1207                                           struct flowi *fl,
1208                                           const struct request_sock *req,
1209                                           bool *strict)
1210 {
1211         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1212
1213         if (strict) {
1214                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1215                         *strict = true;
1216                 else
1217                         *strict = false;
1218         }
1219
1220         return dst;
1221 }
1222
1223 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1224         .family         =       PF_INET,
1225         .obj_size       =       sizeof(struct tcp_request_sock),
1226         .rtx_syn_ack    =       tcp_rtx_synack,
1227         .send_ack       =       tcp_v4_reqsk_send_ack,
1228         .destructor     =       tcp_v4_reqsk_destructor,
1229         .send_reset     =       tcp_v4_send_reset,
1230         .syn_ack_timeout =      tcp_syn_ack_timeout,
1231 };
1232
1233 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1234         .mss_clamp      =       TCP_MSS_DEFAULT,
1235 #ifdef CONFIG_TCP_MD5SIG
1236         .req_md5_lookup =       tcp_v4_md5_lookup,
1237         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1238 #endif
1239         .init_req       =       tcp_v4_init_req,
1240 #ifdef CONFIG_SYN_COOKIES
1241         .cookie_init_seq =      cookie_v4_init_sequence,
1242 #endif
1243         .route_req      =       tcp_v4_route_req,
1244         .init_seq       =       tcp_v4_init_sequence,
1245         .send_synack    =       tcp_v4_send_synack,
1246 };
1247
1248 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1249 {
1250         /* Never answer SYNs sent to broadcast or multicast */
1251         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1252                 goto drop;
1253
1254         return tcp_conn_request(&tcp_request_sock_ops,
1255                                 &tcp_request_sock_ipv4_ops, sk, skb);
1256
1257 drop:
1258         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1259         return 0;
1260 }
1261 EXPORT_SYMBOL(tcp_v4_conn_request);
1262
1263
1264 /*
1265  * The three way handshake has completed - we got a valid synack -
1266  * now create the new socket.
1267  */
1268 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1269                                   struct request_sock *req,
1270                                   struct dst_entry *dst,
1271                                   struct request_sock *req_unhash,
1272                                   bool *own_req)
1273 {
1274         struct inet_request_sock *ireq;
1275         struct inet_sock *newinet;
1276         struct tcp_sock *newtp;
1277         struct sock *newsk;
1278 #ifdef CONFIG_TCP_MD5SIG
1279         struct tcp_md5sig_key *key;
1280 #endif
1281         struct ip_options_rcu *inet_opt;
1282
1283         if (sk_acceptq_is_full(sk))
1284                 goto exit_overflow;
1285
1286         newsk = tcp_create_openreq_child(sk, req, skb);
1287         if (!newsk)
1288                 goto exit_nonewsk;
1289
1290         newsk->sk_gso_type = SKB_GSO_TCPV4;
1291         inet_sk_rx_dst_set(newsk, skb);
1292
1293         newtp                 = tcp_sk(newsk);
1294         newinet               = inet_sk(newsk);
1295         ireq                  = inet_rsk(req);
1296         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1297         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1298         newinet->inet_saddr           = ireq->ir_loc_addr;
1299         inet_opt              = ireq->opt;
1300         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1301         ireq->opt             = NULL;
1302         newinet->mc_index     = inet_iif(skb);
1303         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1304         newinet->rcv_tos      = ip_hdr(skb)->tos;
1305         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1306         if (inet_opt)
1307                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1308         newinet->inet_id = newtp->write_seq ^ jiffies;
1309
1310         if (!dst) {
1311                 dst = inet_csk_route_child_sock(sk, newsk, req);
1312                 if (!dst)
1313                         goto put_and_exit;
1314         } else {
1315                 /* syncookie case : see end of cookie_v4_check() */
1316         }
1317         sk_setup_caps(newsk, dst);
1318
1319         tcp_ca_openreq_child(newsk, dst);
1320
1321         tcp_sync_mss(newsk, dst_mtu(dst));
1322         newtp->advmss = dst_metric_advmss(dst);
1323         if (tcp_sk(sk)->rx_opt.user_mss &&
1324             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1325                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1326
1327         tcp_initialize_rcv_mss(newsk);
1328
1329 #ifdef CONFIG_TCP_MD5SIG
1330         /* Copy over the MD5 key from the original socket */
1331         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1332                                 AF_INET);
1333         if (key) {
1334                 /*
1335                  * We're using one, so create a matching key
1336                  * on the newsk structure. If we fail to get
1337                  * memory, then we end up not copying the key
1338                  * across. Shucks.
1339                  */
1340                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1341                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1342                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1343         }
1344 #endif
1345
1346         if (__inet_inherit_port(sk, newsk) < 0)
1347                 goto put_and_exit;
1348         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1349         if (*own_req)
1350                 tcp_move_syn(newtp, req);
1351
1352         return newsk;
1353
1354 exit_overflow:
1355         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1356 exit_nonewsk:
1357         dst_release(dst);
1358 exit:
1359         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1360         return NULL;
1361 put_and_exit:
1362         inet_csk_prepare_forced_close(newsk);
1363         tcp_done(newsk);
1364         goto exit;
1365 }
1366 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1367
1368 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1369 {
1370 #ifdef CONFIG_SYN_COOKIES
1371         const struct tcphdr *th = tcp_hdr(skb);
1372
1373         if (!th->syn)
1374                 sk = cookie_v4_check(sk, skb);
1375 #endif
1376         return sk;
1377 }
1378
1379 /* The socket must have its spinlock held when we get
1380  * here, unless it is a TCP_LISTEN socket.
1381  *
1382  * We have a potential double-lock case here, so even when
1383  * doing backlog processing we use the BH locking scheme.
1384  * This is because we cannot sleep with the original spinlock
1385  * held.
1386  */
1387 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1388 {
1389         struct sock *rsk;
1390
1391         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1392                 struct dst_entry *dst = sk->sk_rx_dst;
1393
1394                 sock_rps_save_rxhash(sk, skb);
1395                 sk_mark_napi_id(sk, skb);
1396                 if (dst) {
1397                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1398                             !dst->ops->check(dst, 0)) {
1399                                 dst_release(dst);
1400                                 sk->sk_rx_dst = NULL;
1401                         }
1402                 }
1403                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1404                 return 0;
1405         }
1406
1407         if (tcp_checksum_complete(skb))
1408                 goto csum_err;
1409
1410         if (sk->sk_state == TCP_LISTEN) {
1411                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1412
1413                 if (!nsk)
1414                         goto discard;
1415                 if (nsk != sk) {
1416                         sock_rps_save_rxhash(nsk, skb);
1417                         sk_mark_napi_id(nsk, skb);
1418                         if (tcp_child_process(sk, nsk, skb)) {
1419                                 rsk = nsk;
1420                                 goto reset;
1421                         }
1422                         return 0;
1423                 }
1424         } else
1425                 sock_rps_save_rxhash(sk, skb);
1426
1427         if (tcp_rcv_state_process(sk, skb)) {
1428                 rsk = sk;
1429                 goto reset;
1430         }
1431         return 0;
1432
1433 reset:
1434         tcp_v4_send_reset(rsk, skb);
1435 discard:
1436         kfree_skb(skb);
1437         /* Be careful here. If this function gets more complicated and
1438          * gcc suffers from register pressure on the x86, sk (in %ebx)
1439          * might be destroyed here. This current version compiles correctly,
1440          * but you have been warned.
1441          */
1442         return 0;
1443
1444 csum_err:
1445         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1446         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1447         goto discard;
1448 }
1449 EXPORT_SYMBOL(tcp_v4_do_rcv);
1450
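     /* Early demux, called from the IP receive path before routing: look up an
      * established socket by the 4-tuple and, on a hit, attach it to the skb
      * together with its (revalidated) cached input route, so the main receive
      * path can skip a second socket lookup and routing decision.
      */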
1451 void tcp_v4_early_demux(struct sk_buff *skb)
1452 {
1453         const struct iphdr *iph;
1454         const struct tcphdr *th;
1455         struct sock *sk;
1456
1457         if (skb->pkt_type != PACKET_HOST)
1458                 return;
1459
1460         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1461                 return;
1462
1463         iph = ip_hdr(skb);
1464         th = tcp_hdr(skb);
1465
1466         if (th->doff < sizeof(struct tcphdr) / 4)
1467                 return;
1468
1469         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1470                                        iph->saddr, th->source,
1471                                        iph->daddr, ntohs(th->dest),
1472                                        skb->skb_iif);
1473         if (sk) {
1474                 skb->sk = sk;
1475                 skb->destructor = sock_edemux;
1476                 if (sk_fullsock(sk)) {
1477                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1478
1479                         if (dst)
1480                                 dst = dst_check(dst, 0);
1481                         if (dst &&
1482                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1483                                 skb_dst_set_noref(skb, dst);
1484                 }
1485         }
1486 }
1487
1488 /* The packet is added to the VJ-style prequeue for processing in process
1489  * context, if a reader task is waiting. Apparently, this exciting
1490  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1491  * failed somewhere. Latency? Burstiness? Well, at least now we will
1492  * see why it failed. 8)8)                               --ANK
1493  *
1494  */
1495 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1496 {
1497         struct tcp_sock *tp = tcp_sk(sk);
1498
1499         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1500                 return false;
1501
1502         if (skb->len <= tcp_hdrlen(skb) &&
1503             skb_queue_len(&tp->ucopy.prequeue) == 0)
1504                 return false;
1505
1506         /* Before escaping the RCU-protected region, we need to take care of
1507          * the skb dst. Prequeue is only enabled for established sockets.
1508          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1509          * Instead of doing a full sk_rx_dst validity check here, let's perform
1510          * an optimistic check.
1511          */
1512         if (likely(sk->sk_rx_dst))
1513                 skb_dst_drop(skb);
1514         else
1515                 skb_dst_force_safe(skb);
1516
1517         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1518         tp->ucopy.memory += skb->truesize;
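             /* If the prequeue has outgrown the receive buffer, drain it through
              * the backlog receive handler right away; otherwise, on the first
              * queued skb, wake the waiting reader and arm a shortened delayed-ACK
              * timer so the peer is not left waiting too long for an ACK.
              */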
1519         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1520                 struct sk_buff *skb1;
1521
1522                 BUG_ON(sock_owned_by_user(sk));
1523
1524                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1525                         sk_backlog_rcv(sk, skb1);
1526                         NET_INC_STATS_BH(sock_net(sk),
1527                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1528                 }
1529
1530                 tp->ucopy.memory = 0;
1531         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1532                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1533                                            POLLIN | POLLRDNORM | POLLRDBAND);
1534                 if (!inet_csk_ack_scheduled(sk))
1535                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1536                                                   (3 * tcp_rto_min(sk)) / 4,
1537                                                   TCP_RTO_MAX);
1538         }
1539         return true;
1540 }
1541 EXPORT_SYMBOL(tcp_prequeue);
1542
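     /* Run the attached socket filter (if any) over the segment. The filter may
      * trim payload but never below the TCP header (the th->doff * 4 cap); any
      * trimmed bytes are subtracted from end_seq so sequence accounting still
      * matches the skb.
      */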
1543 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1544 {
1545         struct tcphdr *th = (struct tcphdr *)skb->data;
1546         unsigned int eaten = skb->len;
1547         int err;
1548
1549         err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1550         if (!err) {
1551                 eaten -= skb->len;
1552                 TCP_SKB_CB(skb)->end_seq -= eaten;
1553         }
1554         return err;
1555 }
1556 EXPORT_SYMBOL(tcp_filter);
1557
1558 /*
1559  *      From tcp_input.c
1560  */
1561
1562 int tcp_v4_rcv(struct sk_buff *skb)
1563 {
1564         const struct iphdr *iph;
1565         const struct tcphdr *th;
1566         struct sock *sk;
1567         int ret;
1568         struct net *net = dev_net(skb->dev);
1569
1570         if (skb->pkt_type != PACKET_HOST)
1571                 goto discard_it;
1572
1573         /* Count it even if it's bad */
1574         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1575
1576         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1577                 goto discard_it;
1578
1579         th = tcp_hdr(skb);
1580
1581         if (th->doff < sizeof(struct tcphdr) / 4)
1582                 goto bad_packet;
1583         if (!pskb_may_pull(skb, th->doff * 4))
1584                 goto discard_it;
1585
1586         /* An explanation is required here, I think.
1587          * Packet length and doff are validated by header prediction,
1588          * provided the case of th->doff==0 is eliminated.
1589          * So, we defer the checks. */
1590
1591         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1592                 goto csum_error;
1593
1594         th = tcp_hdr(skb);
1595         iph = ip_hdr(skb);
1596         /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1597          * barrier() makes sure the compiler won't play fool^Waliasing games.
1598          */
1599         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1600                 sizeof(struct inet_skb_parm));
1601         barrier();
1602
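             /* Fill in the TCP control block. end_seq covers the payload plus one
              * sequence number each for SYN and FIN, since both flags consume
              * sequence space.
              */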
1603         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1604         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1605                                     skb->len - th->doff * 4);
1606         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1607         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1608         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1609         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1610         TCP_SKB_CB(skb)->sacked  = 0;
1611
1612 lookup:
1613         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1614         if (!sk)
1615                 goto no_tcp_socket;
1616
1617 process:
1618         if (sk->sk_state == TCP_TIME_WAIT)
1619                 goto do_time_wait;
1620
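             /* The lookup hit a request (mini-)socket stored in the established
              * hash: check the segment against its listener (MD5 signature,
              * listener still in LISTEN state) and let tcp_check_req() either
              * complete the handshake with a child socket or tell us to drop.
              */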
1621         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1622                 struct request_sock *req = inet_reqsk(sk);
1623                 struct sock *nsk;
1624
1625                 sk = req->rsk_listener;
1626                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1627                         reqsk_put(req);
1628                         goto discard_it;
1629                 }
1630                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1631                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1632                         goto lookup;
1633                 }
1634                 sock_hold(sk);
1635                 nsk = tcp_check_req(sk, skb, req, false);
1636                 if (!nsk) {
1637                         reqsk_put(req);
1638                         goto discard_and_relse;
1639                 }
1640                 if (nsk == sk) {
1641                         reqsk_put(req);
1642                 } else if (tcp_child_process(sk, nsk, skb)) {
1643                         tcp_v4_send_reset(nsk, skb);
1644                         goto discard_and_relse;
1645                 } else {
1646                         sock_put(sk);
1647                         return 0;
1648                 }
1649         }
1650         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1651                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1652                 goto discard_and_relse;
1653         }
1654
1655         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1656                 goto discard_and_relse;
1657
1658         if (tcp_v4_inbound_md5_hash(sk, skb))
1659                 goto discard_and_relse;
1660
1661         nf_reset(skb);
1662
1663         if (tcp_filter(sk, skb))
1664                 goto discard_and_relse;
1665         th = (const struct tcphdr *)skb->data;
1666         iph = ip_hdr(skb);
1667
1668         skb->dev = NULL;
1669
1670         if (sk->sk_state == TCP_LISTEN) {
1671                 ret = tcp_v4_do_rcv(sk, skb);
1672                 goto put_and_return;
1673         }
1674
1675         sk_incoming_cpu_update(sk);
1676
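             /* Take the socket spinlock in BH context. If no process owns the
              * socket, try the prequeue first and fall back to tcp_v4_do_rcv();
              * if a process does own it, park the skb on the backlog (bounded by
              * rcvbuf + sndbuf) to be processed when the lock is released.
              */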
1677         bh_lock_sock_nested(sk);
1678         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1679         ret = 0;
1680         if (!sock_owned_by_user(sk)) {
1681                 if (!tcp_prequeue(sk, skb))
1682                         ret = tcp_v4_do_rcv(sk, skb);
1683         } else if (unlikely(sk_add_backlog(sk, skb,
1684                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1685                 bh_unlock_sock(sk);
1686                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1687                 goto discard_and_relse;
1688         }
1689         bh_unlock_sock(sk);
1690
1691 put_and_return:
1692         sock_put(sk);
1693
1694         return ret;
1695
1696 no_tcp_socket:
1697         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1698                 goto discard_it;
1699
1700         if (tcp_checksum_complete(skb)) {
1701 csum_error:
1702                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1703 bad_packet:
1704                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1705         } else {
1706                 tcp_v4_send_reset(NULL, skb);
1707         }
1708
1709 discard_it:
1710         /* Discard frame. */
1711         kfree_skb(skb);
1712         return 0;
1713
1714 discard_and_relse:
1715         sock_put(sk);
1716         goto discard_it;
1717
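     /* The segment matched a TIME_WAIT mini-socket. A valid new SYN may be handed
      * over to a current listener (TCP_TW_SYN); otherwise the timewait code
      * decides whether to ACK, reset or silently drop the segment.
      */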
1718 do_time_wait:
1719         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1720                 inet_twsk_put(inet_twsk(sk));
1721                 goto discard_it;
1722         }
1723
1724         if (tcp_checksum_complete(skb)) {
1725                 inet_twsk_put(inet_twsk(sk));
1726                 goto csum_error;
1727         }
1728         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1729         case TCP_TW_SYN: {
1730                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1731                                                         &tcp_hashinfo,
1732                                                         iph->saddr, th->source,
1733                                                         iph->daddr, th->dest,
1734                                                         inet_iif(skb));
1735                 if (sk2) {
1736                         inet_twsk_deschedule_put(inet_twsk(sk));
1737                         sk = sk2;
1738                         goto process;
1739                 }
1740                 /* Fall through to ACK */
1741         }
1742         case TCP_TW_ACK:
1743                 tcp_v4_timewait_ack(sk, skb);
1744                 break;
1745         case TCP_TW_RST:
1746                 goto no_tcp_socket;
1747         case TCP_TW_SUCCESS:;
1748         }
1749         goto discard_it;
1750 }
1751
1752 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1753         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1754         .twsk_unique    = tcp_twsk_unique,
1755         .twsk_destructor= tcp_twsk_destructor,
1756 };
1757
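     /* Cache the input route on the socket along with the interface it was
      * learned on; the receive paths above revalidate this (ifindex, dst) pair
      * before trusting it.
      */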
1758 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1759 {
1760         struct dst_entry *dst = skb_dst(skb);
1761
1762         if (dst && dst_hold_safe(dst)) {
1763                 sk->sk_rx_dst = dst;
1764                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1765         }
1766 }
1767 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1768
1769 const struct inet_connection_sock_af_ops ipv4_specific = {
1770         .queue_xmit        = ip_queue_xmit,
1771         .send_check        = tcp_v4_send_check,
1772         .rebuild_header    = inet_sk_rebuild_header,
1773         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1774         .conn_request      = tcp_v4_conn_request,
1775         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1776         .net_header_len    = sizeof(struct iphdr),
1777         .setsockopt        = ip_setsockopt,
1778         .getsockopt        = ip_getsockopt,
1779         .addr2sockaddr     = inet_csk_addr2sockaddr,
1780         .sockaddr_len      = sizeof(struct sockaddr_in),
1781         .bind_conflict     = inet_csk_bind_conflict,
1782 #ifdef CONFIG_COMPAT
1783         .compat_setsockopt = compat_ip_setsockopt,
1784         .compat_getsockopt = compat_ip_getsockopt,
1785 #endif
1786         .mtu_reduced       = tcp_v4_mtu_reduced,
1787 };
1788 EXPORT_SYMBOL(ipv4_specific);
1789
1790 #ifdef CONFIG_TCP_MD5SIG
1791 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1792         .md5_lookup             = tcp_v4_md5_lookup,
1793         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1794         .md5_parse              = tcp_v4_parse_md5_keys,
1795 };
1796 #endif
1797
1798 /* NOTE: A lot of things are set to zero explicitly by the call to
1799  *       sk_alloc(), so they need not be done here.
1800  */
1801 static int tcp_v4_init_sock(struct sock *sk)
1802 {
1803         struct inet_connection_sock *icsk = inet_csk(sk);
1804
1805         tcp_init_sock(sk);
1806
1807         icsk->icsk_af_ops = &ipv4_specific;
1808
1809 #ifdef CONFIG_TCP_MD5SIG
1810         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1811 #endif
1812
1813         return 0;
1814 }
1815
1816 void tcp_v4_destroy_sock(struct sock *sk)
1817 {
1818         struct tcp_sock *tp = tcp_sk(sk);
1819
1820         tcp_clear_xmit_timers(sk);
1821
1822         tcp_cleanup_congestion_control(sk);
1823
1824         /* Clean up the write buffer. */
1825         tcp_write_queue_purge(sk);
1826
1827         /* Cleans up our, hopefully empty, out_of_order_queue. */
1828         __skb_queue_purge(&tp->out_of_order_queue);
1829
1830 #ifdef CONFIG_TCP_MD5SIG
1831         /* Clean up the MD5 key list, if any */
1832         if (tp->md5sig_info) {
1833                 tcp_clear_md5_list(sk);
1834                 kfree_rcu(tp->md5sig_info, rcu);
1835                 tp->md5sig_info = NULL;
1836         }
1837 #endif
1838
1839         /* Clean up the prequeue; it really should be empty. */
1840         __skb_queue_purge(&tp->ucopy.prequeue);
1841
1842         /* Clean up a referenced TCP bind bucket. */
1843         if (inet_csk(sk)->icsk_bind_hash)
1844                 inet_put_port(sk);
1845
1846         BUG_ON(tp->fastopen_rsk);
1847
1848         /* If socket is aborted during connect operation */
1849         tcp_free_fastopen_req(tp);
1850         tcp_saved_syn_free(tp);
1851
1852         sk_sockets_allocated_dec(sk);
1853         sock_release_memcg(sk);
1854 }
1855 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1856
1857 #ifdef CONFIG_PROC_FS
1858 /* Proc filesystem TCP sock list dumping. */
1859
1860 /*
1861  * Get the next listener socket following cur.  If cur is NULL, get the first socket
1862  * starting from the bucket given in st->bucket; when st->bucket is zero the
1863  * very first socket in the hash table is returned.
1864  */
1865 static void *listening_get_next(struct seq_file *seq, void *cur)
1866 {
1867         struct inet_connection_sock *icsk;
1868         struct hlist_nulls_node *node;
1869         struct sock *sk = cur;
1870         struct inet_listen_hashbucket *ilb;
1871         struct tcp_iter_state *st = seq->private;
1872         struct net *net = seq_file_net(seq);
1873
1874         if (!sk) {
1875                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1876                 spin_lock_bh(&ilb->lock);
1877                 sk = sk_nulls_head(&ilb->head);
1878                 st->offset = 0;
1879                 goto get_sk;
1880         }
1881         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1882         ++st->num;
1883         ++st->offset;
1884
1885         sk = sk_nulls_next(sk);
1886 get_sk:
1887         sk_nulls_for_each_from(sk, node) {
1888                 if (!net_eq(sock_net(sk), net))
1889                         continue;
1890                 if (sk->sk_family == st->family) {
1891                         cur = sk;
1892                         goto out;
1893                 }
1894                 icsk = inet_csk(sk);
1895         }
1896         spin_unlock_bh(&ilb->lock);
1897         st->offset = 0;
1898         if (++st->bucket < INET_LHTABLE_SIZE) {
1899                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1900                 spin_lock_bh(&ilb->lock);
1901                 sk = sk_nulls_head(&ilb->head);
1902                 goto get_sk;
1903         }
1904         cur = NULL;
1905 out:
1906         return cur;
1907 }
1908
1909 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1910 {
1911         struct tcp_iter_state *st = seq->private;
1912         void *rc;
1913
1914         st->bucket = 0;
1915         st->offset = 0;
1916         rc = listening_get_next(seq, NULL);
1917
1918         while (rc && *pos) {
1919                 rc = listening_get_next(seq, rc);
1920                 --*pos;
1921         }
1922         return rc;
1923 }
1924
1925 static inline bool empty_bucket(const struct tcp_iter_state *st)
1926 {
1927         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1928 }
1929
1930 /*
1931  * Get the first established socket, starting from the bucket given in st->bucket.
1932  * If st->bucket is zero, the very first socket in the hash is returned.
1933  */
1934 static void *established_get_first(struct seq_file *seq)
1935 {
1936         struct tcp_iter_state *st = seq->private;
1937         struct net *net = seq_file_net(seq);
1938         void *rc = NULL;
1939
1940         st->offset = 0;
1941         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1942                 struct sock *sk;
1943                 struct hlist_nulls_node *node;
1944                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1945
1946                 /* Lockless fast path for the common case of empty buckets */
1947                 if (empty_bucket(st))
1948                         continue;
1949
1950                 spin_lock_bh(lock);
1951                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1952                         if (sk->sk_family != st->family ||
1953                             !net_eq(sock_net(sk), net)) {
1954                                 continue;
1955                         }
1956                         rc = sk;
1957                         goto out;
1958                 }
1959                 spin_unlock_bh(lock);
1960         }
1961 out:
1962         return rc;
1963 }
1964
1965 static void *established_get_next(struct seq_file *seq, void *cur)
1966 {
1967         struct sock *sk = cur;
1968         struct hlist_nulls_node *node;
1969         struct tcp_iter_state *st = seq->private;
1970         struct net *net = seq_file_net(seq);
1971
1972         ++st->num;
1973         ++st->offset;
1974
1975         sk = sk_nulls_next(sk);
1976
1977         sk_nulls_for_each_from(sk, node) {
1978                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1979                         return sk;
1980         }
1981
1982         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1983         ++st->bucket;
1984         return established_get_first(seq);
1985 }
1986
1987 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1988 {
1989         struct tcp_iter_state *st = seq->private;
1990         void *rc;
1991
1992         st->bucket = 0;
1993         rc = established_get_first(seq);
1994
1995         while (rc && pos) {
1996                 rc = established_get_next(seq, rc);
1997                 --pos;
1998         }
1999         return rc;
2000 }
2001
2002 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2003 {
2004         void *rc;
2005         struct tcp_iter_state *st = seq->private;
2006
2007         st->state = TCP_SEQ_STATE_LISTENING;
2008         rc        = listening_get_idx(seq, &pos);
2009
2010         if (!rc) {
2011                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2012                 rc        = established_get_idx(seq, pos);
2013         }
2014
2015         return rc;
2016 }
2017
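     /* Resume a dump at the bucket and in-bucket offset recorded by the previous
      * read, so a large hash table does not have to be rescanned from the start
      * for every read() chunk.
      */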
2018 static void *tcp_seek_last_pos(struct seq_file *seq)
2019 {
2020         struct tcp_iter_state *st = seq->private;
2021         int offset = st->offset;
2022         int orig_num = st->num;
2023         void *rc = NULL;
2024
2025         switch (st->state) {
2026         case TCP_SEQ_STATE_LISTENING:
2027                 if (st->bucket >= INET_LHTABLE_SIZE)
2028                         break;
2029                 st->state = TCP_SEQ_STATE_LISTENING;
2030                 rc = listening_get_next(seq, NULL);
2031                 while (offset-- && rc)
2032                         rc = listening_get_next(seq, rc);
2033                 if (rc)
2034                         break;
2035                 st->bucket = 0;
2036                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2037                 /* Fallthrough */
2038         case TCP_SEQ_STATE_ESTABLISHED:
2039                 if (st->bucket > tcp_hashinfo.ehash_mask)
2040                         break;
2041                 rc = established_get_first(seq);
2042                 while (offset-- && rc)
2043                         rc = established_get_next(seq, rc);
2044         }
2045
2046         st->num = orig_num;
2047
2048         return rc;
2049 }
2050
2051 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2052 {
2053         struct tcp_iter_state *st = seq->private;
2054         void *rc;
2055
2056         if (*pos && *pos == st->last_pos) {
2057                 rc = tcp_seek_last_pos(seq);
2058                 if (rc)
2059                         goto out;
2060         }
2061
2062         st->state = TCP_SEQ_STATE_LISTENING;
2063         st->num = 0;
2064         st->bucket = 0;
2065         st->offset = 0;
2066         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2067
2068 out:
2069         st->last_pos = *pos;
2070         return rc;
2071 }
2072
2073 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2074 {
2075         struct tcp_iter_state *st = seq->private;
2076         void *rc = NULL;
2077
2078         if (v == SEQ_START_TOKEN) {
2079                 rc = tcp_get_idx(seq, 0);
2080                 goto out;
2081         }
2082
2083         switch (st->state) {
2084         case TCP_SEQ_STATE_LISTENING:
2085                 rc = listening_get_next(seq, v);
2086                 if (!rc) {
2087                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2088                         st->bucket = 0;
2089                         st->offset = 0;
2090                         rc        = established_get_first(seq);
2091                 }
2092                 break;
2093         case TCP_SEQ_STATE_ESTABLISHED:
2094                 rc = established_get_next(seq, v);
2095                 break;
2096         }
2097 out:
2098         ++*pos;
2099         st->last_pos = *pos;
2100         return rc;
2101 }
2102
2103 static void tcp_seq_stop(struct seq_file *seq, void *v)
2104 {
2105         struct tcp_iter_state *st = seq->private;
2106
2107         switch (st->state) {
2108         case TCP_SEQ_STATE_LISTENING:
2109                 if (v != SEQ_START_TOKEN)
2110                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2111                 break;
2112         case TCP_SEQ_STATE_ESTABLISHED:
2113                 if (v)
2114                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2115                 break;
2116         }
2117 }
2118
2119 int tcp_seq_open(struct inode *inode, struct file *file)
2120 {
2121         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2122         struct tcp_iter_state *s;
2123         int err;
2124
2125         err = seq_open_net(inode, file, &afinfo->seq_ops,
2126                           sizeof(struct tcp_iter_state));
2127         if (err < 0)
2128                 return err;
2129
2130         s = ((struct seq_file *)file->private_data)->private;
2131         s->family               = afinfo->family;
2132         s->last_pos             = 0;
2133         return 0;
2134 }
2135 EXPORT_SYMBOL(tcp_seq_open);
2136
2137 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2138 {
2139         int rc = 0;
2140         struct proc_dir_entry *p;
2141
2142         afinfo->seq_ops.start           = tcp_seq_start;
2143         afinfo->seq_ops.next            = tcp_seq_next;
2144         afinfo->seq_ops.stop            = tcp_seq_stop;
2145
2146         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2147                              afinfo->seq_fops, afinfo);
2148         if (!p)
2149                 rc = -ENOMEM;
2150         return rc;
2151 }
2152 EXPORT_SYMBOL(tcp_proc_register);
2153
2154 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2155 {
2156         remove_proc_entry(afinfo->name, net->proc_net);
2157 }
2158 EXPORT_SYMBOL(tcp_proc_unregister);
2159
2160 static void get_openreq4(const struct request_sock *req,
2161                          struct seq_file *f, int i)
2162 {
2163         const struct inet_request_sock *ireq = inet_rsk(req);
2164         long delta = req->rsk_timer.expires - jiffies;
2165
2166         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2167                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2168                 i,
2169                 ireq->ir_loc_addr,
2170                 ireq->ir_num,
2171                 ireq->ir_rmt_addr,
2172                 ntohs(ireq->ir_rmt_port),
2173                 TCP_SYN_RECV,
2174                 0, 0, /* could print option size, but that is af dependent. */
2175                 1,    /* timers active (only the expire timer) */
2176                 jiffies_delta_to_clock_t(delta),
2177                 req->num_timeout,
2178                 from_kuid_munged(seq_user_ns(f),
2179                                  sock_i_uid(req->rsk_listener)),
2180                 0,  /* non standard timer */
2181                 0, /* open_requests have no inode */
2182                 0,
2183                 req);
2184 }
2185
2186 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2187 {
2188         int timer_active;
2189         unsigned long timer_expires;
2190         const struct tcp_sock *tp = tcp_sk(sk);
2191         const struct inet_connection_sock *icsk = inet_csk(sk);
2192         const struct inet_sock *inet = inet_sk(sk);
2193         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2194         __be32 dest = inet->inet_daddr;
2195         __be32 src = inet->inet_rcv_saddr;
2196         __u16 destp = ntohs(inet->inet_dport);
2197         __u16 srcp = ntohs(inet->inet_sport);
2198         int rx_queue;
2199         int state;
2200
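             /* timer_active selects which timer the /proc/net/tcp line reports:
              * 1 retransmit (or early-retransmit/loss-probe), 4 zero-window probe,
              * 2 keepalive, 0 none; TIME_WAIT entries report 3 from
              * get_timewait4_sock().
              */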
2201         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2202             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2203             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2204                 timer_active    = 1;
2205                 timer_expires   = icsk->icsk_timeout;
2206         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2207                 timer_active    = 4;
2208                 timer_expires   = icsk->icsk_timeout;
2209         } else if (timer_pending(&sk->sk_timer)) {
2210                 timer_active    = 2;
2211                 timer_expires   = sk->sk_timer.expires;
2212         } else {
2213                 timer_active    = 0;
2214                 timer_expires = jiffies;
2215         }
2216
2217         state = sk_state_load(sk);
2218         if (state == TCP_LISTEN)
2219                 rx_queue = sk->sk_ack_backlog;
2220         else
2221                 /* Because we don't lock the socket,
2222                  * we might find a transient negative value.
2223                  */
2224                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2225
2226         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2227                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2228                 i, src, srcp, dest, destp, state,
2229                 tp->write_seq - tp->snd_una,
2230                 rx_queue,
2231                 timer_active,
2232                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2233                 icsk->icsk_retransmits,
2234                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2235                 icsk->icsk_probes_out,
2236                 sock_i_ino(sk),
2237                 atomic_read(&sk->sk_refcnt), sk,
2238                 jiffies_to_clock_t(icsk->icsk_rto),
2239                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2240                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2241                 tp->snd_cwnd,
2242                 state == TCP_LISTEN ?
2243                     fastopenq->max_qlen :
2244                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2245 }
2246
2247 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2248                                struct seq_file *f, int i)
2249 {
2250         long delta = tw->tw_timer.expires - jiffies;
2251         __be32 dest, src;
2252         __u16 destp, srcp;
2253
2254         dest  = tw->tw_daddr;
2255         src   = tw->tw_rcv_saddr;
2256         destp = ntohs(tw->tw_dport);
2257         srcp  = ntohs(tw->tw_sport);
2258
2259         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2260                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2261                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2262                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2263                 atomic_read(&tw->tw_refcnt), tw);
2264 }
2265
2266 #define TMPSZ 150
2267
2268 static int tcp4_seq_show(struct seq_file *seq, void *v)
2269 {
2270         struct tcp_iter_state *st;
2271         struct sock *sk = v;
2272
2273         seq_setwidth(seq, TMPSZ - 1);
2274         if (v == SEQ_START_TOKEN) {
2275                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2276                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2277                            "inode");
2278                 goto out;
2279         }
2280         st = seq->private;
2281
2282         if (sk->sk_state == TCP_TIME_WAIT)
2283                 get_timewait4_sock(v, seq, st->num);
2284         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2285                 get_openreq4(v, seq, st->num);
2286         else
2287                 get_tcp4_sock(v, seq, st->num);
2288 out:
2289         seq_pad(seq, '\n');
2290         return 0;
2291 }
2292
2293 static const struct file_operations tcp_afinfo_seq_fops = {
2294         .owner   = THIS_MODULE,
2295         .open    = tcp_seq_open,
2296         .read    = seq_read,
2297         .llseek  = seq_lseek,
2298         .release = seq_release_net
2299 };
2300
2301 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2302         .name           = "tcp",
2303         .family         = AF_INET,
2304         .seq_fops       = &tcp_afinfo_seq_fops,
2305         .seq_ops        = {
2306                 .show           = tcp4_seq_show,
2307         },
2308 };
2309
2310 static int __net_init tcp4_proc_init_net(struct net *net)
2311 {
2312         return tcp_proc_register(net, &tcp4_seq_afinfo);
2313 }
2314
2315 static void __net_exit tcp4_proc_exit_net(struct net *net)
2316 {
2317         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2318 }
2319
2320 static struct pernet_operations tcp4_net_ops = {
2321         .init = tcp4_proc_init_net,
2322         .exit = tcp4_proc_exit_net,
2323 };
2324
2325 int __init tcp4_proc_init(void)
2326 {
2327         return register_pernet_subsys(&tcp4_net_ops);
2328 }
2329
2330 void tcp4_proc_exit(void)
2331 {
2332         unregister_pernet_subsys(&tcp4_net_ops);
2333 }
2334 #endif /* CONFIG_PROC_FS */
2335
2336 struct proto tcp_prot = {
2337         .name                   = "TCP",
2338         .owner                  = THIS_MODULE,
2339         .close                  = tcp_close,
2340         .connect                = tcp_v4_connect,
2341         .disconnect             = tcp_disconnect,
2342         .accept                 = inet_csk_accept,
2343         .ioctl                  = tcp_ioctl,
2344         .init                   = tcp_v4_init_sock,
2345         .destroy                = tcp_v4_destroy_sock,
2346         .shutdown               = tcp_shutdown,
2347         .setsockopt             = tcp_setsockopt,
2348         .getsockopt             = tcp_getsockopt,
2349         .recvmsg                = tcp_recvmsg,
2350         .sendmsg                = tcp_sendmsg,
2351         .sendpage               = tcp_sendpage,
2352         .backlog_rcv            = tcp_v4_do_rcv,
2353         .release_cb             = tcp_release_cb,
2354         .hash                   = inet_hash,
2355         .unhash                 = inet_unhash,
2356         .get_port               = inet_csk_get_port,
2357         .enter_memory_pressure  = tcp_enter_memory_pressure,
2358         .stream_memory_free     = tcp_stream_memory_free,
2359         .sockets_allocated      = &tcp_sockets_allocated,
2360         .orphan_count           = &tcp_orphan_count,
2361         .memory_allocated       = &tcp_memory_allocated,
2362         .memory_pressure        = &tcp_memory_pressure,
2363         .sysctl_mem             = sysctl_tcp_mem,
2364         .sysctl_wmem            = sysctl_tcp_wmem,
2365         .sysctl_rmem            = sysctl_tcp_rmem,
2366         .max_header             = MAX_TCP_HEADER,
2367         .obj_size               = sizeof(struct tcp_sock),
2368         .slab_flags             = SLAB_DESTROY_BY_RCU,
2369         .twsk_prot              = &tcp_timewait_sock_ops,
2370         .rsk_prot               = &tcp_request_sock_ops,
2371         .h.hashinfo             = &tcp_hashinfo,
2372         .no_autobind            = true,
2373 #ifdef CONFIG_COMPAT
2374         .compat_setsockopt      = compat_tcp_setsockopt,
2375         .compat_getsockopt      = compat_tcp_getsockopt,
2376 #endif
2377 #ifdef CONFIG_MEMCG_KMEM
2378         .init_cgroup            = tcp_init_cgroup,
2379         .destroy_cgroup         = tcp_destroy_cgroup,
2380         .proto_cgroup           = tcp_proto_cgroup,
2381 #endif
2382 };
2383 EXPORT_SYMBOL(tcp_prot);
2384
2385 static void __net_exit tcp_sk_exit(struct net *net)
2386 {
2387         int cpu;
2388
2389         for_each_possible_cpu(cpu)
2390                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2391         free_percpu(net->ipv4.tcp_sk);
2392 }
2393
2394 static int __net_init tcp_sk_init(struct net *net)
2395 {
2396         int res, cpu;
2397
2398         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2399         if (!net->ipv4.tcp_sk)
2400                 return -ENOMEM;
2401
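             /* One raw control socket per possible CPU: replies that have no real
              * socket to send from (e.g. tcp_v4_send_reset()) go out through
              * these, avoiding any cross-CPU locking.
              */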
2402         for_each_possible_cpu(cpu) {
2403                 struct sock *sk;
2404
2405                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2406                                            IPPROTO_TCP, net);
2407                 if (res)
2408                         goto fail;
2409                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2410         }
2411
2412         net->ipv4.sysctl_tcp_ecn = 2;
2413         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2414
2415         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2416         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2417         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2418
2419         return 0;
2420 fail:
2421         tcp_sk_exit(net);
2422
2423         return res;
2424 }
2425
2426 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2427 {
2428         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2429 }
2430
2431 static struct pernet_operations __net_initdata tcp_sk_ops = {
2432        .init       = tcp_sk_init,
2433        .exit       = tcp_sk_exit,
2434        .exit_batch = tcp_sk_exit_batch,
2435 };
2436
2437 void __init tcp_v4_init(void)
2438 {
2439         inet_hashinfo_init(&tcp_hashinfo);
2440         if (register_pernet_subsys(&tcp_sk_ops))
2441                 panic("Failed to create the TCP control socket.\n");
2442 }