kvmfornfv.git: kernel/net/ipv4/tcp_ipv4.c (raw update to linux-4.4.6-rt14)
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/tcp_memcontrol.h>
77 #include <net/busy_poll.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
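/* Note: both knobs above are runtime tunables.  A minimal usage sketch
 * (the values are illustrative, not recommendations):
 *
 *   sysctl -w net.ipv4.tcp_tw_reuse=1     # allow reuse of TIME-WAIT ports
 *                                         # for new outgoing connections
 *   sysctl -w net.ipv4.tcp_low_latency=1  # skip the prequeue path
 *
 * tcp_tw_reuse is consulted in tcp_twsk_unique() below, tcp_low_latency
 * in tcp_prequeue().
 */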
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98 EXPORT_SYMBOL(tcp_hashinfo);
99
100 static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
101 {
102         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
103                                           ip_hdr(skb)->saddr,
104                                           tcp_hdr(skb)->dest,
105                                           tcp_hdr(skb)->source);
106 }
107
108 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
109 {
110         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111         struct tcp_sock *tp = tcp_sk(sk);
112
113         /* With PAWS, it is safe from the viewpoint
114            of data integrity. Even without PAWS it is safe provided sequence
115            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
116
117            Actually, the idea is close to VJ's one, only the timestamp cache
118            is held not per host but per port pair, and the TW bucket is used
119            as the state holder.
120
121            If the TW bucket has already been destroyed we fall back to VJ's
122            scheme and use the initial timestamp retrieved from the peer table.
123          */
124         if (tcptw->tw_ts_recent_stamp &&
125             (!twp || (sysctl_tcp_tw_reuse &&
126                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
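                /* Start the new connection's sequence space well above the
                 * old snd_nxt so the two cannot be confused (see the comment
                 * on overlapping sequence spaces above).
                 */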
127                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
128                 if (tp->write_seq == 0)
129                         tp->write_seq = 1;
130                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
131                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
132                 sock_hold(sktw);
133                 return 1;
134         }
135
136         return 0;
137 }
138 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
139
140 /* This will initiate an outgoing connection. */
141 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
142 {
143         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
144         struct inet_sock *inet = inet_sk(sk);
145         struct tcp_sock *tp = tcp_sk(sk);
146         __be16 orig_sport, orig_dport;
147         __be32 daddr, nexthop;
148         struct flowi4 *fl4;
149         struct rtable *rt;
150         int err;
151         struct ip_options_rcu *inet_opt;
152
153         if (addr_len < sizeof(struct sockaddr_in))
154                 return -EINVAL;
155
156         if (usin->sin_family != AF_INET)
157                 return -EAFNOSUPPORT;
158
159         nexthop = daddr = usin->sin_addr.s_addr;
160         inet_opt = rcu_dereference_protected(inet->inet_opt,
161                                              sock_owned_by_user(sk));
162         if (inet_opt && inet_opt->opt.srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet_opt->opt.faddr;
166         }
167
168         orig_sport = inet->inet_sport;
169         orig_dport = usin->sin_port;
170         fl4 = &inet->cork.fl.u.ip4;
171         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
172                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                               IPPROTO_TCP,
174                               orig_sport, orig_dport, sk);
175         if (IS_ERR(rt)) {
176                 err = PTR_ERR(rt);
177                 if (err == -ENETUNREACH)
178                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
179                 return err;
180         }
181
182         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
183                 ip_rt_put(rt);
184                 return -ENETUNREACH;
185         }
186
187         if (!inet_opt || !inet_opt->opt.srr)
188                 daddr = fl4->daddr;
189
190         if (!inet->inet_saddr)
191                 inet->inet_saddr = fl4->saddr;
192         sk_rcv_saddr_set(sk, inet->inet_saddr);
193
194         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
195                 /* Reset inherited state */
196                 tp->rx_opt.ts_recent       = 0;
197                 tp->rx_opt.ts_recent_stamp = 0;
198                 if (likely(!tp->repair))
199                         tp->write_seq      = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
204                 tcp_fetch_timewait_stamp(sk, &rt->dst);
205
206         inet->inet_dport = usin->sin_port;
207         sk_daddr_set(sk, daddr);
208
209         inet_csk(sk)->icsk_ext_hdr_len = 0;
210         if (inet_opt)
211                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
212
213         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
214
215         /* Socket identity is still unknown (sport may be zero).
216          * However we set the state to SYN-SENT and, without releasing the
217          * socket lock, select a source port, enter ourselves into the hash
218          * tables and complete initialization after this.
219          */
220         tcp_set_state(sk, TCP_SYN_SENT);
221         err = inet_hash_connect(&tcp_death_row, sk);
222         if (err)
223                 goto failure;
224
225         sk_set_txhash(sk);
226
227         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
228                                inet->inet_sport, inet->inet_dport, sk);
229         if (IS_ERR(rt)) {
230                 err = PTR_ERR(rt);
231                 rt = NULL;
232                 goto failure;
233         }
234         /* OK, now commit destination to socket.  */
235         sk->sk_gso_type = SKB_GSO_TCPV4;
236         sk_setup_caps(sk, &rt->dst);
237
238         if (!tp->write_seq && likely(!tp->repair))
239                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
240                                                            inet->inet_daddr,
241                                                            inet->inet_sport,
242                                                            usin->sin_port);
243
244         inet->inet_id = tp->write_seq ^ jiffies;
245
246         err = tcp_connect(sk);
247
248         rt = NULL;
249         if (err)
250                 goto failure;
251
252         return 0;
253
254 failure:
255         /*
256          * This unhashes the socket and releases the local port,
257          * if necessary.
258          */
259         tcp_set_state(sk, TCP_CLOSE);
260         ip_rt_put(rt);
261         sk->sk_route_caps = 0;
262         inet->inet_dport = 0;
263         return err;
264 }
265 EXPORT_SYMBOL(tcp_v4_connect);
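/* For reference, tcp_v4_connect() is reached from a plain userspace
 * connect() on an AF_INET stream socket, via inet_stream_connect() and
 * tcp_prot.connect.  An illustrative sketch (error handling omitted):
 *
 *   int fd = socket(AF_INET, SOCK_STREAM, 0);
 *   struct sockaddr_in dst = {
 *           .sin_family = AF_INET,
 *           .sin_port   = htons(80),
 *   };
 *   inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *   connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The addr_len and sin_family checks at the top of tcp_v4_connect()
 * correspond directly to the arguments passed here.
 */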
266
267 /*
268  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
269  * It can be called through tcp_release_cb() if socket was owned by user
270  * at the time tcp_v4_err() was called to handle ICMP message.
271  */
272 void tcp_v4_mtu_reduced(struct sock *sk)
273 {
274         struct dst_entry *dst;
275         struct inet_sock *inet = inet_sk(sk);
276         u32 mtu = tcp_sk(sk)->mtu_info;
277
278         dst = inet_csk_update_pmtu(sk, mtu);
279         if (!dst)
280                 return;
281
282         /* Something is about to go wrong... Remember the soft error
283          * for the case where this connection is not able to recover.
284          */
285         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
286                 sk->sk_err_soft = EMSGSIZE;
287
288         mtu = dst_mtu(dst);
289
290         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
291             ip_sk_accept_pmtu(sk) &&
292             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293                 tcp_sync_mss(sk, mtu);
294
295                 /* Resend the TCP packet because it's
296                  * clear that the old packet has been
297                  * dropped. This is the new "fast" path mtu
298                  * discovery.
299                  */
300                 tcp_simple_retransmit(sk);
301         } /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
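/* Rough worked example, assuming no IP or TCP options: for a reported
 * path MTU of 1400, tcp_sync_mss() ends up with an MSS of about
 * 1400 - 20 (IP header) - 20 (TCP header) = 1360 bytes, and segments
 * larger than that are retransmitted by tcp_simple_retransmit().
 */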
304
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307         struct dst_entry *dst = __sk_dst_check(sk, 0);
308
309         if (dst)
310                 dst->ops->redirect(dst, sk, skb);
311 }
312
313
314 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
315 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
316 {
317         struct request_sock *req = inet_reqsk(sk);
318         struct net *net = sock_net(sk);
319
320         /* ICMPs are not backlogged, hence we cannot get
321          * an established socket here.
322          */
323         WARN_ON(req->sk);
324
325         if (seq != tcp_rsk(req)->snt_isn) {
326                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
327         } else if (abort) {
328                 /*
329                  * Still in SYN_RECV, just remove it silently.
330                  * There is no good way to pass the error to the newly
331                  * created socket, and POSIX does not want network
332                  * errors returned from accept().
333                  */
334                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
335                 NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
336         }
337         reqsk_put(req);
338 }
339 EXPORT_SYMBOL(tcp_req_err);
340
341 /*
342  * This routine is called by the ICMP module when it gets some
343  * sort of error condition.  If err < 0 then the socket should
344  * be closed and the error returned to the user.  If err > 0
345  * it's just the icmp type << 8 | icmp code.  After adjustment
346  * header points to the first 8 bytes of the tcp header.  We need
347  * to find the appropriate port.
348  *
349  * The locking strategy used here is very "optimistic". When
350  * someone else accesses the socket the ICMP is just dropped
351  * and for some paths there is no check at all.
352  * A more general error queue to queue errors for later handling
353  * is probably better.
354  *
355  */
356
357 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
358 {
359         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
360         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
361         struct inet_connection_sock *icsk;
362         struct tcp_sock *tp;
363         struct inet_sock *inet;
364         const int type = icmp_hdr(icmp_skb)->type;
365         const int code = icmp_hdr(icmp_skb)->code;
366         struct sock *sk;
367         struct sk_buff *skb;
368         struct request_sock *fastopen;
369         __u32 seq, snd_una;
370         __u32 remaining;
371         int err;
372         struct net *net = dev_net(icmp_skb->dev);
373
374         sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
375                                        th->dest, iph->saddr, ntohs(th->source),
376                                        inet_iif(icmp_skb));
377         if (!sk) {
378                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
379                 return;
380         }
381         if (sk->sk_state == TCP_TIME_WAIT) {
382                 inet_twsk_put(inet_twsk(sk));
383                 return;
384         }
385         seq = ntohl(th->seq);
386         if (sk->sk_state == TCP_NEW_SYN_RECV)
387                 return tcp_req_err(sk, seq,
388                                   type == ICMP_PARAMETERPROB ||
389                                   type == ICMP_TIME_EXCEEDED ||
390                                   (type == ICMP_DEST_UNREACH &&
391                                    (code == ICMP_NET_UNREACH ||
392                                     code == ICMP_HOST_UNREACH)));
393
394         bh_lock_sock(sk);
395         /* If too many ICMPs get dropped on busy
396          * servers this needs to be solved differently.
397          * We do take care of the PMTU discovery (RFC1191) special case:
398          * we can receive locally generated ICMP messages while the socket is held.
399          */
400         if (sock_owned_by_user(sk)) {
401                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
402                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
403         }
404         if (sk->sk_state == TCP_CLOSE)
405                 goto out;
406
407         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
408                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
409                 goto out;
410         }
411
412         icsk = inet_csk(sk);
413         tp = tcp_sk(sk);
414         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
415         fastopen = tp->fastopen_rsk;
416         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
417         if (sk->sk_state != TCP_LISTEN &&
418             !between(seq, snd_una, tp->snd_nxt)) {
419                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
420                 goto out;
421         }
422
423         switch (type) {
424         case ICMP_REDIRECT:
425                 do_redirect(icmp_skb, sk);
426                 goto out;
427         case ICMP_SOURCE_QUENCH:
428                 /* Just silently ignore these. */
429                 goto out;
430         case ICMP_PARAMETERPROB:
431                 err = EPROTO;
432                 break;
433         case ICMP_DEST_UNREACH:
434                 if (code > NR_ICMP_UNREACH)
435                         goto out;
436
437                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
438                         /* We are not interested in TCP_LISTEN and open_requests
439                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
440                          * they should go through unfragmented).
441                          */
442                         if (sk->sk_state == TCP_LISTEN)
443                                 goto out;
444
445                         tp->mtu_info = info;
446                         if (!sock_owned_by_user(sk)) {
447                                 tcp_v4_mtu_reduced(sk);
448                         } else {
449                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
450                                         sock_hold(sk);
451                         }
452                         goto out;
453                 }
454
455                 err = icmp_err_convert[code].errno;
456                 /* check if icmp_skb allows revert of backoff
457                  * (see draft-zimmermann-tcp-lcd) */
458                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
459                         break;
460                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
461                     !icsk->icsk_backoff || fastopen)
462                         break;
463
464                 if (sock_owned_by_user(sk))
465                         break;
466
467                 icsk->icsk_backoff--;
468                 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
469                                                TCP_TIMEOUT_INIT;
470                 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
471
472                 skb = tcp_write_queue_head(sk);
473                 BUG_ON(!skb);
474
475                 remaining = icsk->icsk_rto -
476                             min(icsk->icsk_rto,
477                                 tcp_time_stamp - tcp_skb_timestamp(skb));
478
479                 if (remaining) {
480                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
481                                                   remaining, TCP_RTO_MAX);
482                 } else {
483                         /* RTO revert clocked out retransmission.
484                          * Will retransmit now */
485                         tcp_retransmit_timer(sk);
486                 }
487
488                 break;
489         case ICMP_TIME_EXCEEDED:
490                 err = EHOSTUNREACH;
491                 break;
492         default:
493                 goto out;
494         }
495
496         switch (sk->sk_state) {
497         case TCP_SYN_SENT:
498         case TCP_SYN_RECV:
499                 /* Only in fast or simultaneous open. If a fast open socket
500                  * is already accepted it is treated as a connected one below.
501                  */
502                 if (fastopen && !fastopen->sk)
503                         break;
504
505                 if (!sock_owned_by_user(sk)) {
506                         sk->sk_err = err;
507
508                         sk->sk_error_report(sk);
509
510                         tcp_done(sk);
511                 } else {
512                         sk->sk_err_soft = err;
513                 }
514                 goto out;
515         }
516
517         /* If we've already connected we will keep trying
518          * until we time out, or the user gives up.
519          *
520          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
521          * considered hard errors (well, FRAG_FAILED too,
522          * but it is obsoleted by pmtu discovery).
523          *
524          * Note that in the modern internet, where routing is unreliable and
525          * broken firewalls sit in every dark corner sending random errors
526          * ordered by their masters, even these two messages have finally
527          * lost their original sense (even Linux sends invalid PORT_UNREACHs).
528          *
529          * Now we are in compliance with RFCs.
530          *                                                      --ANK (980905)
531          */
532
533         inet = inet_sk(sk);
534         if (!sock_owned_by_user(sk) && inet->recverr) {
535                 sk->sk_err = err;
536                 sk->sk_error_report(sk);
537         } else  { /* Only an error on timeout */
538                 sk->sk_err_soft = err;
539         }
540
541 out:
542         bh_unlock_sock(sk);
543         sock_put(sk);
544 }
545
546 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
547 {
548         struct tcphdr *th = tcp_hdr(skb);
549
550         if (skb->ip_summed == CHECKSUM_PARTIAL) {
551                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
552                 skb->csum_start = skb_transport_header(skb) - skb->head;
553                 skb->csum_offset = offsetof(struct tcphdr, check);
554         } else {
555                 th->check = tcp_v4_check(skb->len, saddr, daddr,
556                                          csum_partial(th,
557                                                       th->doff << 2,
558                                                       skb->csum));
559         }
560 }
561
562 /* This routine computes an IPv4 TCP checksum. */
563 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
564 {
565         const struct inet_sock *inet = inet_sk(sk);
566
567         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
568 }
569 EXPORT_SYMBOL(tcp_v4_send_check);
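/* For reference, the checksum computed above covers the standard IPv4
 * pseudo-header as well as the TCP header and payload.  The pseudo-header
 * layout (see struct tcp4_pseudohdr used further down in this file) is:
 *
 *   __be32  saddr;      source IPv4 address
 *   __be32  daddr;      destination IPv4 address
 *   __u8    pad;        always zero
 *   __u8    protocol;   IPPROTO_TCP (6)
 *   __be16  len;        TCP header + payload length
 *
 * In the CHECKSUM_PARTIAL case only this pseudo-header part is folded in
 * here; the device (or skb_checksum_help()) finishes the sum over the
 * actual TCP header and data.
 */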
570
571 /*
572  *      This routine will send an RST to the other tcp.
573  *
574  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
575  *                    for the reset?
576  *      Answer: if a packet caused an RST, it is not for a socket
577  *              existing in our system; if it is matched to a socket,
578  *              it is just a duplicate segment or a bug in the other side's TCP.
579  *              So we build the reply based only on the parameters that
580  *              arrived with the segment.
581  *      Exception: precedence violation. We do not implement it in any case.
582  */
583
584 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
585 {
586         const struct tcphdr *th = tcp_hdr(skb);
587         struct {
588                 struct tcphdr th;
589 #ifdef CONFIG_TCP_MD5SIG
590                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
591 #endif
592         } rep;
593         struct ip_reply_arg arg;
594 #ifdef CONFIG_TCP_MD5SIG
595         struct tcp_md5sig_key *key;
596         const __u8 *hash_location = NULL;
597         unsigned char newhash[16];
598         int genhash;
599         struct sock *sk1 = NULL;
600 #endif
601         struct net *net;
602
603         /* Never send a reset in response to a reset. */
604         if (th->rst)
605                 return;
606
607         /* If sk is not NULL, it means we did a successful lookup and the
608          * incoming route had to be correct. prequeue might have dropped our dst.
609          */
610         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
611                 return;
612
613         /* Swap the send and the receive. */
614         memset(&rep, 0, sizeof(rep));
615         rep.th.dest   = th->source;
616         rep.th.source = th->dest;
617         rep.th.doff   = sizeof(struct tcphdr) / 4;
618         rep.th.rst    = 1;
619
620         if (th->ack) {
621                 rep.th.seq = th->ack_seq;
622         } else {
623                 rep.th.ack = 1;
624                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
625                                        skb->len - (th->doff << 2));
626         }
627
628         memset(&arg, 0, sizeof(arg));
629         arg.iov[0].iov_base = (unsigned char *)&rep;
630         arg.iov[0].iov_len  = sizeof(rep.th);
631
632         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
633 #ifdef CONFIG_TCP_MD5SIG
634         hash_location = tcp_parse_md5sig_option(th);
635         if (!sk && hash_location) {
636                 /*
637                  * active side is lost. Try to find the listening socket through
638                  * the source port, and then find the md5 key through the listening
639                  * socket. We do not lose security here: the incoming packet is
640                  * checked against the md5 hash of the found key, and no RST is
641                  * generated if the hash doesn't match.
642                  */
643                 sk1 = __inet_lookup_listener(net,
644                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
645                                              th->source, ip_hdr(skb)->daddr,
646                                              ntohs(th->source), inet_iif(skb));
647                 /* don't send rst if it can't find key */
648                 if (!sk1)
649                         return;
650                 rcu_read_lock();
651                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
652                                         &ip_hdr(skb)->saddr, AF_INET);
653                 if (!key)
654                         goto release_sk1;
655
656                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
657                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
658                         goto release_sk1;
659         } else {
660                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
661                                              &ip_hdr(skb)->saddr,
662                                              AF_INET) : NULL;
663         }
664
665         if (key) {
666                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
667                                    (TCPOPT_NOP << 16) |
668                                    (TCPOPT_MD5SIG << 8) |
669                                    TCPOLEN_MD5SIG);
670                 /* Update length and the length the header thinks exists */
671                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
672                 rep.th.doff = arg.iov[0].iov_len / 4;
673
674                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
675                                      key, ip_hdr(skb)->saddr,
676                                      ip_hdr(skb)->daddr, &rep.th);
677         }
678 #endif
679         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
680                                       ip_hdr(skb)->saddr, /* XXX */
681                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
682         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
683         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
684         /* When the socket is gone, all binding information is lost and routing
685          * might fail in this case. No choice here: if we choose to force the
686          * input interface, we will misroute in case of an asymmetric route.
687          */
688         if (sk)
689                 arg.bound_dev_if = sk->sk_bound_dev_if;
690
691         arg.tos = ip_hdr(skb)->tos;
692         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
693                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
694                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
695                               &arg, arg.iov[0].iov_len);
696
697         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
698         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
699
700 #ifdef CONFIG_TCP_MD5SIG
701 release_sk1:
702         if (sk1) {
703                 rcu_read_unlock();
704                 sock_put(sk1);
705         }
706 #endif
707 }
708
709 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
710    outside socket context, is certainly ugly. What can I do?
711  */
712
713 static void tcp_v4_send_ack(struct net *net,
714                             struct sk_buff *skb, u32 seq, u32 ack,
715                             u32 win, u32 tsval, u32 tsecr, int oif,
716                             struct tcp_md5sig_key *key,
717                             int reply_flags, u8 tos)
718 {
719         const struct tcphdr *th = tcp_hdr(skb);
720         struct {
721                 struct tcphdr th;
722                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
723 #ifdef CONFIG_TCP_MD5SIG
724                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
725 #endif
726                         ];
727         } rep;
728         struct ip_reply_arg arg;
729
730         memset(&rep.th, 0, sizeof(struct tcphdr));
731         memset(&arg, 0, sizeof(arg));
732
733         arg.iov[0].iov_base = (unsigned char *)&rep;
734         arg.iov[0].iov_len  = sizeof(rep.th);
735         if (tsecr) {
736                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
737                                    (TCPOPT_TIMESTAMP << 8) |
738                                    TCPOLEN_TIMESTAMP);
739                 rep.opt[1] = htonl(tsval);
740                 rep.opt[2] = htonl(tsecr);
741                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
742         }
743
744         /* Swap the send and the receive. */
745         rep.th.dest    = th->source;
746         rep.th.source  = th->dest;
747         rep.th.doff    = arg.iov[0].iov_len / 4;
748         rep.th.seq     = htonl(seq);
749         rep.th.ack_seq = htonl(ack);
750         rep.th.ack     = 1;
751         rep.th.window  = htons(win);
752
753 #ifdef CONFIG_TCP_MD5SIG
754         if (key) {
755                 int offset = (tsecr) ? 3 : 0;
756
757                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
758                                           (TCPOPT_NOP << 16) |
759                                           (TCPOPT_MD5SIG << 8) |
760                                           TCPOLEN_MD5SIG);
761                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
762                 rep.th.doff = arg.iov[0].iov_len/4;
763
764                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
765                                     key, ip_hdr(skb)->saddr,
766                                     ip_hdr(skb)->daddr, &rep.th);
767         }
768 #endif
769         arg.flags = reply_flags;
770         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
771                                       ip_hdr(skb)->saddr, /* XXX */
772                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
773         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
774         if (oif)
775                 arg.bound_dev_if = oif;
776         arg.tos = tos;
777         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
778                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
779                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
780                               &arg, arg.iov[0].iov_len);
781
782         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
783 }
784
785 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
786 {
787         struct inet_timewait_sock *tw = inet_twsk(sk);
788         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
789
790         tcp_v4_send_ack(sock_net(sk), skb,
791                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
792                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
793                         tcp_time_stamp + tcptw->tw_ts_offset,
794                         tcptw->tw_ts_recent,
795                         tw->tw_bound_dev_if,
796                         tcp_twsk_md5_key(tcptw),
797                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
798                         tw->tw_tos
799                         );
800
801         inet_twsk_put(tw);
802 }
803
804 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
805                                   struct request_sock *req)
806 {
807         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
808          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
809          */
810         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
811                                              tcp_sk(sk)->snd_nxt;
812
813         tcp_v4_send_ack(sock_net(sk), skb, seq,
814                         tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
815                         tcp_time_stamp,
816                         req->ts_recent,
817                         0,
818                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
819                                           AF_INET),
820                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
821                         ip_hdr(skb)->tos);
822 }
823
824 /*
825  *      Send a SYN-ACK after having received a SYN.
826  *      This still operates on a request_sock only, not on a big
827  *      socket.
828  */
829 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
830                               struct flowi *fl,
831                               struct request_sock *req,
832                               struct tcp_fastopen_cookie *foc,
833                                   bool attach_req)
834 {
835         const struct inet_request_sock *ireq = inet_rsk(req);
836         struct flowi4 fl4;
837         int err = -1;
838         struct sk_buff *skb;
839
840         /* First, grab a route. */
841         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
842                 return -1;
843
844         skb = tcp_make_synack(sk, dst, req, foc, attach_req);
845
846         if (skb) {
847                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
848
849                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
850                                             ireq->ir_rmt_addr,
851                                             ireq->opt);
852                 err = net_xmit_eval(err);
853         }
854
855         return err;
856 }
857
858 /*
859  *      IPv4 request_sock destructor.
860  */
861 static void tcp_v4_reqsk_destructor(struct request_sock *req)
862 {
863         kfree(inet_rsk(req)->opt);
864 }
865
866
867 #ifdef CONFIG_TCP_MD5SIG
868 /*
869  * RFC2385 MD5 checksumming requires a mapping of
870  * IP address->MD5 Key.
871  * We need to maintain these in the sk structure.
872  */
873
874 /* Find the Key structure for an address.  */
875 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
876                                          const union tcp_md5_addr *addr,
877                                          int family)
878 {
879         const struct tcp_sock *tp = tcp_sk(sk);
880         struct tcp_md5sig_key *key;
881         unsigned int size = sizeof(struct in_addr);
882         const struct tcp_md5sig_info *md5sig;
883
884         /* caller either holds rcu_read_lock() or socket lock */
885         md5sig = rcu_dereference_check(tp->md5sig_info,
886                                        sock_owned_by_user(sk) ||
887                                        lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
888         if (!md5sig)
889                 return NULL;
890 #if IS_ENABLED(CONFIG_IPV6)
891         if (family == AF_INET6)
892                 size = sizeof(struct in6_addr);
893 #endif
894         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
895                 if (key->family != family)
896                         continue;
897                 if (!memcmp(&key->addr, addr, size))
898                         return key;
899         }
900         return NULL;
901 }
902 EXPORT_SYMBOL(tcp_md5_do_lookup);
903
904 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
905                                          const struct sock *addr_sk)
906 {
907         const union tcp_md5_addr *addr;
908
909         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
910         return tcp_md5_do_lookup(sk, addr, AF_INET);
911 }
912 EXPORT_SYMBOL(tcp_v4_md5_lookup);
913
914 /* This can be called on a newly created socket, from other files */
915 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
916                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
917 {
918         /* Add Key to the list */
919         struct tcp_md5sig_key *key;
920         struct tcp_sock *tp = tcp_sk(sk);
921         struct tcp_md5sig_info *md5sig;
922
923         key = tcp_md5_do_lookup(sk, addr, family);
924         if (key) {
925                 /* Pre-existing entry - just update that one. */
926                 memcpy(key->key, newkey, newkeylen);
927                 key->keylen = newkeylen;
928                 return 0;
929         }
930
931         md5sig = rcu_dereference_protected(tp->md5sig_info,
932                                            sock_owned_by_user(sk) ||
933                                            lockdep_is_held(&sk->sk_lock.slock));
934         if (!md5sig) {
935                 md5sig = kmalloc(sizeof(*md5sig), gfp);
936                 if (!md5sig)
937                         return -ENOMEM;
938
939                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
940                 INIT_HLIST_HEAD(&md5sig->head);
941                 rcu_assign_pointer(tp->md5sig_info, md5sig);
942         }
943
944         key = sock_kmalloc(sk, sizeof(*key), gfp);
945         if (!key)
946                 return -ENOMEM;
947         if (!tcp_alloc_md5sig_pool()) {
948                 sock_kfree_s(sk, key, sizeof(*key));
949                 return -ENOMEM;
950         }
951
952         memcpy(key->key, newkey, newkeylen);
953         key->keylen = newkeylen;
954         key->family = family;
955         memcpy(&key->addr, addr,
956                (family == AF_INET6) ? sizeof(struct in6_addr) :
957                                       sizeof(struct in_addr));
958         hlist_add_head_rcu(&key->node, &md5sig->head);
959         return 0;
960 }
961 EXPORT_SYMBOL(tcp_md5_do_add);
962
963 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
964 {
965         struct tcp_md5sig_key *key;
966
967         key = tcp_md5_do_lookup(sk, addr, family);
968         if (!key)
969                 return -ENOENT;
970         hlist_del_rcu(&key->node);
971         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
972         kfree_rcu(key, rcu);
973         return 0;
974 }
975 EXPORT_SYMBOL(tcp_md5_do_del);
976
977 static void tcp_clear_md5_list(struct sock *sk)
978 {
979         struct tcp_sock *tp = tcp_sk(sk);
980         struct tcp_md5sig_key *key;
981         struct hlist_node *n;
982         struct tcp_md5sig_info *md5sig;
983
984         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
985
986         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
987                 hlist_del_rcu(&key->node);
988                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
989                 kfree_rcu(key, rcu);
990         }
991 }
992
993 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
994                                  int optlen)
995 {
996         struct tcp_md5sig cmd;
997         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
998
999         if (optlen < sizeof(cmd))
1000                 return -EINVAL;
1001
1002         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1003                 return -EFAULT;
1004
1005         if (sin->sin_family != AF_INET)
1006                 return -EINVAL;
1007
1008         if (!cmd.tcpm_keylen)
1009                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1010                                       AF_INET);
1011
1012         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1013                 return -EINVAL;
1014
1015         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1016                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1017                               GFP_KERNEL);
1018 }
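/* The handler above backs the TCP_MD5SIG socket option (RFC 2385 keys,
 * typically used to protect BGP sessions).  An illustrative userspace
 * sketch for installing a key on a socket (error handling omitted):
 *
 *   struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *   struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *   peer->sin_family = AF_INET;
 *   inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *   memcpy(md5.tcpm_key, "secret", 6);
 *   setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen removes the key again, taking the tcp_md5_do_del()
 * branch above.
 */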
1019
1020 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1021                                         __be32 daddr, __be32 saddr, int nbytes)
1022 {
1023         struct tcp4_pseudohdr *bp;
1024         struct scatterlist sg;
1025
1026         bp = &hp->md5_blk.ip4;
1027
1028         /*
1029          * 1. the TCP pseudo-header (in the order: source IP address,
1030          * destination IP address, zero-padded protocol number, and
1031          * segment length)
1032          */
1033         bp->saddr = saddr;
1034         bp->daddr = daddr;
1035         bp->pad = 0;
1036         bp->protocol = IPPROTO_TCP;
1037         bp->len = cpu_to_be16(nbytes);
1038
1039         sg_init_one(&sg, bp, sizeof(*bp));
1040         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1041 }
1042
1043 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1044                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1045 {
1046         struct tcp_md5sig_pool *hp;
1047         struct hash_desc *desc;
1048
1049         hp = tcp_get_md5sig_pool();
1050         if (!hp)
1051                 goto clear_hash_noput;
1052         desc = &hp->md5_desc;
1053
1054         if (crypto_hash_init(desc))
1055                 goto clear_hash;
1056         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1057                 goto clear_hash;
1058         if (tcp_md5_hash_header(hp, th))
1059                 goto clear_hash;
1060         if (tcp_md5_hash_key(hp, key))
1061                 goto clear_hash;
1062         if (crypto_hash_final(desc, md5_hash))
1063                 goto clear_hash;
1064
1065         tcp_put_md5sig_pool();
1066         return 0;
1067
1068 clear_hash:
1069         tcp_put_md5sig_pool();
1070 clear_hash_noput:
1071         memset(md5_hash, 0, 16);
1072         return 1;
1073 }
1074
1075 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1076                         const struct sock *sk,
1077                         const struct sk_buff *skb)
1078 {
1079         struct tcp_md5sig_pool *hp;
1080         struct hash_desc *desc;
1081         const struct tcphdr *th = tcp_hdr(skb);
1082         __be32 saddr, daddr;
1083
1084         if (sk) { /* valid for establish/request sockets */
1085                 saddr = sk->sk_rcv_saddr;
1086                 daddr = sk->sk_daddr;
1087         } else {
1088                 const struct iphdr *iph = ip_hdr(skb);
1089                 saddr = iph->saddr;
1090                 daddr = iph->daddr;
1091         }
1092
1093         hp = tcp_get_md5sig_pool();
1094         if (!hp)
1095                 goto clear_hash_noput;
1096         desc = &hp->md5_desc;
1097
1098         if (crypto_hash_init(desc))
1099                 goto clear_hash;
1100
1101         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1102                 goto clear_hash;
1103         if (tcp_md5_hash_header(hp, th))
1104                 goto clear_hash;
1105         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1106                 goto clear_hash;
1107         if (tcp_md5_hash_key(hp, key))
1108                 goto clear_hash;
1109         if (crypto_hash_final(desc, md5_hash))
1110                 goto clear_hash;
1111
1112         tcp_put_md5sig_pool();
1113         return 0;
1114
1115 clear_hash:
1116         tcp_put_md5sig_pool();
1117 clear_hash_noput:
1118         memset(md5_hash, 0, 16);
1119         return 1;
1120 }
1121 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1122
1123 #endif
1124
1125 /* Called with rcu_read_lock() */
1126 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1127                                     const struct sk_buff *skb)
1128 {
1129 #ifdef CONFIG_TCP_MD5SIG
1130         /*
1131          * This gets called for each TCP segment that arrives
1132          * so we want to be efficient.
1133          * We have 3 drop cases:
1134          * o No MD5 hash and one expected.
1135          * o MD5 hash and we're not expecting one.
1136          * o MD5 hash and it's wrong.
1137          */
1138         const __u8 *hash_location = NULL;
1139         struct tcp_md5sig_key *hash_expected;
1140         const struct iphdr *iph = ip_hdr(skb);
1141         const struct tcphdr *th = tcp_hdr(skb);
1142         int genhash;
1143         unsigned char newhash[16];
1144
1145         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1146                                           AF_INET);
1147         hash_location = tcp_parse_md5sig_option(th);
1148
1149         /* We've parsed the options - do we have a hash? */
1150         if (!hash_expected && !hash_location)
1151                 return false;
1152
1153         if (hash_expected && !hash_location) {
1154                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1155                 return true;
1156         }
1157
1158         if (!hash_expected && hash_location) {
1159                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1160                 return true;
1161         }
1162
1163         /* Okay, so this is hash_expected and hash_location -
1164          * so we need to calculate the checksum.
1165          */
1166         genhash = tcp_v4_md5_hash_skb(newhash,
1167                                       hash_expected,
1168                                       NULL, skb);
1169
1170         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1171                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1172                                      &iph->saddr, ntohs(th->source),
1173                                      &iph->daddr, ntohs(th->dest),
1174                                      genhash ? " tcp_v4_calc_md5_hash failed"
1175                                      : "");
1176                 return true;
1177         }
1178         return false;
1179 #endif
1180         return false;
1181 }
1182
1183 static void tcp_v4_init_req(struct request_sock *req,
1184                             const struct sock *sk_listener,
1185                             struct sk_buff *skb)
1186 {
1187         struct inet_request_sock *ireq = inet_rsk(req);
1188
1189         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1190         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1191         ireq->no_srccheck = inet_sk(sk_listener)->transparent;
1192         ireq->opt = tcp_v4_save_options(skb);
1193 }
1194
1195 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1196                                           struct flowi *fl,
1197                                           const struct request_sock *req,
1198                                           bool *strict)
1199 {
1200         struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
1201
1202         if (strict) {
1203                 if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
1204                         *strict = true;
1205                 else
1206                         *strict = false;
1207         }
1208
1209         return dst;
1210 }
1211
1212 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1213         .family         =       PF_INET,
1214         .obj_size       =       sizeof(struct tcp_request_sock),
1215         .rtx_syn_ack    =       tcp_rtx_synack,
1216         .send_ack       =       tcp_v4_reqsk_send_ack,
1217         .destructor     =       tcp_v4_reqsk_destructor,
1218         .send_reset     =       tcp_v4_send_reset,
1219         .syn_ack_timeout =      tcp_syn_ack_timeout,
1220 };
1221
1222 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1223         .mss_clamp      =       TCP_MSS_DEFAULT,
1224 #ifdef CONFIG_TCP_MD5SIG
1225         .req_md5_lookup =       tcp_v4_md5_lookup,
1226         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1227 #endif
1228         .init_req       =       tcp_v4_init_req,
1229 #ifdef CONFIG_SYN_COOKIES
1230         .cookie_init_seq =      cookie_v4_init_sequence,
1231 #endif
1232         .route_req      =       tcp_v4_route_req,
1233         .init_seq       =       tcp_v4_init_sequence,
1234         .send_synack    =       tcp_v4_send_synack,
1235 };
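/* These ops are handed to tcp_conn_request() below.  Roughly, for an
 * incoming SYN it calls ->init_req() to fill the request sock from the
 * skb, ->route_req() to find a route, ->init_seq() to pick the initial
 * sequence number and ->send_synack() to answer, with ->cookie_init_seq()
 * used instead when syncookies kick in (summary based on
 * tcp_conn_request() in net/ipv4/tcp_input.c).
 */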
1236
1237 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1238 {
1239         /* Never answer SYNs sent to broadcast or multicast */
1240         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1241                 goto drop;
1242
1243         return tcp_conn_request(&tcp_request_sock_ops,
1244                                 &tcp_request_sock_ipv4_ops, sk, skb);
1245
1246 drop:
1247         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1248         return 0;
1249 }
1250 EXPORT_SYMBOL(tcp_v4_conn_request);
1251
1252
1253 /*
1254  * The three way handshake has completed - we got a valid synack -
1255  * now create the new socket.
1256  */
1257 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1258                                   struct request_sock *req,
1259                                   struct dst_entry *dst,
1260                                   struct request_sock *req_unhash,
1261                                   bool *own_req)
1262 {
1263         struct inet_request_sock *ireq;
1264         struct inet_sock *newinet;
1265         struct tcp_sock *newtp;
1266         struct sock *newsk;
1267 #ifdef CONFIG_TCP_MD5SIG
1268         struct tcp_md5sig_key *key;
1269 #endif
1270         struct ip_options_rcu *inet_opt;
1271
1272         if (sk_acceptq_is_full(sk))
1273                 goto exit_overflow;
1274
1275         newsk = tcp_create_openreq_child(sk, req, skb);
1276         if (!newsk)
1277                 goto exit_nonewsk;
1278
1279         newsk->sk_gso_type = SKB_GSO_TCPV4;
1280         inet_sk_rx_dst_set(newsk, skb);
1281
1282         newtp                 = tcp_sk(newsk);
1283         newinet               = inet_sk(newsk);
1284         ireq                  = inet_rsk(req);
1285         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1286         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1287         newinet->inet_saddr           = ireq->ir_loc_addr;
1288         inet_opt              = ireq->opt;
1289         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1290         ireq->opt             = NULL;
1291         newinet->mc_index     = inet_iif(skb);
1292         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1293         newinet->rcv_tos      = ip_hdr(skb)->tos;
1294         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1295         if (inet_opt)
1296                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1297         newinet->inet_id = newtp->write_seq ^ jiffies;
1298
1299         if (!dst) {
1300                 dst = inet_csk_route_child_sock(sk, newsk, req);
1301                 if (!dst)
1302                         goto put_and_exit;
1303         } else {
1304                 /* syncookie case : see end of cookie_v4_check() */
1305         }
1306         sk_setup_caps(newsk, dst);
1307
1308         tcp_ca_openreq_child(newsk, dst);
1309
1310         tcp_sync_mss(newsk, dst_mtu(dst));
1311         newtp->advmss = dst_metric_advmss(dst);
1312         if (tcp_sk(sk)->rx_opt.user_mss &&
1313             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1314                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1315
1316         tcp_initialize_rcv_mss(newsk);
1317
1318 #ifdef CONFIG_TCP_MD5SIG
1319         /* Copy over the MD5 key from the original socket */
1320         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1321                                 AF_INET);
1322         if (key) {
1323                 /*
1324                  * We're using one, so create a matching key
1325                  * on the newsk structure. If we fail to get
1326                  * memory, then we end up not copying the key
1327                  * across. Shucks.
1328                  */
1329                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1330                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1331                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1332         }
1333 #endif
1334
1335         if (__inet_inherit_port(sk, newsk) < 0)
1336                 goto put_and_exit;
1337         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1338         if (*own_req)
1339                 tcp_move_syn(newtp, req);
1340
1341         return newsk;
1342
1343 exit_overflow:
1344         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1345 exit_nonewsk:
1346         dst_release(dst);
1347 exit:
1348         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1349         return NULL;
1350 put_and_exit:
1351         inet_csk_prepare_forced_close(newsk);
1352         tcp_done(newsk);
1353         goto exit;
1354 }
1355 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
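/* tcp_v4_syn_recv_sock() is reached via the icsk_af_ops->syn_recv_sock
 * hook: from tcp_check_req() when the final ACK of the handshake arrives,
 * and from the syncookie path for cookie-validated connections.
 */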
1356
1357 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1358 {
1359 #ifdef CONFIG_SYN_COOKIES
1360         const struct tcphdr *th = tcp_hdr(skb);
1361
1362         if (!th->syn)
1363                 sk = cookie_v4_check(sk, skb);
1364 #endif
1365         return sk;
1366 }
1367
1368 /* The socket must have its spinlock held when we get
1369  * here, unless it is a TCP_LISTEN socket.
1370  *
1371  * We have a potential double-lock case here, so even when
1372  * doing backlog processing we use the BH locking scheme.
1373  * This is because we cannot sleep with the original spinlock
1374  * held.
1375  */
1376 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1377 {
1378         struct sock *rsk;
1379
1380         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1381                 struct dst_entry *dst = sk->sk_rx_dst;
1382
1383                 sock_rps_save_rxhash(sk, skb);
1384                 sk_mark_napi_id(sk, skb);
1385                 if (dst) {
1386                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1387                             !dst->ops->check(dst, 0)) {
1388                                 dst_release(dst);
1389                                 sk->sk_rx_dst = NULL;
1390                         }
1391                 }
1392                 tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
1393                 return 0;
1394         }
1395
1396         if (tcp_checksum_complete(skb))
1397                 goto csum_err;
1398
1399         if (sk->sk_state == TCP_LISTEN) {
1400                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1401
1402                 if (!nsk)
1403                         goto discard;
1404                 if (nsk != sk) {
1405                         sock_rps_save_rxhash(nsk, skb);
1406                         sk_mark_napi_id(nsk, skb);
1407                         if (tcp_child_process(sk, nsk, skb)) {
1408                                 rsk = nsk;
1409                                 goto reset;
1410                         }
1411                         return 0;
1412                 }
1413         } else
1414                 sock_rps_save_rxhash(sk, skb);
1415
1416         if (tcp_rcv_state_process(sk, skb)) {
1417                 rsk = sk;
1418                 goto reset;
1419         }
1420         return 0;
1421
1422 reset:
1423         tcp_v4_send_reset(rsk, skb);
1424 discard:
1425         kfree_skb(skb);
1426         /* Be careful here. If this function gets more complicated and
1427          * gcc suffers from register pressure on the x86, sk (in %ebx)
1428          * might be destroyed here. This current version compiles correctly,
1429          * but you have been warned.
1430          */
1431         return 0;
1432
1433 csum_err:
1434         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1435         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1436         goto discard;
1437 }
1438 EXPORT_SYMBOL(tcp_v4_do_rcv);
1439
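/* Early demux: called from the IP receive path before the routing decision.
 * If the segment belongs to an established socket, attach that socket to the
 * skb and, when still valid, reuse its cached rx dst so ip_rcv_finish() can
 * skip the route lookup.
 */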
1440 void tcp_v4_early_demux(struct sk_buff *skb)
1441 {
1442         const struct iphdr *iph;
1443         const struct tcphdr *th;
1444         struct sock *sk;
1445
1446         if (skb->pkt_type != PACKET_HOST)
1447                 return;
1448
1449         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1450                 return;
1451
1452         iph = ip_hdr(skb);
1453         th = tcp_hdr(skb);
1454
1455         if (th->doff < sizeof(struct tcphdr) / 4)
1456                 return;
1457
1458         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1459                                        iph->saddr, th->source,
1460                                        iph->daddr, ntohs(th->dest),
1461                                        skb->skb_iif);
1462         if (sk) {
1463                 skb->sk = sk;
1464                 skb->destructor = sock_edemux;
1465                 if (sk_fullsock(sk)) {
1466                         struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1467
1468                         if (dst)
1469                                 dst = dst_check(dst, 0);
1470                         if (dst &&
1471                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1472                                 skb_dst_set_noref(skb, dst);
1473                 }
1474         }
1475 }
1476
1477 /* Packet is added to VJ-style prequeue for processing in process
1478  * context, if a reader task is waiting. Apparently, this exciting
1479  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1480  * failed somewhere. Latency? Burstiness? Well, at least now we will
1481  * see, why it failed. 8)8)                               --ANK
1482  * see why it failed. 8)8)                               --ANK
1483  */
1484 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1485 {
1486         struct tcp_sock *tp = tcp_sk(sk);
1487
1488         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1489                 return false;
1490
1491         if (skb->len <= tcp_hdrlen(skb) &&
1492             skb_queue_len(&tp->ucopy.prequeue) == 0)
1493                 return false;
1494
1495         /* Before escaping RCU protected region, we need to take care of skb
1496          * dst. Prequeue is only enabled for established sockets.
1497          * For such sockets, we might need the skb dst only to set sk->sk_rx_dst.
1498          * Instead of doing a full sk_rx_dst validity check here, let's perform
1499          * an optimistic check.
1500          */
1501         if (likely(sk->sk_rx_dst))
1502                 skb_dst_drop(skb);
1503         else
1504                 skb_dst_force_safe(skb);
1505
1506         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1507         tp->ucopy.memory += skb->truesize;
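        /* If the prequeue has grown beyond the receive buffer, drain it
         * through the normal receive path right away (each drained segment
         * is counted as TCPPrequeueDropped) rather than waiting for the
         * reader task.
         */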
1508         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1509                 struct sk_buff *skb1;
1510
1511                 BUG_ON(sock_owned_by_user(sk));
1512
1513                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1514                         sk_backlog_rcv(sk, skb1);
1515                         NET_INC_STATS_BH(sock_net(sk),
1516                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1517                 }
1518
1519                 tp->ucopy.memory = 0;
1520         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1521                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1522                                            POLLIN | POLLRDNORM | POLLRDBAND);
1523                 if (!inet_csk_ack_scheduled(sk))
1524                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1525                                                   (3 * tcp_rto_min(sk)) / 4,
1526                                                   TCP_RTO_MAX);
1527         }
1528         return true;
1529 }
1530 EXPORT_SYMBOL(tcp_prequeue);
1531
1532 /*
1533  *      From tcp_input.c
1534  */
1535
1536 int tcp_v4_rcv(struct sk_buff *skb)
1537 {
1538         const struct iphdr *iph;
1539         const struct tcphdr *th;
1540         struct sock *sk;
1541         int ret;
1542         struct net *net = dev_net(skb->dev);
1543
1544         if (skb->pkt_type != PACKET_HOST)
1545                 goto discard_it;
1546
1547         /* Count it even if it's bad */
1548         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1549
1550         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1551                 goto discard_it;
1552
1553         th = tcp_hdr(skb);
1554
1555         if (th->doff < sizeof(struct tcphdr) / 4)
1556                 goto bad_packet;
1557         if (!pskb_may_pull(skb, th->doff * 4))
1558                 goto discard_it;
1559
1560         /* An explanation is required here, I think.
1561          * Packet length and doff are validated by header prediction,
1562          * provided the case of th->doff==0 is eliminated.
1563          * So, we defer the checks. */
1564
1565         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1566                 goto csum_error;
1567
1568         th = tcp_hdr(skb);
1569         iph = ip_hdr(skb);
1570         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1571          * barrier() makes sure the compiler won't play fool^Waliasing games.
1572          */
1573         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1574                 sizeof(struct inet_skb_parm));
1575         barrier();
1576
1577         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1578         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1579                                     skb->len - th->doff * 4);
1580         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1581         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1582         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1583         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1584         TCP_SKB_CB(skb)->sacked  = 0;
1585
1586 lookup:
1587         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1588         if (!sk)
1589                 goto no_tcp_socket;
1590
1591 process:
1592         if (sk->sk_state == TCP_TIME_WAIT)
1593                 goto do_time_wait;
1594
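        /* The lookup may return a request socket (TCP_NEW_SYN_RECV).  Switch
         * to its listener, let tcp_check_req() validate the completing ACK
         * and create the full child socket, then feed this segment to the
         * child via tcp_child_process().
         */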
1595         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1596                 struct request_sock *req = inet_reqsk(sk);
1597                 struct sock *nsk;
1598
1599                 sk = req->rsk_listener;
1600                 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1601                         reqsk_put(req);
1602                         goto discard_it;
1603                 }
1604                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1605                         inet_csk_reqsk_queue_drop_and_put(sk, req);
1606                         goto lookup;
1607                 }
1608                 sock_hold(sk);
1609                 nsk = tcp_check_req(sk, skb, req, false);
1610                 if (!nsk) {
1611                         reqsk_put(req);
1612                         goto discard_and_relse;
1613                 }
1614                 if (nsk == sk) {
1615                         reqsk_put(req);
1616                 } else if (tcp_child_process(sk, nsk, skb)) {
1617                         tcp_v4_send_reset(nsk, skb);
1618                         goto discard_and_relse;
1619                 } else {
1620                         sock_put(sk);
1621                         return 0;
1622                 }
1623         }
1624         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1625                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1626                 goto discard_and_relse;
1627         }
1628
1629         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1630                 goto discard_and_relse;
1631
1632         if (tcp_v4_inbound_md5_hash(sk, skb))
1633                 goto discard_and_relse;
1634
1635         nf_reset(skb);
1636
1637         if (sk_filter(sk, skb))
1638                 goto discard_and_relse;
1639
1640         skb->dev = NULL;
1641
1642         if (sk->sk_state == TCP_LISTEN) {
1643                 ret = tcp_v4_do_rcv(sk, skb);
1644                 goto put_and_return;
1645         }
1646
1647         sk_incoming_cpu_update(sk);
1648
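        /* Three delivery paths from here: if no user context owns the socket,
         * the segment is either queued on the prequeue or processed directly;
         * otherwise it goes onto the socket backlog, bounded by
         * rcvbuf + sndbuf.
         */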
1649         bh_lock_sock_nested(sk);
1650         tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
1651         ret = 0;
1652         if (!sock_owned_by_user(sk)) {
1653                 if (!tcp_prequeue(sk, skb))
1654                         ret = tcp_v4_do_rcv(sk, skb);
1655         } else if (unlikely(sk_add_backlog(sk, skb,
1656                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
1657                 bh_unlock_sock(sk);
1658                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1659                 goto discard_and_relse;
1660         }
1661         bh_unlock_sock(sk);
1662
1663 put_and_return:
1664         sock_put(sk);
1665
1666         return ret;
1667
1668 no_tcp_socket:
1669         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1670                 goto discard_it;
1671
1672         if (tcp_checksum_complete(skb)) {
1673 csum_error:
1674                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
1675 bad_packet:
1676                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1677         } else {
1678                 tcp_v4_send_reset(NULL, skb);
1679         }
1680
1681 discard_it:
1682         /* Discard frame. */
1683         kfree_skb(skb);
1684         return 0;
1685
1686 discard_and_relse:
1687         sock_put(sk);
1688         goto discard_it;
1689
1690 do_time_wait:
1691         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1692                 inet_twsk_put(inet_twsk(sk));
1693                 goto discard_it;
1694         }
1695
1696         if (tcp_checksum_complete(skb)) {
1697                 inet_twsk_put(inet_twsk(sk));
1698                 goto csum_error;
1699         }
1700         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1701         case TCP_TW_SYN: {
1702                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1703                                                         &tcp_hashinfo,
1704                                                         iph->saddr, th->source,
1705                                                         iph->daddr, th->dest,
1706                                                         inet_iif(skb));
1707                 if (sk2) {
1708                         inet_twsk_deschedule_put(inet_twsk(sk));
1709                         sk = sk2;
1710                         goto process;
1711                 }
1712                 /* Fall through to ACK */
1713         }
1714         case TCP_TW_ACK:
1715                 tcp_v4_timewait_ack(sk, skb);
1716                 break;
1717         case TCP_TW_RST:
1718                 goto no_tcp_socket;
1719         case TCP_TW_SUCCESS:;
1720         }
1721         goto discard_it;
1722 }
1723
1724 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1725         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1726         .twsk_unique    = tcp_twsk_unique,
1727         .twsk_destructor= tcp_twsk_destructor,
1728 };
1729
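/* Cache the input route on the socket for later use by early demux, but only
 * if a reference on the dst can still be taken safely.
 */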
1730 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1731 {
1732         struct dst_entry *dst = skb_dst(skb);
1733
1734         if (dst && dst_hold_safe(dst)) {
1735                 sk->sk_rx_dst = dst;
1736                 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1737         }
1738 }
1739 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1740
1741 const struct inet_connection_sock_af_ops ipv4_specific = {
1742         .queue_xmit        = ip_queue_xmit,
1743         .send_check        = tcp_v4_send_check,
1744         .rebuild_header    = inet_sk_rebuild_header,
1745         .sk_rx_dst_set     = inet_sk_rx_dst_set,
1746         .conn_request      = tcp_v4_conn_request,
1747         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1748         .net_header_len    = sizeof(struct iphdr),
1749         .setsockopt        = ip_setsockopt,
1750         .getsockopt        = ip_getsockopt,
1751         .addr2sockaddr     = inet_csk_addr2sockaddr,
1752         .sockaddr_len      = sizeof(struct sockaddr_in),
1753         .bind_conflict     = inet_csk_bind_conflict,
1754 #ifdef CONFIG_COMPAT
1755         .compat_setsockopt = compat_ip_setsockopt,
1756         .compat_getsockopt = compat_ip_getsockopt,
1757 #endif
1758         .mtu_reduced       = tcp_v4_mtu_reduced,
1759 };
1760 EXPORT_SYMBOL(ipv4_specific);
1761
1762 #ifdef CONFIG_TCP_MD5SIG
1763 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1764         .md5_lookup             = tcp_v4_md5_lookup,
1765         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1766         .md5_parse              = tcp_v4_parse_md5_keys,
1767 };
1768 #endif
1769
1770 /* NOTE: A lot of things are set to zero explicitly by the call to
1771  *       sk_alloc(), so they need not be done here.
1772  */
1773 static int tcp_v4_init_sock(struct sock *sk)
1774 {
1775         struct inet_connection_sock *icsk = inet_csk(sk);
1776
1777         tcp_init_sock(sk);
1778
1779         icsk->icsk_af_ops = &ipv4_specific;
1780
1781 #ifdef CONFIG_TCP_MD5SIG
1782         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1783 #endif
1784
1785         return 0;
1786 }
1787
1788 void tcp_v4_destroy_sock(struct sock *sk)
1789 {
1790         struct tcp_sock *tp = tcp_sk(sk);
1791
1792         tcp_clear_xmit_timers(sk);
1793
1794         tcp_cleanup_congestion_control(sk);
1795
1796         /* Clean up the write buffer. */
1797         tcp_write_queue_purge(sk);
1798
1799         /* Cleans up our, hopefully empty, out_of_order_queue. */
1800         __skb_queue_purge(&tp->out_of_order_queue);
1801
1802 #ifdef CONFIG_TCP_MD5SIG
1803         /* Clean up the MD5 key list, if any */
1804         if (tp->md5sig_info) {
1805                 tcp_clear_md5_list(sk);
1806                 kfree_rcu(tp->md5sig_info, rcu);
1807                 tp->md5sig_info = NULL;
1808         }
1809 #endif
1810
1811         /* Clean the prequeue; it really must be empty. */
1812         __skb_queue_purge(&tp->ucopy.prequeue);
1813
1814         /* Clean up a referenced TCP bind bucket. */
1815         if (inet_csk(sk)->icsk_bind_hash)
1816                 inet_put_port(sk);
1817
1818         BUG_ON(tp->fastopen_rsk);
1819
1820         /* If the socket was aborted during the connect operation */
1821         tcp_free_fastopen_req(tp);
1822         tcp_saved_syn_free(tp);
1823
1824         sk_sockets_allocated_dec(sk);
1825         sock_release_memcg(sk);
1826 }
1827 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1828
1829 #ifdef CONFIG_PROC_FS
1830 /* Proc filesystem TCP sock list dumping. */
1831
1832 /*
1833  * Get the next listener socket following cur.  If cur is NULL, get the first
1834  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
1835  * very first socket in the hash table is returned.
1836  */
1837 static void *listening_get_next(struct seq_file *seq, void *cur)
1838 {
1839         struct inet_connection_sock *icsk;
1840         struct hlist_nulls_node *node;
1841         struct sock *sk = cur;
1842         struct inet_listen_hashbucket *ilb;
1843         struct tcp_iter_state *st = seq->private;
1844         struct net *net = seq_file_net(seq);
1845
1846         if (!sk) {
1847                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1848                 spin_lock_bh(&ilb->lock);
1849                 sk = sk_nulls_head(&ilb->head);
1850                 st->offset = 0;
1851                 goto get_sk;
1852         }
1853         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1854         ++st->num;
1855         ++st->offset;
1856
1857         sk = sk_nulls_next(sk);
1858 get_sk:
1859         sk_nulls_for_each_from(sk, node) {
1860                 if (!net_eq(sock_net(sk), net))
1861                         continue;
1862                 if (sk->sk_family == st->family) {
1863                         cur = sk;
1864                         goto out;
1865                 }
1866                 icsk = inet_csk(sk);
1867         }
1868         spin_unlock_bh(&ilb->lock);
1869         st->offset = 0;
1870         if (++st->bucket < INET_LHTABLE_SIZE) {
1871                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1872                 spin_lock_bh(&ilb->lock);
1873                 sk = sk_nulls_head(&ilb->head);
1874                 goto get_sk;
1875         }
1876         cur = NULL;
1877 out:
1878         return cur;
1879 }
1880
1881 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1882 {
1883         struct tcp_iter_state *st = seq->private;
1884         void *rc;
1885
1886         st->bucket = 0;
1887         st->offset = 0;
1888         rc = listening_get_next(seq, NULL);
1889
1890         while (rc && *pos) {
1891                 rc = listening_get_next(seq, rc);
1892                 --*pos;
1893         }
1894         return rc;
1895 }
1896
1897 static inline bool empty_bucket(const struct tcp_iter_state *st)
1898 {
1899         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
1900 }
1901
1902 /*
1903  * Get the first established socket, starting from the bucket given in st->bucket.
1904  * If st->bucket is zero, the very first socket in the hash is returned.
1905  */
1906 static void *established_get_first(struct seq_file *seq)
1907 {
1908         struct tcp_iter_state *st = seq->private;
1909         struct net *net = seq_file_net(seq);
1910         void *rc = NULL;
1911
1912         st->offset = 0;
1913         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1914                 struct sock *sk;
1915                 struct hlist_nulls_node *node;
1916                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1917
1918                 /* Lockless fast path for the common case of empty buckets */
1919                 if (empty_bucket(st))
1920                         continue;
1921
1922                 spin_lock_bh(lock);
1923                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1924                         if (sk->sk_family != st->family ||
1925                             !net_eq(sock_net(sk), net)) {
1926                                 continue;
1927                         }
1928                         rc = sk;
1929                         goto out;
1930                 }
1931                 spin_unlock_bh(lock);
1932         }
1933 out:
1934         return rc;
1935 }
1936
1937 static void *established_get_next(struct seq_file *seq, void *cur)
1938 {
1939         struct sock *sk = cur;
1940         struct hlist_nulls_node *node;
1941         struct tcp_iter_state *st = seq->private;
1942         struct net *net = seq_file_net(seq);
1943
1944         ++st->num;
1945         ++st->offset;
1946
1947         sk = sk_nulls_next(sk);
1948
1949         sk_nulls_for_each_from(sk, node) {
1950                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1951                         return sk;
1952         }
1953
1954         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1955         ++st->bucket;
1956         return established_get_first(seq);
1957 }
1958
1959 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1960 {
1961         struct tcp_iter_state *st = seq->private;
1962         void *rc;
1963
1964         st->bucket = 0;
1965         rc = established_get_first(seq);
1966
1967         while (rc && pos) {
1968                 rc = established_get_next(seq, rc);
1969                 --pos;
1970         }
1971         return rc;
1972 }
1973
1974 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1975 {
1976         void *rc;
1977         struct tcp_iter_state *st = seq->private;
1978
1979         st->state = TCP_SEQ_STATE_LISTENING;
1980         rc        = listening_get_idx(seq, &pos);
1981
1982         if (!rc) {
1983                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1984                 rc        = established_get_idx(seq, pos);
1985         }
1986
1987         return rc;
1988 }
1989
1990 static void *tcp_seek_last_pos(struct seq_file *seq)
1991 {
1992         struct tcp_iter_state *st = seq->private;
1993         int offset = st->offset;
1994         int orig_num = st->num;
1995         void *rc = NULL;
1996
1997         switch (st->state) {
1998         case TCP_SEQ_STATE_LISTENING:
1999                 if (st->bucket >= INET_LHTABLE_SIZE)
2000                         break;
2001                 st->state = TCP_SEQ_STATE_LISTENING;
2002                 rc = listening_get_next(seq, NULL);
2003                 while (offset-- && rc)
2004                         rc = listening_get_next(seq, rc);
2005                 if (rc)
2006                         break;
2007                 st->bucket = 0;
2008                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2009                 /* Fallthrough */
2010         case TCP_SEQ_STATE_ESTABLISHED:
2011                 if (st->bucket > tcp_hashinfo.ehash_mask)
2012                         break;
2013                 rc = established_get_first(seq);
2014                 while (offset-- && rc)
2015                         rc = established_get_next(seq, rc);
2016         }
2017
2018         st->num = orig_num;
2019
2020         return rc;
2021 }
2022
2023 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2024 {
2025         struct tcp_iter_state *st = seq->private;
2026         void *rc;
2027
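        /* If the requested position is exactly where the previous read
         * stopped, resume from the saved bucket/offset instead of walking
         * the hash tables from the beginning again.
         */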
2028         if (*pos && *pos == st->last_pos) {
2029                 rc = tcp_seek_last_pos(seq);
2030                 if (rc)
2031                         goto out;
2032         }
2033
2034         st->state = TCP_SEQ_STATE_LISTENING;
2035         st->num = 0;
2036         st->bucket = 0;
2037         st->offset = 0;
2038         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2039
2040 out:
2041         st->last_pos = *pos;
2042         return rc;
2043 }
2044
2045 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2046 {
2047         struct tcp_iter_state *st = seq->private;
2048         void *rc = NULL;
2049
2050         if (v == SEQ_START_TOKEN) {
2051                 rc = tcp_get_idx(seq, 0);
2052                 goto out;
2053         }
2054
2055         switch (st->state) {
2056         case TCP_SEQ_STATE_LISTENING:
2057                 rc = listening_get_next(seq, v);
2058                 if (!rc) {
2059                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2060                         st->bucket = 0;
2061                         st->offset = 0;
2062                         rc        = established_get_first(seq);
2063                 }
2064                 break;
2065         case TCP_SEQ_STATE_ESTABLISHED:
2066                 rc = established_get_next(seq, v);
2067                 break;
2068         }
2069 out:
2070         ++*pos;
2071         st->last_pos = *pos;
2072         return rc;
2073 }
2074
2075 static void tcp_seq_stop(struct seq_file *seq, void *v)
2076 {
2077         struct tcp_iter_state *st = seq->private;
2078
2079         switch (st->state) {
2080         case TCP_SEQ_STATE_LISTENING:
2081                 if (v != SEQ_START_TOKEN)
2082                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2083                 break;
2084         case TCP_SEQ_STATE_ESTABLISHED:
2085                 if (v)
2086                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2087                 break;
2088         }
2089 }
2090
2091 int tcp_seq_open(struct inode *inode, struct file *file)
2092 {
2093         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2094         struct tcp_iter_state *s;
2095         int err;
2096
2097         err = seq_open_net(inode, file, &afinfo->seq_ops,
2098                           sizeof(struct tcp_iter_state));
2099         if (err < 0)
2100                 return err;
2101
2102         s = ((struct seq_file *)file->private_data)->private;
2103         s->family               = afinfo->family;
2104         s->last_pos             = 0;
2105         return 0;
2106 }
2107 EXPORT_SYMBOL(tcp_seq_open);
2108
2109 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2110 {
2111         int rc = 0;
2112         struct proc_dir_entry *p;
2113
2114         afinfo->seq_ops.start           = tcp_seq_start;
2115         afinfo->seq_ops.next            = tcp_seq_next;
2116         afinfo->seq_ops.stop            = tcp_seq_stop;
2117
2118         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2119                              afinfo->seq_fops, afinfo);
2120         if (!p)
2121                 rc = -ENOMEM;
2122         return rc;
2123 }
2124 EXPORT_SYMBOL(tcp_proc_register);
2125
2126 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2127 {
2128         remove_proc_entry(afinfo->name, net->proc_net);
2129 }
2130 EXPORT_SYMBOL(tcp_proc_unregister);
2131
2132 static void get_openreq4(const struct request_sock *req,
2133                          struct seq_file *f, int i)
2134 {
2135         const struct inet_request_sock *ireq = inet_rsk(req);
2136         long delta = req->rsk_timer.expires - jiffies;
2137
2138         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2139                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2140                 i,
2141                 ireq->ir_loc_addr,
2142                 ireq->ir_num,
2143                 ireq->ir_rmt_addr,
2144                 ntohs(ireq->ir_rmt_port),
2145                 TCP_SYN_RECV,
2146                 0, 0, /* could print option size, but that is af dependent. */
2147                 1,    /* timers active (only the expire timer) */
2148                 jiffies_delta_to_clock_t(delta),
2149                 req->num_timeout,
2150                 from_kuid_munged(seq_user_ns(f),
2151                                  sock_i_uid(req->rsk_listener)),
2152                 0,  /* non standard timer */
2153                 0, /* open_requests have no inode */
2154                 0,
2155                 req);
2156 }
2157
2158 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2159 {
2160         int timer_active;
2161         unsigned long timer_expires;
2162         const struct tcp_sock *tp = tcp_sk(sk);
2163         const struct inet_connection_sock *icsk = inet_csk(sk);
2164         const struct inet_sock *inet = inet_sk(sk);
2165         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2166         __be32 dest = inet->inet_daddr;
2167         __be32 src = inet->inet_rcv_saddr;
2168         __u16 destp = ntohs(inet->inet_dport);
2169         __u16 srcp = ntohs(inet->inet_sport);
2170         int rx_queue;
2171         int state;
2172
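        /* timer_active feeds the "tr" column of /proc/net/tcp: 1 for the
         * retransmit/loss-probe timers, 4 for the zero window probe timer,
         * 2 for sk_timer (keepalive), 0 when nothing is pending; TIME_WAIT
         * sockets report 3 from get_timewait4_sock().
         */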
2173         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2174             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2175             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2176                 timer_active    = 1;
2177                 timer_expires   = icsk->icsk_timeout;
2178         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2179                 timer_active    = 4;
2180                 timer_expires   = icsk->icsk_timeout;
2181         } else if (timer_pending(&sk->sk_timer)) {
2182                 timer_active    = 2;
2183                 timer_expires   = sk->sk_timer.expires;
2184         } else {
2185                 timer_active    = 0;
2186                 timer_expires = jiffies;
2187         }
2188
2189         state = sk_state_load(sk);
2190         if (state == TCP_LISTEN)
2191                 rx_queue = sk->sk_ack_backlog;
2192         else
2193                 /* Because we don't lock the socket,
2194                  * we might find a transient negative value.
2195                  */
2196                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2197
2198         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2199                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2200                 i, src, srcp, dest, destp, state,
2201                 tp->write_seq - tp->snd_una,
2202                 rx_queue,
2203                 timer_active,
2204                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2205                 icsk->icsk_retransmits,
2206                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2207                 icsk->icsk_probes_out,
2208                 sock_i_ino(sk),
2209                 atomic_read(&sk->sk_refcnt), sk,
2210                 jiffies_to_clock_t(icsk->icsk_rto),
2211                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2212                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2213                 tp->snd_cwnd,
2214                 state == TCP_LISTEN ?
2215                     fastopenq->max_qlen :
2216                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2217 }
2218
2219 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2220                                struct seq_file *f, int i)
2221 {
2222         long delta = tw->tw_timer.expires - jiffies;
2223         __be32 dest, src;
2224         __u16 destp, srcp;
2225
2226         dest  = tw->tw_daddr;
2227         src   = tw->tw_rcv_saddr;
2228         destp = ntohs(tw->tw_dport);
2229         srcp  = ntohs(tw->tw_sport);
2230
2231         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2232                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2233                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2234                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2235                 atomic_read(&tw->tw_refcnt), tw);
2236 }
2237
2238 #define TMPSZ 150
2239
2240 static int tcp4_seq_show(struct seq_file *seq, void *v)
2241 {
2242         struct tcp_iter_state *st;
2243         struct sock *sk = v;
2244
2245         seq_setwidth(seq, TMPSZ - 1);
2246         if (v == SEQ_START_TOKEN) {
2247                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2248                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2249                            "inode");
2250                 goto out;
2251         }
2252         st = seq->private;
2253
2254         if (sk->sk_state == TCP_TIME_WAIT)
2255                 get_timewait4_sock(v, seq, st->num);
2256         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2257                 get_openreq4(v, seq, st->num);
2258         else
2259                 get_tcp4_sock(v, seq, st->num);
2260 out:
2261         seq_pad(seq, '\n');
2262         return 0;
2263 }
2264
2265 static const struct file_operations tcp_afinfo_seq_fops = {
2266         .owner   = THIS_MODULE,
2267         .open    = tcp_seq_open,
2268         .read    = seq_read,
2269         .llseek  = seq_lseek,
2270         .release = seq_release_net
2271 };
2272
2273 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2274         .name           = "tcp",
2275         .family         = AF_INET,
2276         .seq_fops       = &tcp_afinfo_seq_fops,
2277         .seq_ops        = {
2278                 .show           = tcp4_seq_show,
2279         },
2280 };
2281
2282 static int __net_init tcp4_proc_init_net(struct net *net)
2283 {
2284         return tcp_proc_register(net, &tcp4_seq_afinfo);
2285 }
2286
2287 static void __net_exit tcp4_proc_exit_net(struct net *net)
2288 {
2289         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2290 }
2291
2292 static struct pernet_operations tcp4_net_ops = {
2293         .init = tcp4_proc_init_net,
2294         .exit = tcp4_proc_exit_net,
2295 };
2296
2297 int __init tcp4_proc_init(void)
2298 {
2299         return register_pernet_subsys(&tcp4_net_ops);
2300 }
2301
2302 void tcp4_proc_exit(void)
2303 {
2304         unregister_pernet_subsys(&tcp4_net_ops);
2305 }
2306 #endif /* CONFIG_PROC_FS */
2307
2308 struct proto tcp_prot = {
2309         .name                   = "TCP",
2310         .owner                  = THIS_MODULE,
2311         .close                  = tcp_close,
2312         .connect                = tcp_v4_connect,
2313         .disconnect             = tcp_disconnect,
2314         .accept                 = inet_csk_accept,
2315         .ioctl                  = tcp_ioctl,
2316         .init                   = tcp_v4_init_sock,
2317         .destroy                = tcp_v4_destroy_sock,
2318         .shutdown               = tcp_shutdown,
2319         .setsockopt             = tcp_setsockopt,
2320         .getsockopt             = tcp_getsockopt,
2321         .recvmsg                = tcp_recvmsg,
2322         .sendmsg                = tcp_sendmsg,
2323         .sendpage               = tcp_sendpage,
2324         .backlog_rcv            = tcp_v4_do_rcv,
2325         .release_cb             = tcp_release_cb,
2326         .hash                   = inet_hash,
2327         .unhash                 = inet_unhash,
2328         .get_port               = inet_csk_get_port,
2329         .enter_memory_pressure  = tcp_enter_memory_pressure,
2330         .stream_memory_free     = tcp_stream_memory_free,
2331         .sockets_allocated      = &tcp_sockets_allocated,
2332         .orphan_count           = &tcp_orphan_count,
2333         .memory_allocated       = &tcp_memory_allocated,
2334         .memory_pressure        = &tcp_memory_pressure,
2335         .sysctl_mem             = sysctl_tcp_mem,
2336         .sysctl_wmem            = sysctl_tcp_wmem,
2337         .sysctl_rmem            = sysctl_tcp_rmem,
2338         .max_header             = MAX_TCP_HEADER,
2339         .obj_size               = sizeof(struct tcp_sock),
2340         .slab_flags             = SLAB_DESTROY_BY_RCU,
2341         .twsk_prot              = &tcp_timewait_sock_ops,
2342         .rsk_prot               = &tcp_request_sock_ops,
2343         .h.hashinfo             = &tcp_hashinfo,
2344         .no_autobind            = true,
2345 #ifdef CONFIG_COMPAT
2346         .compat_setsockopt      = compat_tcp_setsockopt,
2347         .compat_getsockopt      = compat_tcp_getsockopt,
2348 #endif
2349 #ifdef CONFIG_MEMCG_KMEM
2350         .init_cgroup            = tcp_init_cgroup,
2351         .destroy_cgroup         = tcp_destroy_cgroup,
2352         .proto_cgroup           = tcp_proto_cgroup,
2353 #endif
2354 };
2355 EXPORT_SYMBOL(tcp_prot);
2356
2357 static void __net_exit tcp_sk_exit(struct net *net)
2358 {
2359         int cpu;
2360
2361         for_each_possible_cpu(cpu)
2362                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2363         free_percpu(net->ipv4.tcp_sk);
2364 }
2365
2366 static int __net_init tcp_sk_init(struct net *net)
2367 {
2368         int res, cpu;
2369
2370         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2371         if (!net->ipv4.tcp_sk)
2372                 return -ENOMEM;
2373
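        /* One control socket per possible CPU; these are used to send RSTs
         * and ACKs on behalf of sockets we do not own, e.g. from
         * tcp_v4_send_reset() and tcp_v4_send_ack().
         */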
2374         for_each_possible_cpu(cpu) {
2375                 struct sock *sk;
2376
2377                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2378                                            IPPROTO_TCP, net);
2379                 if (res)
2380                         goto fail;
2381                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2382         }
2383
2384         net->ipv4.sysctl_tcp_ecn = 2;
2385         net->ipv4.sysctl_tcp_ecn_fallback = 1;
2386
2387         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2388         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2389         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2390
2391         return 0;
2392 fail:
2393         tcp_sk_exit(net);
2394
2395         return res;
2396 }
2397
2398 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2399 {
2400         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2401 }
2402
2403 static struct pernet_operations __net_initdata tcp_sk_ops = {
2404        .init       = tcp_sk_init,
2405        .exit       = tcp_sk_exit,
2406        .exit_batch = tcp_sk_exit_batch,
2407 };
2408
2409 void __init tcp_v4_init(void)
2410 {
2411         inet_hashinfo_init(&tcp_hashinfo);
2412         if (register_pernet_subsys(&tcp_sk_ops))
2413                 panic("Failed to create the TCP control socket.\n");
2414 }