2 // Copyright (c) 2017 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
19 #include <immintrin.h>
21 #include "rte_ct_tcp.h"
22 #include "rte_cnxn_tracking.h"
/*
 * Debug switches. STATE_TRACKING, when non-zero, enables the printf calls
 * that report TCP state changes in rte_ct_verify_tcp_packet.
 * RTE_CT_ASSERT presumably enables internal assertions - TODO confirm.
 */
24 /* uint32_t CT_DEBUG = 1; */ /* Can be used to conditionally turn off debug */
26 #define STATE_TRACKING 0
27 #define RTE_CT_ASSERT 0
29 /* constants for mbuf manipulation: byte offsets passed to the RTE_MBUF_METADATA_* accessors */
30 #define META_DATA_OFFSET 128
31 #define RTE_PKTMBUF_HEADROOM 128 /* NOTE(review): DPDK normally provides this macro from its build configuration; confirm this local value matches it */
/* offset of the Ethernet header, and of the IP header that follows it */
32 #define ETHERNET_START (META_DATA_OFFSET + RTE_PKTMBUF_HEADROOM)
33 #define ETH_HDR_SIZE 14
34 #define IP_START (ETHERNET_START + ETH_HDR_SIZE)
/* IP header lengths that rte_ct_seq_plus_length compares ip_hdr_size against */
36 #define IPv4_HEADER_SIZE 20
37 #define IPv6_HEADER_SIZE 40
/* IP version numbers */
39 #define IP_VERSION_4 4
40 #define IP_VERSION_6 6
/*
 * Wrap-safe ordering of 32-bit TCP sequence numbers.
 *
 * rte_after(a, b) is rte_before(b, a): true when a follows b.
 */
#define rte_after(seq2, seq1) rte_before(seq1, seq2)

/*
 * Return 1 when seq1 precedes seq2, 0 otherwise (equal values are not
 * "before" each other).
 *
 * The unsigned difference is reinterpreted as a signed 32-bit value, so the
 * comparison stays correct when the sequence space wraps past 2^32, provided
 * the two values are less than 2^31 apart.
 */
static inline uint8_t rte_before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}
48 /* Short two-letter aliases for the TCP connection states, used to keep the rows of rte_ct_tcp_state_table readable */
50 #define ctNO RTE_CT_TCP_NONE
51 #define ctSS RTE_CT_TCP_SYN_SENT
52 #define ctSR RTE_CT_TCP_SYN_RECV
53 #define ctES RTE_CT_TCP_ESTABLISHED
54 #define ctFW RTE_CT_TCP_FIN_WAIT
55 #define ctCW RTE_CT_TCP_CLOSE_WAIT
56 #define ctLA RTE_CT_TCP_LAST_ACK
57 #define ctTW RTE_CT_TCP_TIME_WAIT
58 #define ctCL RTE_CT_TCP_CLOSE
59 #define ctS2 RTE_CT_TCP_SYN_SENT_2
/* ctIV ("invalid"): a table entry equal to RTE_CT_TCP_MAX is rejected by the code that reads the table */
60 #define ctIV RTE_CT_TCP_MAX
/* ctIG ("ignore"): handled by the RTE_CT_TCP_IGNORE case of rte_ct_verify_tcp_packet */
61 #define ctIG RTE_CT_TCP_IGNORE
/*
 * TCP state-transition table, read as
 *   rte_ct_tcp_state_table[direction][flag index][current state] -> next state
 * The flag index is the value returned by rte_ct_get_index() and selects one
 * of the six rows labelled syn, synack, fin, ack, rst and ill below.
 * A ctIV entry (RTE_CT_TCP_MAX) is treated by the readers of this table as an
 * invalid transition; a ctIG entry (RTE_CT_TCP_IGNORE) is handled by the
 * IGNORE case of rte_ct_verify_tcp_packet.
 */
63 static const uint8_t rte_ct_tcp_state_table[2][6][RTE_CT_TCP_MAX] = {
64 { /* "client" direction, i.e. first SYN sent */
65 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
66 /* syn */ {ctSS, ctSS, ctIG, ctIG, ctIG, ctIG, ctIG, ctSS, ctSS,
69 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
70 /* synack */ {ctIV, ctIV, ctSR, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV,
73 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
74 /* fin */ {ctIV, ctIV, ctFW, ctFW, ctLA, ctLA, ctLA, ctTW, ctCL,
76 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
77 /* ack */ {ctES, ctIV, ctES, ctES, ctCW, ctCW, ctTW, ctTW, ctCL,
80 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
81 /* rst */ {ctIV, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL,
83 /* ill */ {ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV}
86 { /* "server" direction */
87 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
88 /* syn */ {ctIV, ctS2, ctIV, ctIV, ctIV, ctIV, ctIV, ctSS, ctIV,
91 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
92 /* synack */ {ctIV, ctSR, ctIG, ctIG, ctIG, ctIG, ctIG, ctIG, ctIG,
95 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
96 /* fin */ {ctIV, ctIV, ctFW, ctFW, ctLA, ctLA, ctLA, ctTW, ctCL,
99 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
100 /* ack */ {ctIV, ctIG, ctSR, ctES, ctCW, ctCW, ctTW, ctTW, ctCL,
103 /* ctNO, ctSS, ctSR, ctES, ctFW, ctCW, ctLA, ctTW, ctCL, ctS2 */
104 /* rst */ {ctIV, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL, ctCL,
106 /* ill */ {ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV, ctIV}
110 /* Classification of a TCP header by which of its RST/SYN/FIN/ACK bits are set.
 * These values select the row (middle index) of rte_ct_tcp_state_table. */
113 RTE_CT_TCP_SAK_FLAG, /* SYN and ACK both set */
/*
 * Lookup from the 4-bit value built in rte_ct_get_index() to a flag class.
 * The bit pattern shown beside each entry is, from left to right,
 * ACK RST SYN FIN (compare the single-bit entries 1000, 0100, 0010, 0001).
 * Combinations that make no sense together map to RTE_CT_TCP_ILL_FLAG.
 */
120 static uint8_t rte_ct_tcp_flags_to_state_table_index[16] = {
122 RTE_CT_TCP_ILL_FLAG, /* 0 0 0 0 */
123 RTE_CT_TCP_FIN_FLAG, /* 0 0 0 1 */
124 RTE_CT_TCP_SYN_FLAG, /* 0 0 1 0 */
125 RTE_CT_TCP_ILL_FLAG, /* 0 0 1 1 */
126 RTE_CT_TCP_RST_FLAG, /* 0 1 0 0 */
127 RTE_CT_TCP_RST_FLAG, /* 0 1 0 1 */
128 RTE_CT_TCP_RST_FLAG, /* 0 1 1 0 */
129 RTE_CT_TCP_ILL_FLAG, /* 0 1 1 1 */
131 RTE_CT_TCP_ACK_FLAG, /* 1 0 0 0 */
132 RTE_CT_TCP_FIN_FLAG, /* 1 0 0 1 */
133 RTE_CT_TCP_SAK_FLAG, /* 1 0 1 0 */
134 RTE_CT_TCP_ILL_FLAG, /* 1 0 1 1 */
135 RTE_CT_TCP_RST_FLAG, /* 1 1 0 0 */
136 RTE_CT_TCP_ILL_FLAG, /* 1 1 0 1 */
137 RTE_CT_TCP_RST_FLAG, /* 1 1 1 0 */
138 RTE_CT_TCP_ILL_FLAG, /* 1 1 1 1 */
/*
 * Map the flags byte of a TCP header to a row index of rte_ct_tcp_state_table.
 *
 * tcp_flags: raw flags byte; only its low six bits are examined.
 * Returns RTE_CT_TCP_ILL_FLAG when none or all of those six bits are set,
 * otherwise the entry of rte_ct_tcp_flags_to_state_table_index selected by
 * the 4-bit value formed from bit 4 (moved down to bit 3) and bits 0-2.
 */
141 static inline uint8_t
142 rte_ct_get_index(uint8_t tcp_flags)
144 uint8_t important_flags;
146 tcp_flags &= 0x3f; /* clear off optional flags (keep the low six bits) */
147 important_flags = ((tcp_flags & 0x10) >> 1) | (tcp_flags & 7);
148 /* same result as _pext_u32(tcp_flags, 0x17): gather bits 0-2 and bit 4 */
150 if (unlikely((tcp_flags == 0) || (tcp_flags == 0x3f)))
151 /* these are known as null and christmas tree packets respectively */
152 return RTE_CT_TCP_ILL_FLAG;
154 return rte_ct_tcp_flags_to_state_table_index[important_flags];
/*
 * Report whether any of the given state flags is set for either direction
 * of a connection.
 *
 * cd:    connection entry; both tcp_ct_data.seen[] records are inspected.
 * flags: mask of per-direction flag bits to look for.
 * Returns 1 if seen[0].flags or seen[1].flags has a bit of the mask set,
 * 0 otherwise.
 */
159 rte_ct_either_direction_has_flags(struct rte_ct_cnxn_data *cd, uint8_t flags)
161 return ((cd->ct_protocol.tcp_ct_data.seen[0].flags | cd->
162 ct_protocol.tcp_ct_data.seen[1].flags) & flags) != 0;
/*
 * Compute the sequence number that follows the last byte of this segment:
 * the byte-swapped sent_seq plus the TCP payload length, plus one when the
 * SYN or FIN flag is set.
 *
 * pkt:         mbuf holding the packet; the IP header is read at IP_START.
 * ip_hdr_size: IP header length in bytes; also selects IPv4 or IPv6 parsing.
 *
 * The payload length is the IP-level packet length minus ip_hdr_size minus
 * the TCP header size, where the TCP header size is taken from the high
 * nibble of data_off (masking with 0xf0 and shifting right by 2 gives bytes).
 * For IPv6 the fixed header size is added to payload_len to obtain the
 * packet length.
 *
 * NOTE(review): pkt_length stays 0 when ip_hdr_size is neither
 * IPv4_HEADER_SIZE nor IPv6_HEADER_SIZE (for example an IPv4 header carrying
 * options), in which case the subtraction below wraps - confirm callers
 * never pass such a size.
 */
165 static inline uint32_t rte_ct_seq_plus_length(struct rte_mbuf *pkt,
168 uint16_t pkt_length = 0;
169 struct tcp_hdr *tcpheader =
170 (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
173 uint32_t tcp_hdr_size = (tcpheader->data_off & 0xf0) >> 2;
175 void *ip_hdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START);
177 if (ip_hdr_size == IPv4_HEADER_SIZE) {
178 struct ipv4_hdr *ihdr = (struct ipv4_hdr *)ip_hdr;
180 pkt_length = rte_bswap16(ihdr->total_length);
182 if (ip_hdr_size == IPv6_HEADER_SIZE) {
183 struct ipv6_hdr *ihdr = (struct ipv6_hdr *)ip_hdr;
185 pkt_length = rte_bswap16(ihdr->payload_len) + IPv6_HEADER_SIZE;
189 * Return sequence number plus the length of TCP segment (payload).
190 * SYN & FIN are each considered one byte, but it is illegal
191 * to have them together in one header (checked elsewhere)
195 return rte_bswap32(tcpheader->sent_seq) +
196 pkt_length - ip_hdr_size - tcp_hdr_size +
197 ((tcpheader->tcp_flags & (RTE_CT_TCPHDR_SYN | RTE_CT_TCPHDR_FIN)) !=
/*
 * Scan the TCP options of a packet and record in *state whether the sender
 * offered SACK-permitted and window scaling.
 *
 * pkt:   mbuf holding the packet.
 * state: per-direction tracking record; RTE_CT_TCP_FLAG_SACK_PERM and
 *        RTE_CT_TCP_FLAG_WINDOW_SCALE are OR-ed into state->flags when the
 *        matching option is present with the expected option size. For the
 *        window option the shift count from the packet is capped at
 *        RTE_CT_MAX_TCP_WINDOW_SCALE.
 *
 * The option area starts sizeof(struct tcp_hdr) bytes after the start of the
 * TCP header and is "length" bytes long, derived from data_off.
 *
 * NOTE(review): length is unsigned, so a data offset smaller than
 * sizeof(struct tcp_hdr) makes it wrap to a very large value; no guard is
 * visible here - confirm one exists before the option walk.
 * NOTE(review): options_ptr[1] and options_ptr[2] are read before opsize is
 * compared against the remaining length - confirm this cannot read past the
 * end of the header for a truncated final option.
 */
203 rte_ct_check_for_scaling_and_sack_perm(
204 struct rte_mbuf *pkt,
205 struct rte_ct_tcp_state *state,
209 struct tcp_hdr *tcpheader =
210 (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
213 uint32_t dataoff_in_bytes = (tcpheader->data_off & 0xf0) >> 2;
214 uint32_t length = dataoff_in_bytes - sizeof(struct tcp_hdr);
220 /* no header options */
222 uint8_t *options_ptr =
223 RTE_MBUF_METADATA_UINT8_PTR(pkt,
224 (IP_START + ip_hdr_size +
225 sizeof(struct tcp_hdr)));
228 uint8_t option = *options_ptr;
229 uint8_t opsize = options_ptr[1];
230 /* opsize reset for NOPs below */
234 case RTE_CT_TCPOPT_EOL:
238 case RTE_CT_TCPOPT_NOP:
243 case RTE_CT_TCPOPT_SACK_PERM:
244 if (opsize == RTE_CT_TCPOLEN_SACK_PERM)
245 state->flags |= RTE_CT_TCP_FLAG_SACK_PERM;
248 case RTE_CT_TCPOPT_WINDOW:
249 if (opsize == RTE_CT_TCPOLEN_WINDOW) {
251 RTE_MIN(options_ptr[2],
252 RTE_CT_MAX_TCP_WINDOW_SCALE);
253 state->flags |= RTE_CT_TCP_FLAG_WINDOW_SCALE;
262 if ((opsize < 2) || (opsize > length)) {
263 /* malformed option: size below the 2-byte minimum or beyond the remaining option bytes */
264 printf("scaling_and_sack_perm:something wrong\n");
267 options_ptr += opsize;
/*
 * Debug helper: print the fields of a TCP header on one line of stdout.
 *
 * tcpheader: header to print. Ports, sequence/ack numbers and the window are
 * byte-swapped before printing; data_off is shown divided by 16, i.e. its
 * high nibble, and tcp_flags is shown as two hex digits.
 */
274 rte_ct_tcpdisplay_hdr(struct tcp_hdr *tcpheader)
276 printf("Tcp header: src_port=%d", rte_bswap16(tcpheader->src_port));
277 printf(", dst_port=%d", rte_bswap16(tcpheader->dst_port));
278 printf(", sent_seq=%u", rte_bswap32(tcpheader->sent_seq));
279 printf(", recv_ack=%u", rte_bswap32(tcpheader->recv_ack));
280 printf(",data_off=%d", tcpheader->data_off / 16);
281 printf(",tcp_flags=%02x", tcpheader->tcp_flags);
282 printf(", rx_win=%d\n", rte_bswap16(tcpheader->rx_win));
/*
 * Reset the TCP tracking data of a connection entry.
 *
 * ct:  unused.
 * cd:  connection entry whose ct_protocol.tcp_ct_data is zeroed.
 * pkt: unused.
 *
 * After the memset, last_index is set to RTE_CT_TCP_ILL_FLAG so that it does
 * not read as whichever flag class has the value 0.
 */
287 rte_ct_clear_cnxn_data(__rte_unused struct rte_ct_cnxn_tracker *ct,
288 struct rte_ct_cnxn_data *cd,
289 __rte_unused struct rte_mbuf *pkt)
291 /* clear all tcp connection data, then set up individual fields */
293 memset(&cd->ct_protocol.tcp_ct_data, 0,
294 sizeof(cd->ct_protocol.tcp_ct_data));
295 cd->ct_protocol.tcp_ct_data.last_index = RTE_CT_TCP_ILL_FLAG;
/*
 * Set up tracking data for the first packet seen on a TCP connection and
 * return the action to take for that packet.
 *
 * ct:  tracker instance; misc_options.tcp_loose decides whether a connection
 *      may be picked up from a packet other than an initial SYN.
 * cd:  connection entry to initialise (its TCP data is cleared first).
 * pkt: mbuf holding the packet; the TCP header is read at
 *      IP_START + ip_hdr_size.
 * The function also uses use_synproxy and ip_hdr_size.
 *
 * The starting state is looked up in the client half of
 * rte_ct_tcp_state_table, in the column for RTE_CT_TCP_NONE.
 *
 * Returns visible in this function:
 *   RTE_CT_DROP_PACKET        - the flags give no valid starting state, or the
 *                               packet is not a SYN and tcp_loose is 0.
 *   RTE_CT_SEND_CLIENT_SYNACK - returned on the synproxy path.
 *   RTE_CT_OPEN_CONNECTION    - final return of the function.
 */
299 enum rte_ct_packet_action
300 rte_ct_tcp_new_connection(
301 struct rte_ct_cnxn_tracker *ct,
302 struct rte_ct_cnxn_data *cd,
303 struct rte_mbuf *pkt,
307 struct tcp_hdr *tcpheader =
308 (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
309 (IP_START + ip_hdr_size));
311 enum rte_ct_tcp_states new_state;
313 struct rte_ct_tcp_state *sender =
314 &cd->ct_protocol.tcp_ct_data.seen[RTE_CT_DIR_ORIGINAL];
315 struct rte_ct_tcp_state *receiver =
316 &cd->ct_protocol.tcp_ct_data.seen[RTE_CT_DIR_REPLY];
320 rte_ct_tcpdisplay_hdr(tcpheader);
322 index = rte_ct_get_index(tcpheader->tcp_flags);
323 new_state = rte_ct_tcp_state_table[0][index][RTE_CT_TCP_NONE];
325 if (unlikely(new_state >= RTE_CT_TCP_MAX)) {
327 printf("invalid new state with flags %02x\n",
328 tcpheader->tcp_flags);
329 return RTE_CT_DROP_PACKET;
332 * A normal connection starts with a SYN packet. However, it is possible
333 * that an onginging connection has been routed here somehow. Support
334 * for these connections is optional.
337 if (unlikely((new_state != RTE_CT_TCP_SYN_SENT
338 && ct->misc_options.tcp_loose == 0))) {
339 /* Not a standard connection start and not supporting
340 * ongoing connections. */
341 return RTE_CT_DROP_PACKET;
345 printf(" new connection with state %s\n",
346 rte_ct_tcp_names[new_state]);
348 /* clear all tcp connection data, then set up individual fields */
349 rte_ct_clear_cnxn_data(ct, cd, pkt);
350 cd->ct_protocol.tcp_ct_data.state = new_state;
352 sender->end = sender->maxend = rte_ct_seq_plus_length(pkt, ip_hdr_size);
353 win = rte_bswap16(tcpheader->rx_win);
354 sender->maxwin = RTE_MAX(win, (uint32_t)1);
356 if (likely(new_state == RTE_CT_TCP_SYN_SENT)) {
357 /* check for window scaling and selective ACK */
358 rte_ct_check_for_scaling_and_sack_perm(pkt, sender,
361 cd->ct_protocol.synproxy_data.synproxied = use_synproxy;
365 * new connection from client using synproxy. The proxy
366 * must send back a SYN-ACK
371 printf("synproxy sending SYN-ACK to client\n");
373 return RTE_CT_SEND_CLIENT_SYNACK;
377 * An ongoing connection. Make a very liberal connection since
378 * all the original set up data is lost. Assume SACK and
379 * liberal window checking to handle unknown window scaling.
382 sender->maxend += sender->maxwin;
383 sender->flags = receiver->flags =
384 (RTE_CT_TCP_FLAG_SACK_PERM | RTE_CT_TCP_FLAG_BE_LIBERAL);
388 printf("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i",
389 sender->end, sender->maxend, sender->maxwin,
391 printf(" receiver end=%u maxend=%u maxwin=%u scale=%i\n",
392 receiver->end, receiver->maxend,
397 return RTE_CT_OPEN_CONNECTION;
/*
 * Walk the TCP options of a packet looking for a SACK option and return an
 * acknowledgement number for window checking.
 *
 * pkt:         mbuf holding the packet.
 * ip_hdr_size: IP header length in bytes, used to locate the TCP header.
 *
 * The result starts as the byte-swapped recv_ack of the header and is
 * returned as is when the end-of-options marker is reached. For a SACK
 * option of valid size, each 8-byte entry is examined and compared with the
 * running value using rte_after(); presumably the later value is kept -
 * TODO confirm against the loop body.
 */
401 rte_ct_tcp_sack(struct rte_mbuf *pkt, uint8_t ip_hdr_size)
403 struct tcp_hdr *tcpheader =
404 (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
407 uint16_t dataoff_in_bytes = (tcpheader->data_off & 0xf0) >> 2;
408 uint16_t length = dataoff_in_bytes - sizeof(struct tcp_hdr);
409 uint32_t sack = rte_bswap32(tcpheader->recv_ack);
411 if (unlikely(!length))
414 uint8_t *options_ptr = RTE_MBUF_METADATA_UINT8_PTR(pkt,
415 (IP_START + ip_hdr_size + sizeof(struct tcp_hdr)));
418 uint8_t opcode = *options_ptr;
419 uint8_t opsize = options_ptr[1];
424 case RTE_CT_TCPOPT_TIMESTAMP:
425 /* common "solo" option, check first */
428 case RTE_CT_TCPOPT_EOL:
429 return sack; /* end of options */
431 case RTE_CT_TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
436 case RTE_CT_TCPOPT_SACK:
438 * SACK (selective ACK) contains a block of
439 * 1 to 4 entries of 8 bytes each.
440 * Each entry is a pair of 32 bit numbers.
441 * This block follows the usual 2
442 * bytes for opcode and opsize. Thus,
443 * the entire SACK option must be 10, 18,
444 * 26 or 34 bytes long.
446 if ((opsize >= (RTE_CT_TCPOLEN_PER_SACK_ENTRY + 2)) &&
448 RTE_CT_TCPOLEN_PER_SACK_ENTRY) == 0) {
449 /* skip over opcode and size, and point to
450 * 2nd 32 bits in entry */
/* NOTE(review): each entry is read through a uint32_t pointer into the byte-oriented option area; alignment is not checked here */
452 for (i = 0; i < (opsize - 2); i +=
453 RTE_CT_TCPOLEN_PER_SACK_ENTRY) {
455 (uint32_t *) &options_ptr[i];
456 uint32_t ack = rte_bswap32(*sack_ptr);
458 if (rte_after(ack, sack))
467 if ((opsize < 2) || (opsize > length)) {
468 printf("rte_ct_tcp_sack: something wrong, opsize %i,",
470 printf(" length %i\n", length);
473 options_ptr += opsize;
480 * if this is a retransmission of last packet, increment retransmission count,
481 * otherwise record this as last packet.
/*
 * Compare a packet with the last one recorded for the connection.
 *
 * state: TCP tracking data holding the last_dir, last_seq, last_ack,
 *        last_end and last_win fields.
 * The function also uses dir, seq, ack, end and win, the values of the
 * current packet.
 *
 * The packet counts as a repeat only when all five values match the stored
 * ones. The five assignments below store the current values as the new
 * "last packet".
 */
484 rte_ct_check_for_retransmissions(
485 struct rte_ct_tcp *state,
492 if (state->last_dir == dir
493 && state->last_seq == seq
494 && state->last_ack == ack
495 && state->last_end == end && state->last_win == win)
498 state->last_dir = dir;
499 state->last_seq = seq;
500 state->last_ack = ack;
501 state->last_end = end;
502 state->last_win = win;
508 * Verify that the sequence number in the given packet is within the valid
509 * range at this point in the connection
/*
 * Window check for one packet of a tracked TCP connection, updating the
 * per-direction tracking data as a side effect.
 *
 * cd:    connection entry (its state and synproxy data are consulted).
 * ct:    tracker instance; misc_options.tcp_be_liberal is consulted.
 * state: TCP tracking data; seen[dir] is the sender record and seen[!dir]
 *        the receiver record.
 * dir:   direction of this packet.
 * pkt:   mbuf holding the packet.
 * The function also uses index (the flag class of the packet) and
 * ip_hdr_size.
 *
 * Callers treat a zero result as "out of window" and drop the packet.
 *
 * Outline of the visible steps:
 *  - For synproxied connections an adjustment hook is called before the
 *    check and another after it.
 *  - seq, ack, win and end are taken from the header; sack replaces ack for
 *    the comparisons when the receiver has RTE_CT_TCP_FLAG_SACK_PERM set.
 *  - A sender whose maxwin is still 0 is initialised from this packet.
 *  - The packet is accepted when all four conditions printed by the debug
 *    output near the end hold: seq is before sender->maxend + 1, end lies in
 *    the receive window, sack is before receiver->end + 1, and sack is after
 *    receiver->end - max(sender->maxwin, RTE_MAX_ACKWIN_CONST) - 1.
 *  - On acceptance sender and receiver limits are updated, and a packet of
 *    class RTE_CT_TCP_ACK_FLAG is passed to
 *    rte_ct_check_for_retransmissions().
 *  - res is also assigned from the BE_LIBERAL flag or the tcp_be_liberal
 *    option; presumably this is the path taken when the four conditions
 *    fail - TODO confirm.
 */
512 rte_ct_tcp_in_window(
513 struct rte_ct_cnxn_data *cd,
514 struct rte_ct_cnxn_tracker *ct,
515 struct rte_ct_tcp *state,
516 enum rte_ct_pkt_direction dir,
518 struct rte_mbuf *pkt,
521 struct rte_ct_tcp_state *sender = &state->seen[dir];
522 struct rte_ct_tcp_state *receiver = &state->seen[!dir];
523 uint32_t seq, ack, sack, end, win, swin;
524 uint8_t in_recv_win, tcp_flags;
525 enum rte_ct_packet_action res;
527 void *iphdr = RTE_MBUF_METADATA_UINT32_PTR(pkt, IP_START);
528 struct tcp_hdr *tcpheader =
529 (struct tcp_hdr *)RTE_MBUF_METADATA_UINT32_PTR(pkt,
530 (IP_START + ip_hdr_size));
532 if (cd->ct_protocol.synproxy_data.synproxied)
533 rte_sp_adjust_client_ack_before_window_check(cd, iphdr,
537 seq = rte_bswap32(tcpheader->sent_seq);
538 ack = sack = rte_bswap32(tcpheader->recv_ack);
539 win = rte_bswap16(tcpheader->rx_win);
540 end = rte_ct_seq_plus_length(pkt, ip_hdr_size);
541 tcp_flags = tcpheader->tcp_flags;
543 if (receiver->flags & RTE_CT_TCP_FLAG_SACK_PERM)
544 sack = rte_ct_tcp_sack(pkt, ip_hdr_size);
546 if (unlikely(sender->maxwin == 0)) {
547 /* First packet for sender, initialize data. */
548 if (tcp_flags & RTE_CT_TCPHDR_SYN) {
550 * SYN-ACK in reply to a SYN
551 * or SYN from reply direction in simultaneous open.
553 sender->end = sender->maxend = end;
554 sender->maxwin = RTE_MAX(win, (uint32_t)1);
556 rte_ct_check_for_scaling_and_sack_perm(pkt, sender,
560 * RFC 1323: Both sides must send Window Scale option
561 * to enable scaling in either direction.
564 flags & receiver->flags &
565 RTE_CT_TCP_FLAG_WINDOW_SCALE) == 0)
566 sender->scale = receiver->scale = 0;
568 if (!(tcp_flags & RTE_CT_TCPHDR_ACK))
569 /* Simultaneous open */
573 * In the middle of a connection with no setup data.
574 * Use available data from the packet.
577 swin = win << sender->scale;
578 sender->maxwin = (swin == 0 ? 1 : swin);
579 sender->maxend = end + sender->maxwin;
581 * We haven't seen traffic in the other direction yet
582 * but we have to tweak window tracking to pass III
583 * and IV until that happens.
585 if (receiver->maxwin == 0)
586 receiver->end = receiver->maxend = sack;
589 /* if sender unititialized */
590 else if (((cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_SENT &&
591 dir == RTE_CT_DIR_ORIGINAL) ||
592 (cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_RECV &&
593 dir == RTE_CT_DIR_REPLY)) && rte_after(end, sender->end)) {
595 * RFC 793: "if a TCP is reinitialized ... then it need
596 * not wait at all; it must only be sure to use sequence
597 * numbers larger than those recently used."
599 sender->end = sender->maxend = end;
600 sender->maxwin = RTE_MAX(win, (uint32_t)1);
602 rte_ct_check_for_scaling_and_sack_perm(pkt, sender,
605 /* If no ACK, just pretend there was. */
606 if (!(tcp_flags & RTE_CT_TCPHDR_ACK) ||
607 (((tcp_flags & RTE_CT_TCPHDR_RST_ACK) ==
608 RTE_CT_TCPHDR_RST_ACK) && (ack == 0))) {
610 ack = sack = receiver->end;
613 if ((tcp_flags & RTE_CT_TCPHDR_RST) && seq == 0 &&
614 cd->ct_protocol.tcp_ct_data.state == RTE_CT_TCP_SYN_SENT)
615 /* RST sent answering SYN. */
616 seq = end = sender->end;
618 /* Is the ending sequence in the receive window (if available)? */
619 in_recv_win = !receiver->maxwin ||
620 rte_after(end, sender->end - receiver->maxwin - 1);
622 if (rte_before(seq, sender->maxend + 1) && in_recv_win &&
623 rte_before(sack, receiver->end + 1) &&
625 receiver->end - RTE_MAX(sender->maxwin,
626 (uint32_t)RTE_MAX_ACKWIN_CONST) - 1)) {
628 * Apply window scaling (RFC 1323). Only valid if both
629 * directions sent this option in a SYN packet,
630 * so ignore until not a SYN packet. Scale will be
631 * set to zero if connection set up but no valid scale is there.
633 if (!(tcp_flags & RTE_CT_TCPHDR_SYN))
634 win <<= sender->scale;
636 /* Update sender data. */
637 swin = win + (sack - ack);
638 sender->maxwin = RTE_MAX(sender->maxwin, swin);
640 if (rte_after(end, sender->end)) {
642 sender->flags |= RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
645 if (tcp_flags & RTE_CT_TCPHDR_ACK) {
646 if (!(sender->flags & RTE_CT_TCP_FLAG_MAXACK_SET)) {
647 sender->maxack = ack;
648 sender->flags |= RTE_CT_TCP_FLAG_MAXACK_SET;
649 } else if (rte_after(ack, sender->maxack))
650 sender->maxack = ack;
653 /* Update receiver data. */
654 if (receiver->maxwin != 0 && rte_after(end, sender->maxend))
655 receiver->maxwin += end - sender->maxend;
657 if (rte_after(sack + win, receiver->maxend - 1))
658 receiver->maxend = sack + RTE_MAX(win, (uint32_t)1);
660 if (ack == receiver->end)
661 receiver->flags &= ~RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
663 /* If this packet has an ACK, it may be a retransmission. */
664 if (index == RTE_CT_TCP_ACK_FLAG)
665 rte_ct_check_for_retransmissions(state, dir, seq, ack,
669 res = (sender->flags & RTE_CT_TCP_FLAG_BE_LIBERAL ||
670 ct->misc_options.tcp_be_liberal);
676 printf("tcp_in_window FAILED for %p\n", cd);
677 printf("rte_before(%u, %u + 1) is %d\n",
678 seq, sender->maxend + 1,
679 rte_before(seq, sender->maxend + 1));
680 printf("!%u || rte_after(%u, %u - %u - 1) is %d\n",
681 receiver->maxwin, end, sender->end,
682 receiver->maxwin, in_recv_win);
683 printf("rte_before(%u, %u + 1) is %d\n", sack,
684 receiver->end, rte_before(sack,
687 ("rte_after(%u,(%u - RTE_MAX(%u, %u) - 1))) is%d\n",
688 sack, receiver->end, sender->maxwin,
689 RTE_MAX_ACKWIN_CONST, rte_after(sack,
690 receiver->end - RTE_MAX(sender->maxwin,
691 (uint32_t)RTE_MAX_ACKWIN_CONST)
696 if (cd->ct_protocol.synproxy_data.synproxied)
697 rte_sp_adjust_server_seq_after_window_check(cd, iphdr,
702 /*
 * For the given two FSM states, return the one with the smaller timeout value.
 *
 * ct: tracker instance; the timeouts are read from
 *     ct->ct_timeout.tcptimeout.tcp_timeouts[], indexed by state.
 * The function compares the table entries for state1 and state2.
 */
703 static inline uint8_t
704 rte_ct_choose_min_timeout_state(
705 struct rte_ct_cnxn_tracker *ct,
709 if (ct->ct_timeout.tcptimeout.tcp_timeouts[state1] <
710 ct->ct_timeout.tcptimeout.tcp_timeouts[state2])
717 /*
 * Run one packet of an already tracked TCP connection through the state
 * machine and return the verdict for it.
 *
 * ct:              tracker instance (options, counters, timeouts).
 * cd:              connection entry; its TCP state, synproxy data and
 *                  connstatus are read and updated.
 * pkt:             mbuf holding the packet.
 * key_was_flipped: compared with cd->key_is_client_order to derive the
 *                  direction of the packet.
 * The function also uses ip_hdr_size to locate the TCP header.
 *
 * Outline of the visible steps:
 *  - A client packet on a half-established synproxy connection is buffered
 *    and RTE_CT_HIJACK is returned.
 *  - The next state is looked up in rte_ct_tcp_state_table; RTE_CT_TCP_MAX
 *    counts as an invalid transition and the packet is dropped.
 *  - Per-state special cases follow: synproxy hand-offs (returning
 *    RTE_CT_SEND_SERVER_SYN or RTE_CT_SEND_SERVER_ACK), re-opening of a
 *    closed connection (RTE_CT_REOPEN_CNXN_AND_FORWARD_PACKET), ignored
 *    packets, RFC 5961 challenge ACK detection and RST validation.
 *  - Unless check_window was cleared, the packet must pass
 *    rte_ct_tcp_in_window() or it is dropped.
 *  - Finally last_index/last_dir, connstatus, the stored state and the
 *    connection timer are updated and return_action is returned.
 *    A reset on a connection that has not reached RTE_SEEN_REPLY_CONN
 *    returns RTE_CT_DESTROY_CNXN_AND_FORWARD_PACKET instead.
 */
718 enum rte_ct_packet_action
719 rte_ct_verify_tcp_packet(
720 struct rte_ct_cnxn_tracker *ct,
721 struct rte_ct_cnxn_data *cd,
722 struct rte_mbuf *pkt,
723 uint8_t key_was_flipped,
726 struct tcp_hdr *tcpheader = (struct tcp_hdr *)
727 RTE_MBUF_METADATA_UINT32_PTR(pkt, (IP_START + ip_hdr_size));
729 enum rte_ct_tcp_states new_state, old_state;
730 enum rte_ct_pkt_direction dir;
733 /* state whose timeout value will be used. In odd cases,
734 * not always current state */
735 uint8_t timeout_state;
737 dir = (cd->key_is_client_order == !key_was_flipped);
739 if (cd->ct_protocol.synproxy_data.synproxied &&
740 cd->ct_protocol.synproxy_data.half_established &&
741 !cd->ct_protocol.synproxy_data.cnxn_established &&
742 dir == RTE_CT_DIR_ORIGINAL) {
744 * Packet from client, but only client side of this connection
745 * has been set up. Buffer packet until server side of
746 * connection complete.
748 rte_ct_buffer_packet(ct, cd, pkt);
749 return RTE_CT_HIJACK;
752 uint32_t recv_ack = rte_bswap32(tcpheader->recv_ack);
753 uint32_t sent_seq = rte_bswap32(tcpheader->sent_seq);
755 int check_window = 1;
756 enum rte_ct_packet_action return_action = RTE_CT_FORWARD_PACKET;
758 /* rte_ct_tcpdisplay_hdr(tcpheader); */
760 old_state = cd->ct_protocol.tcp_ct_data.state;
761 index = rte_ct_get_index(tcpheader->tcp_flags);
/* next state for this direction, flag class and current state */
762 new_state = rte_ct_tcp_state_table[dir][index][old_state];
764 if (new_state == RTE_CT_TCP_MAX) {
766 printf("!!!!invalid state transition from %s ",
767 rte_ct_tcp_names[old_state]);
768 printf("with flags 0x%02x\n",
769 tcpheader->tcp_flags);
772 ct->counters->pkts_drop_invalid_state++;
773 return RTE_CT_DROP_PACKET;
776 if (STATE_TRACKING && new_state != old_state)
777 printf(" new state %s\n", rte_ct_tcp_names[new_state]);
781 case RTE_CT_TCP_ESTABLISHED:
783 if (cd->ct_protocol.synproxy_data.synproxied &&
784 !cd->ct_protocol.synproxy_data.half_established &&
785 (old_state == RTE_CT_TCP_SYN_RECV)) {
787 * During synproxy setup, ESTABLISHED state entered by
788 * ACK arriving from client. The proxy must now send a
789 * spoofed SYN to the server.
790 * Reset the state to RTE_CT_TCP_SYN_SENT.
793 if (STATE_TRACKING) {
794 printf(" synproxy first half-cnxn complete,");
795 printf(" new state %s\n",
796 rte_ct_tcp_names[RTE_CT_TCP_SYN_SENT]);
798 cd->ct_protocol.synproxy_data.half_established = true;
800 rte_sp_cvt_to_spoofed_server_syn(cd, pkt);
801 rte_ct_clear_cnxn_data(ct, cd, pkt);
802 cd->ct_protocol.tcp_ct_data.state = RTE_CT_TCP_SYN_SENT;
804 struct rte_ct_tcp_state *sender =
805 &cd->ct_protocol.tcp_ct_data.
806 seen[RTE_CT_DIR_ORIGINAL];
807 uint16_t win = rte_bswap16(tcpheader->rx_win);
809 sender->end = sender->maxend =
810 rte_ct_seq_plus_length(pkt, ip_hdr_size);
811 sender->maxwin = RTE_MAX(win, (uint32_t)1);
812 rte_ct_check_for_scaling_and_sack_perm(pkt, sender,
814 /* TODO seq number code */
815 rte_ct_set_cnxn_timer_for_tcp(ct, cd,
816 RTE_CT_TCP_SYN_SENT);
817 return RTE_CT_SEND_SERVER_SYN;
821 case RTE_CT_TCP_SYN_RECV:
823 if (cd->ct_protocol.synproxy_data.synproxied &&
824 cd->ct_protocol.synproxy_data.half_established &&
825 !cd->ct_protocol.synproxy_data.cnxn_established) {
827 * The reply SYN/ACK has been received from the server.
828 * The connection can now be considered established,
829 * even though an ACK stills needs to be sent to
833 if (!rte_ct_tcp_in_window(cd, ct,
834 &cd->ct_protocol.tcp_ct_data,
835 dir, index, pkt, ip_hdr_size)) {
836 ct->counters->pkts_drop_outof_window++;
837 return RTE_CT_DROP_PACKET;
840 if (STATE_TRACKING) {
841 printf("synproxy full cnxn complete,");
842 printf(" new state %s\n", rte_ct_tcp_names
843 [RTE_CT_TCP_ESTABLISHED]);
847 * Convert the packet to an ack to return to the server.
848 * This routine also saves the real sequence number
852 rte_sp_cvt_to_spoofed_server_ack(cd, pkt);
854 index = rte_ct_get_index(tcpheader->tcp_flags);
856 if (!rte_ct_tcp_in_window(cd, ct,
857 &cd->ct_protocol.tcp_ct_data,
858 !dir, index, pkt, ip_hdr_size)) {
859 ct->counters->pkts_drop_outof_window++;
860 return RTE_CT_DROP_PACKET;
863 /* good packets, OK to update state */
865 cd->ct_protocol.tcp_ct_data.state =
866 RTE_CT_TCP_ESTABLISHED;
867 ct->counters->sessions_established++;
868 cd->ct_protocol.synproxy_data.cnxn_established = true;
869 cd->ct_protocol.tcp_ct_data.last_index = index;
870 cd->ct_protocol.tcp_ct_data.last_dir = !dir;
872 rte_ct_set_cnxn_timer_for_tcp(ct, cd,
873 RTE_CT_TCP_ESTABLISHED);
874 rte_ct_release_buffered_packets(ct, cd);
876 return RTE_CT_SEND_SERVER_ACK;
879 case RTE_CT_TCP_SYN_SENT:
882 * A connection that is actively closed goes to TIME-WAIT state.
883 * It can be re-opened (before it times out) by a SYN packet.
886 if (old_state < RTE_CT_TCP_TIME_WAIT)
889 * Due to previous check and state machine transitions,
890 * old state must be RTE_CT_TCP_TIME_WAIT or RTE_CT_TCP_CLOSE .
891 * Need to re-open connection.
894 return RTE_CT_REOPEN_CNXN_AND_FORWARD_PACKET;
896 case RTE_CT_TCP_IGNORE:
899 * Ignored packets usually mean the connection data is
900 * out of sync with client/server. Ignore, but forward
901 * these packets since they may be valid for the connection.
902 * If the ignored packet is invalid, the receiver will send
903 * an RST which should get the connection entry back in sync.
907 * However, if connection is running synproxy and the full
908 * connection is not yet established, there is no where
909 * for test packets to go so drop these packets.
912 if (cd->ct_protocol.synproxy_data.synproxied &&
913 !cd->ct_protocol.synproxy_data.cnxn_established)
914 return RTE_CT_DROP_PACKET;
916 if (index == RTE_CT_TCP_SAK_FLAG &&
917 cd->ct_protocol.tcp_ct_data.last_index ==
919 && cd->ct_protocol.tcp_ct_data.last_dir != dir
920 && recv_ack == cd->ct_protocol.tcp_ct_data.last_end) {
922 * SYN/ACK in reply direction acknowledging a SYN
923 * earlier ignored as invalid.Client and server in sync,
924 * but connection tracker is not. Use previous values
925 * to get back in sync.
928 struct rte_ct_tcp_state *last_seen =
929 &cd->ct_protocol.tcp_ct_data.seen[cd->ct_protocol.
933 /* reset new and old states to what they should
935 old_state = RTE_CT_TCP_SYN_SENT;
936 new_state = RTE_CT_TCP_SYN_RECV;
938 last_seen->end = cd->ct_protocol.tcp_ct_data.last_end;
940 cd->ct_protocol.tcp_ct_data.last_end;
942 RTE_MAX(cd->ct_protocol.tcp_ct_data.last_win,
945 cd->ct_protocol.tcp_ct_data.last_wscale;
946 cd->ct_protocol.tcp_ct_data.last_flags &=
947 ~RTE_CT_EXP_CHALLENGE_ACK;
949 cd->ct_protocol.tcp_ct_data.last_flags;
950 memset(&cd->ct_protocol.tcp_ct_data.seen[dir], 0,
951 sizeof(struct rte_ct_tcp_state));
955 cd->ct_protocol.tcp_ct_data.last_index = index;
956 cd->ct_protocol.tcp_ct_data.last_dir = dir;
957 cd->ct_protocol.tcp_ct_data.last_seq = sent_seq;
958 cd->ct_protocol.tcp_ct_data.last_end =
959 rte_ct_seq_plus_length(pkt, ip_hdr_size);
960 cd->ct_protocol.tcp_ct_data.last_win =
961 rte_bswap16(tcpheader->rx_win);
964 * An orinal SYN. Client and the server may be in sync, but
965 * the tracker is not . Annotate
966 * the TCP options and let the packet go through. If it is a
967 * valid SYN packet, the server will reply with a SYN/ACK, and
968 * then we'll get in sync. Otherwise, the server potentially
969 * responds with a challenge ACK if implementing RFC5961.
971 if (index == RTE_CT_TCP_SYN_FLAG &&
972 dir == RTE_CT_DIR_ORIGINAL) {
973 struct rte_ct_tcp_state seen;
975 /* call following to set "flag" and "scale" fields */
976 rte_ct_check_for_scaling_and_sack_perm(pkt, &seen,
979 /* only possible flags set for scaling and sack */
980 cd->ct_protocol.tcp_ct_data.last_flags = seen.flags;
981 cd->ct_protocol.tcp_ct_data.last_wscale =
982 (seen.flags & RTE_CT_TCP_FLAG_WINDOW_SCALE) == 0 ?
986 * Mark the potential for RFC5961 challenge ACK,
987 * this pose a special problem for LAST_ACK state
988 * as ACK is intrepretated as ACKing last FIN.
990 if (old_state == RTE_CT_TCP_LAST_ACK)
991 cd->ct_protocol.tcp_ct_data.last_flags |=
992 RTE_CT_EXP_CHALLENGE_ACK;
994 return RTE_CT_FORWARD_PACKET;
996 case RTE_CT_TCP_TIME_WAIT:
998 * RFC5961 compliance cause stack to send "challenge-ACK" in
999 * response to unneeded SYNs. Do not treat this as acking
1002 if (old_state == RTE_CT_TCP_LAST_ACK &&
1003 index == RTE_CT_TCP_ACK_FLAG &&
1004 cd->ct_protocol.tcp_ct_data.last_dir != dir &&
1005 cd->ct_protocol.tcp_ct_data.last_index ==
1007 && (cd->ct_protocol.tcp_ct_data.
1008 last_flags & RTE_CT_EXP_CHALLENGE_ACK)) {
1009 /* Detected RFC5961 challenge ACK */
1010 cd->ct_protocol.tcp_ct_data.last_flags &=
1011 ~RTE_CT_EXP_CHALLENGE_ACK;
1012 return RTE_CT_FORWARD_PACKET; /* Don't change state */
1016 case RTE_CT_TCP_CLOSE:
1018 if (index == RTE_CT_TCP_RST_FLAG) {
1020 * Can only transition to CLOSE state with an RST,
1022 * CLOSE state with ACK, FIN, or RST. Do special checks.
1025 if ((cd->ct_protocol.tcp_ct_data.seen[!dir].flags &
1026 RTE_CT_TCP_FLAG_MAXACK_SET) &&
1027 rte_before(sent_seq, cd->ct_protocol.
1028 tcp_ct_data.seen[!dir].maxack)) {
1030 ct->counters->pkts_drop_invalid_rst++;
1032 return RTE_CT_DROP_PACKET;
1035 if (((cd->connstatus == RTE_SEEN_REPLY_CONN &&
1036 cd->ct_protocol.tcp_ct_data.last_index ==
1037 RTE_CT_TCP_SYN_FLAG) ||
1038 (cd->connstatus != RTE_ASSURED_CONN &&
1039 cd->ct_protocol.tcp_ct_data.last_index ==
1040 RTE_CT_TCP_ACK_FLAG)) &&
1042 cd->ct_protocol.tcp_ct_data.last_end) {
1043 /* RST sent to invalid SYN or ACK previously
1054 if (likely(check_window)) {
1055 if (unlikely(!rte_ct_tcp_in_window(cd, ct,
1056 &cd->ct_protocol.tcp_ct_data,
1058 pkt, ip_hdr_size))) {
1059 ct->counters->pkts_drop_outof_window++;
1060 return RTE_CT_DROP_PACKET;
1064 if (new_state == RTE_CT_TCP_ESTABLISHED &&
1065 old_state != RTE_CT_TCP_ESTABLISHED)
1066 /* only increment for first state transition to established */
1067 /* synproxy established count handled elsewhere */
1068 ct->counters->sessions_established++;
1069 /* From this point on, all packets are in-window */
1070 cd->ct_protocol.tcp_ct_data.last_index = index;
1071 cd->ct_protocol.tcp_ct_data.last_dir = dir;
1073 if (index == RTE_CT_TCP_SAK_FLAG)
1074 cd->connstatus = RTE_SEEN_REPLY_CONN;
1076 timeout_state = new_state;
1078 if (cd->ct_protocol.tcp_ct_data.retrans >=
1079 ct->misc_options.tcp_max_retrans)
1081 rte_ct_choose_min_timeout_state(ct, timeout_state,
1082 RTE_CT_TCP_RETRANS);
1083 else if (rte_ct_either_direction_has_flags(cd,
1084 RTE_CT_TCP_FLAG_DATA_UNACKNOWLEDGED))
1086 rte_ct_choose_min_timeout_state(ct, timeout_state,
1089 if (cd->connstatus != RTE_SEEN_REPLY_CONN) {
1090 if (tcpheader->tcp_flags & RTE_CT_TCPHDR_RST) {
1092 * if only reply seen is RST, there is not an
1093 * established connection, so just destroy
1097 return RTE_CT_DESTROY_CNXN_AND_FORWARD_PACKET;
1099 /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
1100 pickup with loose=1. Avoid large ESTABLISHED timeout. */
1101 if (new_state == RTE_CT_TCP_ESTABLISHED)
1102 timeout_state = rte_ct_choose_min_timeout_state(ct,
1106 } else if (cd->connstatus != RTE_ASSURED_CONN &&
1107 (old_state == RTE_CT_TCP_SYN_RECV
1108 || old_state == RTE_CT_TCP_ESTABLISHED)
1109 && new_state == RTE_CT_TCP_ESTABLISHED)
1110 cd->connstatus = RTE_ASSURED_CONN;
1112 cd->ct_protocol.tcp_ct_data.state = new_state;
1113 rte_ct_set_cnxn_timer_for_tcp(ct, cd, timeout_state);
1115 return return_action;