X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;ds=sidebyside;f=kernel%2Finclude%2Fnet%2Ftcp.h;h=414d822bc1db778c3606c62a346227228f7319f7;hb=e09b41010ba33a20a87472ee821fa407a5b8da36;hp=6d204f3f9df8cafb82d856db08769a7d24dfd79e;hpb=f93b97fd65072de626c074dbe099a1fff05ce060;p=kvmfornfv.git diff --git a/kernel/include/net/tcp.h b/kernel/include/net/tcp.h index 6d204f3f9..414d822bc 100644 --- a/kernel/include/net/tcp.h +++ b/kernel/include/net/tcp.h @@ -279,13 +279,24 @@ extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; +extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; +extern int sysctl_tcp_pacing_ss_ratio; +extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern int tcp_memory_pressure; +/* optimized version of sk_under_memory_pressure() for TCP sockets */ +static inline bool tcp_under_memory_pressure(const struct sock *sk) +{ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + return !!sk->sk_cgrp->memory_pressure; + + return tcp_memory_pressure; +} /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). @@ -311,6 +322,8 @@ static inline bool tcp_out_of_memory(struct sock *sk) return false; } +void sk_forced_mem_schedule(struct sock *sk, int size); + static inline bool tcp_too_many_orphans(struct sock *sk, int shift) { struct percpu_counter *ocp = sk->sk_prot->orphan_count; @@ -326,18 +339,6 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift) bool tcp_check_oom(struct sock *sk, int shift); -/* syncookies: remember time of last synqueue overflow */ -static inline void tcp_synq_overflow(struct sock *sk) -{ - tcp_sk(sk)->rx_opt.ts_recent_stamp = jiffies; -} - -/* syncookies: no recent synqueue overflow on this listening socket? */ -static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) -{ - unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - return time_after(jiffies, last_overflow + TCP_TIMEOUT_FALLBACK); -} extern struct proto tcp_prot; @@ -365,8 +366,7 @@ void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); -int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len); +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len); void tcp_rcv_space_adjust(struct sock *sk); @@ -449,21 +449,24 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); -void tcp_req_err(struct sock *sk, u32 seq); +void tcp_req_err(struct sock *sk, u32 seq, bool abort); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); -struct sock *tcp_create_openreq_child(struct sock *sk, +struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb); void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); -struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, +struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst); + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); -struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, +struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + bool attach_req); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -471,6 +474,9 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ +struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); @@ -483,20 +489,42 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if * the counter advances immediately after a cookie is generated). */ -#define MAX_SYNCOOKIE_AGE 2 +#define MAX_SYNCOOKIE_AGE 2 +#define TCP_SYNCOOKIE_PERIOD (60 * HZ) +#define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD) + +/* syncookies: remember time of last synqueue overflow + * But do not dirty this field too often (once per second is enough) + * It is racy as we do not hold a lock, but race is very minor. + */ +static inline void tcp_synq_overflow(const struct sock *sk) +{ + unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned long now = jiffies; + + if (time_after(now, last_overflow + HZ)) + tcp_sk(sk)->rx_opt.ts_recent_stamp = now; +} + +/* syncookies: no recent synqueue overflow on this listening socket? */ +static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) +{ + unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + + return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID); +} static inline u32 tcp_cookie_time(void) { u64 val = get_jiffies_64(); - do_div(val, 60 * HZ); + do_div(val, TCP_SYNCOOKIE_PERIOD); return val; } u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); -__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, - __u16 *mss); +__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); __u32 cookie_init_timestamp(struct request_sock *req); bool cookie_timestamp_decode(struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, @@ -509,8 +537,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); -__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, - __u16 *mss); +__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ @@ -527,7 +554,7 @@ int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); -int tcp_write_wakeup(struct sock *); +int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); int tcp_send_synack(struct sock *); @@ -540,7 +567,9 @@ bool tcp_schedule_loss_probe(struct sock *sk); /* tcp_input.c */ void tcp_resume_early_retransmit(struct sock *sk); void tcp_rearm_rto(struct sock *sk); +void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); +void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); @@ -646,6 +675,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) return dst_metric_locked(dst, RTAX_CC_ALGO); } +/* Minimum RTT in usec. ~0 means not available. */ +static inline u32 tcp_min_rtt(const struct tcp_sock *tp) +{ + return tp->rtt_min[0].rtt; +} + /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. @@ -692,6 +727,8 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) #define TCPHDR_ECE 0x40 #define TCPHDR_CWR 0x80 +#define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) + /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. @@ -705,11 +742,14 @@ struct tcp_skb_cb { /* Note : tcp_tw_isn is used in input path only * (isn chosen by tcp_timewait_state_process()) * - * tcp_gso_segs is used in write queue only, - * cf tcp_skb_pcount() + * tcp_gso_segs/size are used in write queue only, + * cf tcp_skb_pcount()/tcp_skb_mss() */ __u32 tcp_tw_isn; - __u32 tcp_gso_segs; + struct { + u16 tcp_gso_segs; + u16 tcp_gso_size; + }; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ @@ -765,10 +805,10 @@ static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) TCP_SKB_CB(skb)->tcp_gso_segs += segs; } -/* This is valid iff tcp_skb_pcount() > 1. */ +/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */ static inline int tcp_skb_mss(const struct sk_buff *skb) { - return skb_shinfo(skb)->gso_size; + return TCP_SKB_CB(skb)->tcp_gso_size; } /* Events passed to congestion control interface */ @@ -858,7 +898,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else @@ -961,6 +1001,11 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) #define TCP_INFINITE_SSTHRESH 0x7fffffff +static inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd < tp->snd_ssthresh; +} + static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; @@ -1037,20 +1082,37 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); /* If in slow start, ensure cwnd grows to twice what was ACKed. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) return tp->snd_cwnd < 2 * tp->max_packets_out; return tp->is_cwnd_limited; } -static inline void tcp_check_probe_timer(struct sock *sk) +/* Something is really bad, we could not queue an additional packet, + * because qdisc is full or receiver sent a 0 window. + * We do not want to add fuel to the fire, or abort too early, + * so make sure the timer we arm now is at least 200ms in the future, + * regardless of current icsk_rto value (as it could be ~2ms) + */ +static inline unsigned long tcp_probe0_base(const struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); + return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN); +} + +/* Variant of inet_csk_rto_backoff() used for zero window probes */ +static inline unsigned long tcp_probe0_when(const struct sock *sk, + unsigned long max_when) +{ + u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff; + + return (unsigned long)min_t(u64, when, max_when); +} - if (!tp->packets_out && !icsk->icsk_pending) +static inline void tcp_check_probe_timer(struct sock *sk) +{ + if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - icsk->icsk_rto, TCP_RTO_MAX); + tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) @@ -1115,6 +1177,19 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) } u32 tcp_default_init_rwnd(u32 mss); +void tcp_cwnd_restart(struct sock *sk, s32 delta); + +static inline void tcp_slow_start_after_idle_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + s32 delta; + + if (!sysctl_tcp_slow_start_after_idle || tp->packets_out) + return; + delta = tcp_time_stamp - tp->lsndtime; + if (delta > inet_csk(sk)->icsk_rto) + tcp_cwnd_restart(sk, delta); +} /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, @@ -1141,7 +1216,8 @@ static inline int tcp_full_space(const struct sock *sk) } extern void tcp_openreq_init_rwin(struct request_sock *req, - struct sock *sk, struct dst_entry *dst); + const struct sock *sk_listener, + const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); @@ -1305,16 +1381,16 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, const u8 *newkey, u8 newkeylen, gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family); -struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG -struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family); #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else -static inline struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, +static inline struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, const union tcp_md5_addr *addr, int family) { @@ -1355,10 +1431,10 @@ void tcp_free_fastopen_req(struct tcp_sock *tp); extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; int tcp_fastopen_reset_cipher(void *key, unsigned int len); -bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst); +struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + struct dst_entry *dst); void tcp_fastopen_init_key_once(bool publish); #define TCP_FASTOPEN_KEY_LENGTH 16 @@ -1553,7 +1629,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, - TCP_SEQ_STATE_OPENREQ, TCP_SEQ_STATE_ESTABLISHED, }; @@ -1572,7 +1647,6 @@ struct tcp_iter_state { enum tcp_seq_states state; struct sock *syn_wait_sk; int bucket, offset, sbucket, num; - kuid_t uid; loff_t last_pos; }; @@ -1609,7 +1683,7 @@ int tcp4_proc_init(void); void tcp4_proc_exit(void); #endif -int tcp_rtx_synack(struct sock *sk, struct request_sock *req); +int tcp_rtx_synack(const struct sock *sk, struct request_sock *req); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); @@ -1617,7 +1691,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, /* TCP af-specific functions */ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk, + struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, @@ -1632,40 +1706,42 @@ struct tcp_sock_af_ops { struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *(*req_md5_lookup)(struct sock *sk, + struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); #endif - void (*init_req)(struct request_sock *req, struct sock *sk, + void (*init_req)(struct request_sock *req, + const struct sock *sk_listener, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES - __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb, + __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif - struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl, + struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, const struct request_sock *req, bool *strict); __u32 (*init_seq)(const struct sk_buff *skb); - int (*send_synack)(struct sock *sk, struct dst_entry *dst, + int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, - u16 queue_mapping, struct tcp_fastopen_cookie *foc); - void (*queue_hash_add)(struct sock *sk, struct request_sock *req, - const unsigned long timeout); + struct tcp_fastopen_cookie *foc, + bool attach_req); }; #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, - struct sock *sk, struct sk_buff *skb, + const struct sock *sk, struct sk_buff *skb, __u16 *mss) { - return ops->cookie_init_seq(sk, skb, mss); + tcp_synq_overflow(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + return ops->cookie_init_seq(skb, mss); } #else static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, - struct sock *sk, struct sk_buff *skb, + const struct sock *sk, struct sk_buff *skb, __u16 *mss) { return 0; @@ -1677,6 +1753,19 @@ int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); +/* tcp_recovery.c */ + +/* Flags to enable various loss recovery features. See below */ +extern int sysctl_tcp_recovery; + +/* Use TCP RACK to detect (some) tail and retransmit losses */ +#define TCP_RACK_LOST_RETRANS 0x1 + +extern int tcp_rack_mark_lost(struct sock *sk); + +extern void tcp_rack_advance(struct tcp_sock *tp, + const struct skb_mstamp *xmit_time, u8 sacked); + /* * Save and compile IPv4 options, return a pointer to it */