| 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ | 
|---|
| 2 | #ifndef _TCP_ECN_H | 
|---|
| 3 | #define _TCP_ECN_H | 
|---|
| 4 |  | 
|---|
| 5 | #include <linux/tcp.h> | 
|---|
| 6 | #include <linux/skbuff.h> | 
|---|
| 7 | #include <linux/bitfield.h> | 
|---|
| 8 |  | 
|---|
| 9 | #include <net/inet_connection_sock.h> | 
|---|
| 10 | #include <net/sock.h> | 
|---|
| 11 | #include <net/tcp.h> | 
|---|
| 12 | #include <net/inet_ecn.h> | 
|---|
| 13 |  | 
|---|
/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
 * attempted to be negotiated and requested for incoming connection
 * and outgoing connection, respectively.
 *
 * Enumerator names read TCP_ECN_IN_<incoming>_OUT_<outgoing>. The numeric
 * values are spelled out explicitly because tcp_ecn_send_syn() compares
 * them against the value read from the tcp_ecn sysctl.
 */
enum tcp_ecn_mode {
	TCP_ECN_IN_NOECN_OUT_NOECN	= 0,
	TCP_ECN_IN_ECN_OUT_ECN		= 1,
	TCP_ECN_IN_ECN_OUT_NOECN	= 2,
	TCP_ECN_IN_ACCECN_OUT_ACCECN	= 3,
	TCP_ECN_IN_ACCECN_OUT_ECN	= 4,
	TCP_ECN_IN_ACCECN_OUT_NOECN	= 5,
};
|---|
| 26 |  | 
|---|
/* AccECN option sending when AccECN has been successfully negotiated.
 * Values are listed explicitly so the numeric encoding is visible here.
 */
enum tcp_accecn_option {
	TCP_ACCECN_OPTION_DISABLED	= 0,
	TCP_ACCECN_OPTION_MINIMUM	= 1,
	TCP_ACCECN_OPTION_FULL		= 2,
};
|---|
| 33 |  | 
|---|
| 34 | static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) | 
|---|
| 35 | { | 
|---|
| 36 | /* Do not set CWR if in AccECN mode! */ | 
|---|
| 37 | if (tcp_ecn_mode_rfc3168(tp)) | 
|---|
| 38 | tp->ecn_flags |= TCP_ECN_QUEUE_CWR; | 
|---|
| 39 | } | 
|---|
| 40 |  | 
|---|
| 41 | static inline void tcp_ecn_accept_cwr(struct sock *sk, | 
|---|
| 42 | const struct sk_buff *skb) | 
|---|
| 43 | { | 
|---|
| 44 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 45 |  | 
|---|
| 46 | if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { | 
|---|
| 47 | tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | 
|---|
| 48 |  | 
|---|
| 49 | /* If the sender is telling us it has entered CWR, then its | 
|---|
| 50 | * cwnd may be very low (even just 1 packet), so we should ACK | 
|---|
| 51 | * immediately. | 
|---|
| 52 | */ | 
|---|
| 53 | if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) | 
|---|
| 54 | inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; | 
|---|
| 55 | } | 
|---|
| 56 | } | 
|---|
| 57 |  | 
|---|
| 58 | static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) | 
|---|
| 59 | { | 
|---|
| 60 | tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; | 
|---|
| 61 | } | 
|---|
| 62 |  | 
|---|
/* Bit flags kept in tp->accecn_fail_mode.
 * They are tested by the tcp_accecn_{ace,opt}_fail_{send,recv}() helpers
 * and ORed in through tcp_accecn_fail_mode_set().
 */
#define TCP_ACCECN_ACE_FAIL_SEND	BIT(0)
#define TCP_ACCECN_ACE_FAIL_RECV	BIT(1)
#define TCP_ACCECN_OPT_FAIL_SEND	BIT(2)
#define TCP_ACCECN_OPT_FAIL_RECV	BIT(3)
|---|
| 68 |  | 
|---|
| 69 | static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) | 
|---|
| 70 | { | 
|---|
| 71 | return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; | 
|---|
| 72 | } | 
|---|
| 73 |  | 
|---|
| 74 | static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) | 
|---|
| 75 | { | 
|---|
| 76 | return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; | 
|---|
| 77 | } | 
|---|
| 78 |  | 
|---|
| 79 | static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) | 
|---|
| 80 | { | 
|---|
| 81 | return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; | 
|---|
| 82 | } | 
|---|
| 83 |  | 
|---|
| 84 | static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) | 
|---|
| 85 | { | 
|---|
| 86 | return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; | 
|---|
| 87 | } | 
|---|
| 88 |  | 
|---|
| 89 | static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) | 
|---|
| 90 | { | 
|---|
| 91 | tp->accecn_fail_mode |= mode; | 
|---|
| 92 | } | 
|---|
| 93 |  | 
|---|
/* Values stored in tp->saw_accecn_opt; tcp_accecn_option_init() returns
 * one of the *_SEEN values below.
 */
#define TCP_ACCECN_OPT_NOT_SEEN		0x0
#define TCP_ACCECN_OPT_EMPTY_SEEN	0x1
#define TCP_ACCECN_OPT_COUNTER_SEEN	0x2
#define TCP_ACCECN_OPT_FAIL_SEEN	0x3
|---|
| 98 |  | 
|---|
| 99 | static inline u8 tcp_accecn_ace(const struct tcphdr *th) | 
|---|
| 100 | { | 
|---|
| 101 | return (th->ae << 2) | (th->cwr << 1) | th->ece; | 
|---|
| 102 | } | 
|---|
| 103 |  | 
|---|
| 104 | /* Infer the ECT value our SYN arrived with from the echoed ACE field */ | 
|---|
| 105 | static inline int (u8 ace) | 
|---|
| 106 | { | 
|---|
| 107 | /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ | 
|---|
| 108 | static const int ace_to_ecn[8] = { | 
|---|
| 109 | INET_ECN_ECT_0,		/* 0b000 (Undefined) */ | 
|---|
| 110 | INET_ECN_ECT_1,		/* 0b001 (Undefined) */ | 
|---|
| 111 | INET_ECN_NOT_ECT,	/* 0b010 (Not-ECT is received) */ | 
|---|
| 112 | INET_ECN_ECT_1,		/* 0b011 (ECT-1 is received) */ | 
|---|
| 113 | INET_ECN_ECT_0,		/* 0b100 (ECT-0 is received) */ | 
|---|
| 114 | INET_ECN_ECT_1,		/* 0b101 (Reserved) */ | 
|---|
| 115 | INET_ECN_CE,		/* 0b110 (CE is received) */ | 
|---|
| 116 | INET_ECN_ECT_1		/* 0b111 (Undefined) */ | 
|---|
| 117 | }; | 
|---|
| 118 |  | 
|---|
| 119 | return ace_to_ecn[ace & 0x7]; | 
|---|
| 120 | } | 
|---|
| 121 |  | 
|---|
| 122 | /* Check ECN field transition to detect invalid transitions */ | 
|---|
| 123 | static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) | 
|---|
| 124 | { | 
|---|
| 125 | if (rcv == snt) | 
|---|
| 126 | return true; | 
|---|
| 127 |  | 
|---|
| 128 | /* Non-ECT altered to something or something became non-ECT */ | 
|---|
| 129 | if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) | 
|---|
| 130 | return false; | 
|---|
| 131 | /* CE -> ECT(0/1)? */ | 
|---|
| 132 | if (snt == INET_ECN_CE) | 
|---|
| 133 | return false; | 
|---|
| 134 | return true; | 
|---|
| 135 | } | 
|---|
| 136 |  | 
|---|
| 137 | static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, | 
|---|
| 138 | u8 sent_ect) | 
|---|
| 139 | { | 
|---|
| 140 | u8 ect = tcp_accecn_extract_syn_ect(ace); | 
|---|
| 141 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 142 |  | 
|---|
| 143 | if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) | 
|---|
| 144 | return true; | 
|---|
| 145 |  | 
|---|
| 146 | if (!tcp_ect_transition_valid(snt: sent_ect, rcv: ect)) { | 
|---|
| 147 | tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); | 
|---|
| 148 | return false; | 
|---|
| 149 | } | 
|---|
| 150 |  | 
|---|
| 151 | return true; | 
|---|
| 152 | } | 
|---|
| 153 |  | 
|---|
| 154 | static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, | 
|---|
| 155 | u8 saw_opt) | 
|---|
| 156 | { | 
|---|
| 157 | tp->saw_accecn_opt = saw_opt; | 
|---|
| 158 | if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) | 
|---|
| 159 | tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); | 
|---|
| 160 | } | 
|---|
| 161 |  | 
|---|
| 162 | /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ | 
|---|
| 163 | static inline void tcp_accecn_third_ack(struct sock *sk, | 
|---|
| 164 | const struct sk_buff *skb, u8 sent_ect) | 
|---|
| 165 | { | 
|---|
| 166 | u8 ace = tcp_accecn_ace(th: tcp_hdr(skb)); | 
|---|
| 167 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 168 |  | 
|---|
| 169 | switch (ace) { | 
|---|
| 170 | case 0x0: | 
|---|
| 171 | /* Invalid value */ | 
|---|
| 172 | tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); | 
|---|
| 173 | break; | 
|---|
| 174 | case 0x7: | 
|---|
| 175 | case 0x5: | 
|---|
| 176 | case 0x1: | 
|---|
| 177 | /* Unused but legal values */ | 
|---|
| 178 | break; | 
|---|
| 179 | default: | 
|---|
| 180 | /* Validation only applies to first non-data packet */ | 
|---|
| 181 | if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && | 
|---|
| 182 | !TCP_SKB_CB(skb)->sacked && | 
|---|
| 183 | tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { | 
|---|
| 184 | if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && | 
|---|
| 185 | !tp->delivered_ce) | 
|---|
| 186 | tp->delivered_ce++; | 
|---|
| 187 | } | 
|---|
| 188 | break; | 
|---|
| 189 | } | 
|---|
| 190 | } | 
|---|
| 191 |  | 
|---|
| 192 | /* Demand the minimum # to send AccECN optnio */ | 
|---|
| 193 | static inline void tcp_accecn_opt_demand_min(struct sock *sk, | 
|---|
| 194 | u8 opt_demand_min) | 
|---|
| 195 | { | 
|---|
| 196 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 197 | u8 opt_demand; | 
|---|
| 198 |  | 
|---|
| 199 | opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); | 
|---|
| 200 | tp->accecn_opt_demand = opt_demand; | 
|---|
| 201 | } | 
|---|
| 202 |  | 
|---|
| 203 | /* Maps IP ECN field ECT/CE code point to AccECN option field number, given | 
|---|
| 204 | * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). | 
|---|
| 205 | */ | 
|---|
| 206 | static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) | 
|---|
| 207 | { | 
|---|
| 208 | switch (ecnfield & INET_ECN_MASK) { | 
|---|
| 209 | case INET_ECN_NOT_ECT: | 
|---|
| 210 | return 0;	/* AccECN does not send counts of NOT_ECT */ | 
|---|
| 211 | case INET_ECN_ECT_1: | 
|---|
| 212 | return 1; | 
|---|
| 213 | case INET_ECN_CE: | 
|---|
| 214 | return 2; | 
|---|
| 215 | case INET_ECN_ECT_0: | 
|---|
| 216 | return 3; | 
|---|
| 217 | } | 
|---|
| 218 | return 0; | 
|---|
| 219 | } | 
|---|
| 220 |  | 
|---|
| 221 | /* Maps IP ECN field ECT/CE code point to AccECN option field value offset. | 
|---|
| 222 | * Some fields do not start from zero, to detect zeroing by middleboxes. | 
|---|
| 223 | */ | 
|---|
| 224 | static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) | 
|---|
| 225 | { | 
|---|
| 226 | switch (ecnfield & INET_ECN_MASK) { | 
|---|
| 227 | case INET_ECN_NOT_ECT: | 
|---|
| 228 | return 0;	/* AccECN does not send counts of NOT_ECT */ | 
|---|
| 229 | case INET_ECN_ECT_1: | 
|---|
| 230 | return TCP_ACCECN_E1B_INIT_OFFSET; | 
|---|
| 231 | case INET_ECN_CE: | 
|---|
| 232 | return TCP_ACCECN_CEB_INIT_OFFSET; | 
|---|
| 233 | case INET_ECN_ECT_0: | 
|---|
| 234 | return TCP_ACCECN_E0B_INIT_OFFSET; | 
|---|
| 235 | } | 
|---|
| 236 | return 0; | 
|---|
| 237 | } | 
|---|
| 238 |  | 
|---|
| 239 | /* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ | 
|---|
| 240 | static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, | 
|---|
| 241 | bool order) | 
|---|
| 242 | { | 
|---|
| 243 | /* Based on Table 5 of the AccECN spec to map (option, order) to | 
|---|
| 244 | * the corresponding ECN conuters (ECT-1, ECT-0, or CE). | 
|---|
| 245 | */ | 
|---|
| 246 | static const u8 optfield_lookup[2][3] = { | 
|---|
| 247 | /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ | 
|---|
| 248 | { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, | 
|---|
| 249 | /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ | 
|---|
| 250 | { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } | 
|---|
| 251 | }; | 
|---|
| 252 |  | 
|---|
| 253 | return optfield_lookup[order][option % 3]; | 
|---|
| 254 | } | 
|---|
| 255 |  | 
|---|
| 256 | /* Handles AccECN option ECT and CE 24-bit byte counters update into | 
|---|
| 257 | * the u32 value in tcp_sock. As we're processing TCP options, it is | 
|---|
| 258 | * safe to access from - 1. | 
|---|
| 259 | */ | 
|---|
| 260 | static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, | 
|---|
| 261 | u32 init_offset) | 
|---|
| 262 | { | 
|---|
| 263 | u32 truncated = (get_unaligned_be32(p: from - 1) - init_offset) & | 
|---|
| 264 | 0xFFFFFFU; | 
|---|
| 265 | u32 delta = (truncated - *cnt) & 0xFFFFFFU; | 
|---|
| 266 |  | 
|---|
| 267 | /* If delta has the highest bit set (24th bit) indicating | 
|---|
| 268 | * negative, sign extend to correct an estimation using | 
|---|
| 269 | * sign_extend32(delta, 24 - 1) | 
|---|
| 270 | */ | 
|---|
| 271 | delta = sign_extend32(value: delta, index: 23); | 
|---|
| 272 | *cnt += delta; | 
|---|
| 273 | return (s32)delta; | 
|---|
| 274 | } | 
|---|
| 275 |  | 
|---|
| 276 | /* Updates Accurate ECN received counters from the received IP ECN field */ | 
|---|
| 277 | static inline void tcp_ecn_received_counters(struct sock *sk, | 
|---|
| 278 | const struct sk_buff *skb, u32 len) | 
|---|
| 279 | { | 
|---|
| 280 | u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; | 
|---|
| 281 | u8 is_ce = INET_ECN_is_ce(dsfield: ecnfield); | 
|---|
| 282 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 283 | bool ecn_edge; | 
|---|
| 284 |  | 
|---|
| 285 | if (!INET_ECN_is_not_ect(dsfield: ecnfield)) { | 
|---|
| 286 | u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); | 
|---|
| 287 |  | 
|---|
| 288 | /* As for accurate ECN, the TCP_ECN_SEEN flag is set by | 
|---|
| 289 | * tcp_ecn_received_counters() when the ECN codepoint of | 
|---|
| 290 | * received TCP data or ACK contains ECT(0), ECT(1), or CE. | 
|---|
| 291 | */ | 
|---|
| 292 | if (!tcp_ecn_mode_rfc3168(tp)) | 
|---|
| 293 | tp->ecn_flags |= TCP_ECN_SEEN; | 
|---|
| 294 |  | 
|---|
| 295 | /* ACE counter tracks *all* segments including pure ACKs */ | 
|---|
| 296 | tp->received_ce += pcount; | 
|---|
| 297 | tp->received_ce_pending = min(tp->received_ce_pending + pcount, | 
|---|
| 298 | 0xfU); | 
|---|
| 299 |  | 
|---|
| 300 | if (len > 0) { | 
|---|
| 301 | u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); | 
|---|
| 302 | u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; | 
|---|
| 303 | u32 bytes_mask = GENMASK_U32(31, 22); | 
|---|
| 304 |  | 
|---|
| 305 | tp->received_ecn_bytes[ecnfield - 1] += len; | 
|---|
| 306 | tp->accecn_minlen = max_t(u8, tp->accecn_minlen, | 
|---|
| 307 | minlen); | 
|---|
| 308 |  | 
|---|
| 309 | /* Send AccECN option at least once per 2^22-byte | 
|---|
| 310 | * increase in any ECN byte counter. | 
|---|
| 311 | */ | 
|---|
| 312 | if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & | 
|---|
| 313 | bytes_mask) { | 
|---|
| 314 | tcp_accecn_opt_demand_min(sk, opt_demand_min: 1); | 
|---|
| 315 | } | 
|---|
| 316 | } | 
|---|
| 317 | } | 
|---|
| 318 |  | 
|---|
| 319 | ecn_edge = tp->prev_ecnfield != ecnfield; | 
|---|
| 320 | if (ecn_edge || is_ce) { | 
|---|
| 321 | tp->prev_ecnfield = ecnfield; | 
|---|
| 322 | /* Demand Accurate ECN change-triggered ACKs. Two ACK are | 
|---|
| 323 | * demanded to indicate unambiguously the ecnfield value | 
|---|
| 324 | * in the latter ACK. | 
|---|
| 325 | */ | 
|---|
| 326 | if (tcp_ecn_mode_accecn(tp)) { | 
|---|
| 327 | if (ecn_edge) | 
|---|
| 328 | inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; | 
|---|
| 329 | tp->accecn_opt_demand = 2; | 
|---|
| 330 | } | 
|---|
| 331 | } | 
|---|
| 332 | } | 
|---|
| 333 |  | 
|---|
| 334 | /* AccECN specification, 2.2: [...] A Data Receiver maintains four counters | 
|---|
| 335 | * initialized at the start of	the half-connection. [...] These byte counters | 
|---|
| 336 | * reflect only the TCP payload length, excluding TCP header and TCP options. | 
|---|
| 337 | */ | 
|---|
| 338 | static inline void tcp_ecn_received_counters_payload(struct sock *sk, | 
|---|
| 339 | const struct sk_buff *skb) | 
|---|
| 340 | { | 
|---|
| 341 | const struct tcphdr *th = (const struct tcphdr *)skb->data; | 
|---|
| 342 |  | 
|---|
| 343 | tcp_ecn_received_counters(sk, skb, len: skb->len - th->doff * 4); | 
|---|
| 344 | } | 
|---|
| 345 |  | 
|---|
| 346 | /* AccECN specification, 5.1: [...] a server can determine that it | 
|---|
| 347 | * negotiated AccECN as [...] if the ACK contains an ACE field with | 
|---|
| 348 | * the value 0b010 to 0b111 (decimal 2 to 7). | 
|---|
| 349 | */ | 
|---|
| 350 | static inline bool cookie_accecn_ok(const struct tcphdr *th) | 
|---|
| 351 | { | 
|---|
| 352 | return tcp_accecn_ace(th) > 0x1; | 
|---|
| 353 | } | 
|---|
| 354 |  | 
|---|
| 355 | /* Used to form the ACE flags for SYN/ACK */ | 
|---|
| 356 | static inline u16 tcp_accecn_reflector_flags(u8 ect) | 
|---|
| 357 | { | 
|---|
| 358 | /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. | 
|---|
| 359 | * Below is an excerpt from the 1st block of Table 2 of AccECN spec, | 
|---|
| 360 | * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE | 
|---|
| 361 | */ | 
|---|
| 362 | static const u8 ecn_to_ace_flags[4] = { | 
|---|
| 363 | 0b010,	/* Not-ECT is received */ | 
|---|
| 364 | 0b011,	/* ECT(1) is received */ | 
|---|
| 365 | 0b100,	/* ECT(0) is received */ | 
|---|
| 366 | 0b110	/* CE is received */ | 
|---|
| 367 | }; | 
|---|
| 368 |  | 
|---|
| 369 | return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); | 
|---|
| 370 | } | 
|---|
| 371 |  | 
|---|
| 372 | /* AccECN specification, 3.1.2: If a TCP server that implements AccECN | 
|---|
| 373 | * receives a SYN with the three TCP header flags (AE, CWR and ECE) set | 
|---|
| 374 | * to any combination other than 000, 011 or 111, it MUST negotiate the | 
|---|
| 375 | * use of AccECN as if they had been set to 111. | 
|---|
| 376 | */ | 
|---|
| 377 | static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) | 
|---|
| 378 | { | 
|---|
| 379 | u8 ace = tcp_accecn_ace(th); | 
|---|
| 380 |  | 
|---|
| 381 | return ace && ace != 0x3; | 
|---|
| 382 | } | 
|---|
| 383 |  | 
|---|
| 384 | static inline void __tcp_accecn_init_bytes_counters(int *counter_array) | 
|---|
| 385 | { | 
|---|
| 386 | BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); | 
|---|
| 387 | BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); | 
|---|
| 388 | BUILD_BUG_ON(INET_ECN_CE != 0x3); | 
|---|
| 389 |  | 
|---|
| 390 | counter_array[INET_ECN_ECT_1 - 1] = 0; | 
|---|
| 391 | counter_array[INET_ECN_ECT_0 - 1] = 0; | 
|---|
| 392 | counter_array[INET_ECN_CE - 1] = 0; | 
|---|
| 393 | } | 
|---|
| 394 |  | 
|---|
| 395 | static inline void tcp_accecn_init_counters(struct tcp_sock *tp) | 
|---|
| 396 | { | 
|---|
| 397 | tp->received_ce = 0; | 
|---|
| 398 | tp->received_ce_pending = 0; | 
|---|
| 399 | __tcp_accecn_init_bytes_counters(counter_array: tp->received_ecn_bytes); | 
|---|
| 400 | __tcp_accecn_init_bytes_counters(counter_array: tp->delivered_ecn_bytes); | 
|---|
| 401 | tp->accecn_minlen = 0; | 
|---|
| 402 | tp->accecn_opt_demand = 0; | 
|---|
| 403 | tp->est_ecnfield = 0; | 
|---|
| 404 | } | 
|---|
| 405 |  | 
|---|
| 406 | /* Used for make_synack to form the ACE flags */ | 
|---|
| 407 | static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) | 
|---|
| 408 | { | 
|---|
| 409 | /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received | 
|---|
| 410 | * from SYN. Below is an excerpt from Table 2 of the AccECN spec: | 
|---|
| 411 | * +====================+====================================+ | 
|---|
| 412 | * |  IP-ECN codepoint  |  Respective ACE falgs on SYN/ACK   | | 
|---|
| 413 | * |   received on SYN  |       AE       CWR       ECE       | | 
|---|
| 414 | * +====================+====================================+ | 
|---|
| 415 | * |      Not-ECT       |       0         1         0        | | 
|---|
| 416 | * |      ECT(1)        |       0         1         1        | | 
|---|
| 417 | * |      ECT(0)        |       1         0         0        | | 
|---|
| 418 | * |        CE          |       1         1         0        | | 
|---|
| 419 | * +====================+====================================+ | 
|---|
| 420 | */ | 
|---|
| 421 | th->ae = !!(ect & INET_ECN_ECT_0); | 
|---|
| 422 | th->cwr = ect != INET_ECN_ECT_0; | 
|---|
| 423 | th->ece = ect == INET_ECN_ECT_1; | 
|---|
| 424 | } | 
|---|
| 425 |  | 
|---|
| 426 | static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, | 
|---|
| 427 | struct tcphdr *th) | 
|---|
| 428 | { | 
|---|
| 429 | u32 wire_ace; | 
|---|
| 430 |  | 
|---|
| 431 | /* The final packet of the 3WHS or anything like it must reflect | 
|---|
| 432 | * the SYN/ACK ECT instead of putting CEP into ACE field, such | 
|---|
| 433 | * case show up in tcp_flags. | 
|---|
| 434 | */ | 
|---|
| 435 | if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { | 
|---|
| 436 | wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; | 
|---|
| 437 | th->ece = !!(wire_ace & 0x1); | 
|---|
| 438 | th->cwr = !!(wire_ace & 0x2); | 
|---|
| 439 | th->ae = !!(wire_ace & 0x4); | 
|---|
| 440 | tp->received_ce_pending = 0; | 
|---|
| 441 | } | 
|---|
| 442 | } | 
|---|
| 443 |  | 
|---|
| 444 | static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, | 
|---|
| 445 | u8 opt_offset) | 
|---|
| 446 | { | 
|---|
| 447 | u8 *ptr = skb_transport_header(skb) + opt_offset; | 
|---|
| 448 | unsigned int optlen = ptr[1] - 2; | 
|---|
| 449 |  | 
|---|
| 450 | if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) | 
|---|
| 451 | return TCP_ACCECN_OPT_FAIL_SEEN; | 
|---|
| 452 | ptr += 2; | 
|---|
| 453 |  | 
|---|
| 454 | /* Detect option zeroing: an AccECN connection "MAY check that the | 
|---|
| 455 | * initial value of the EE0B field or the EE1B field is non-zero" | 
|---|
| 456 | */ | 
|---|
| 457 | if (optlen < TCPOLEN_ACCECN_PERFIELD) | 
|---|
| 458 | return TCP_ACCECN_OPT_EMPTY_SEEN; | 
|---|
| 459 | if (get_unaligned_be24(p: ptr) == 0) | 
|---|
| 460 | return TCP_ACCECN_OPT_FAIL_SEEN; | 
|---|
| 461 | if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) | 
|---|
| 462 | return TCP_ACCECN_OPT_COUNTER_SEEN; | 
|---|
| 463 | ptr += TCPOLEN_ACCECN_PERFIELD * 2; | 
|---|
| 464 | if (get_unaligned_be24(p: ptr) == 0) | 
|---|
| 465 | return TCP_ACCECN_OPT_FAIL_SEEN; | 
|---|
| 466 |  | 
|---|
| 467 | return TCP_ACCECN_OPT_COUNTER_SEEN; | 
|---|
| 468 | } | 
|---|
| 469 |  | 
|---|
| 470 | /* See Table 2 of the AccECN draft */ | 
|---|
| 471 | static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, | 
|---|
| 472 | const struct tcphdr *th, u8 ip_dsfield) | 
|---|
| 473 | { | 
|---|
| 474 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 475 | u8 ace = tcp_accecn_ace(th); | 
|---|
| 476 |  | 
|---|
| 477 | switch (ace) { | 
|---|
| 478 | case 0x0: | 
|---|
| 479 | case 0x7: | 
|---|
| 480 | /* +========+========+============+=============+ | 
|---|
| 481 | * | A      | B      |  SYN/ACK   |  Feedback   | | 
|---|
| 482 | * |        |        |    B->A    |  Mode of A  | | 
|---|
| 483 | * |        |        | AE CWR ECE |             | | 
|---|
| 484 | * +========+========+============+=============+ | 
|---|
| 485 | * | AccECN | No ECN | 0   0   0  |   Not ECN   | | 
|---|
| 486 | * | AccECN | Broken | 1   1   1  |   Not ECN   | | 
|---|
| 487 | * +========+========+============+=============+ | 
|---|
| 488 | */ | 
|---|
| 489 | tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); | 
|---|
| 490 | break; | 
|---|
| 491 | case 0x1: | 
|---|
| 492 | case 0x5: | 
|---|
| 493 | /* +========+========+============+=============+ | 
|---|
| 494 | * | A      | B      |  SYN/ACK   |  Feedback   | | 
|---|
| 495 | * |        |        |    B->A    |  Mode of A  | | 
|---|
| 496 | * |        |        | AE CWR ECE |             | | 
|---|
| 497 | * +========+========+============+=============+ | 
|---|
| 498 | * | AccECN | Nonce  | 1   0   1  | (Reserved)  | | 
|---|
| 499 | * | AccECN | ECN    | 0   0   1  | Classic ECN | | 
|---|
| 500 | * | Nonce  | AccECN | 0   0   1  | Classic ECN | | 
|---|
| 501 | * | ECN    | AccECN | 0   0   1  | Classic ECN | | 
|---|
| 502 | * +========+========+============+=============+ | 
|---|
| 503 | */ | 
|---|
| 504 | if (tcp_ecn_mode_pending(tp)) | 
|---|
| 505 | /* Downgrade from AccECN, or requested initially */ | 
|---|
| 506 | tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); | 
|---|
| 507 | break; | 
|---|
| 508 | default: | 
|---|
| 509 | tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); | 
|---|
| 510 | tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; | 
|---|
| 511 | if (tp->rx_opt.accecn && | 
|---|
| 512 | tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { | 
|---|
| 513 | u8 saw_opt = tcp_accecn_option_init(skb, opt_offset: tp->rx_opt.accecn); | 
|---|
| 514 |  | 
|---|
| 515 | tcp_accecn_saw_opt_fail_recv(tp, saw_opt); | 
|---|
| 516 | tp->accecn_opt_demand = 2; | 
|---|
| 517 | } | 
|---|
| 518 | if (INET_ECN_is_ce(dsfield: ip_dsfield) && | 
|---|
| 519 | tcp_accecn_validate_syn_feedback(sk, ace, | 
|---|
| 520 | sent_ect: tp->syn_ect_snt)) { | 
|---|
| 521 | tp->received_ce++; | 
|---|
| 522 | tp->received_ce_pending++; | 
|---|
| 523 | } | 
|---|
| 524 | break; | 
|---|
| 525 | } | 
|---|
| 526 | } | 
|---|
| 527 |  | 
|---|
| 528 | static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, | 
|---|
| 529 | const struct sk_buff *skb) | 
|---|
| 530 | { | 
|---|
| 531 | if (tcp_ecn_mode_pending(tp)) { | 
|---|
| 532 | if (!tcp_accecn_syn_requested(th)) { | 
|---|
| 533 | /* Downgrade to classic ECN feedback */ | 
|---|
| 534 | tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); | 
|---|
| 535 | } else { | 
|---|
| 536 | tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & | 
|---|
| 537 | INET_ECN_MASK; | 
|---|
| 538 | tp->prev_ecnfield = tp->syn_ect_rcv; | 
|---|
| 539 | tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); | 
|---|
| 540 | } | 
|---|
| 541 | } | 
|---|
| 542 | if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) | 
|---|
| 543 | tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); | 
|---|
| 544 | } | 
|---|
| 545 |  | 
|---|
| 546 | static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, | 
|---|
| 547 | const struct tcphdr *th) | 
|---|
| 548 | { | 
|---|
| 549 | if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) | 
|---|
| 550 | return true; | 
|---|
| 551 | return false; | 
|---|
| 552 | } | 
|---|
| 553 |  | 
|---|
| 554 | /* Packet ECN state for a SYN-ACK */ | 
|---|
| 555 | static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) | 
|---|
| 556 | { | 
|---|
| 557 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 558 |  | 
|---|
| 559 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; | 
|---|
| 560 | if (tcp_ecn_disabled(tp)) | 
|---|
| 561 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; | 
|---|
| 562 | else if (tcp_ca_needs_ecn(sk) || | 
|---|
| 563 | tcp_bpf_ca_needs_ecn(sk)) | 
|---|
| 564 | INET_ECN_xmit(sk); | 
|---|
| 565 |  | 
|---|
| 566 | if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { | 
|---|
| 567 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; | 
|---|
| 568 | TCP_SKB_CB(skb)->tcp_flags |= | 
|---|
| 569 | tcp_accecn_reflector_flags(ect: tp->syn_ect_rcv); | 
|---|
| 570 | tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; | 
|---|
| 571 | } | 
|---|
| 572 | } | 
|---|
| 573 |  | 
|---|
| 574 | /* Packet ECN state for a SYN.  */ | 
|---|
| 575 | static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) | 
|---|
| 576 | { | 
|---|
| 577 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 578 | bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); | 
|---|
| 579 | bool use_ecn, use_accecn; | 
|---|
| 580 | u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); | 
|---|
| 581 |  | 
|---|
| 582 | use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; | 
|---|
| 583 | use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || | 
|---|
| 584 | tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || | 
|---|
| 585 | tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; | 
|---|
| 586 |  | 
|---|
| 587 | if (!use_ecn) { | 
|---|
| 588 | const struct dst_entry *dst = __sk_dst_get(sk); | 
|---|
| 589 |  | 
|---|
| 590 | if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) | 
|---|
| 591 | use_ecn = true; | 
|---|
| 592 | } | 
|---|
| 593 |  | 
|---|
| 594 | tp->ecn_flags = 0; | 
|---|
| 595 |  | 
|---|
| 596 | if (use_ecn) { | 
|---|
| 597 | if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) | 
|---|
| 598 | INET_ECN_xmit(sk); | 
|---|
| 599 |  | 
|---|
| 600 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; | 
|---|
| 601 | if (use_accecn) { | 
|---|
| 602 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; | 
|---|
| 603 | tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); | 
|---|
| 604 | tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; | 
|---|
| 605 | } else { | 
|---|
| 606 | tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); | 
|---|
| 607 | } | 
|---|
| 608 | } | 
|---|
| 609 | } | 
|---|
| 610 |  | 
|---|
| 611 | static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) | 
|---|
| 612 | { | 
|---|
| 613 | if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { | 
|---|
| 614 | /* tp->ecn_flags are cleared at a later point in time when | 
|---|
| 615 | * SYN ACK is ultimatively being received. | 
|---|
| 616 | */ | 
|---|
| 617 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; | 
|---|
| 618 | } | 
|---|
| 619 | } | 
|---|
| 620 |  | 
|---|
| 621 | static inline void | 
|---|
| 622 | tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) | 
|---|
| 623 | { | 
|---|
| 624 | if (tcp_rsk(req)->accecn_ok) | 
|---|
| 625 | tcp_accecn_echo_syn_ect(th, ect: tcp_rsk(req)->syn_ect_rcv); | 
|---|
| 626 | else if (inet_rsk(sk: req)->ecn_ok) | 
|---|
| 627 | th->ece = 1; | 
|---|
| 628 | } | 
|---|
| 629 |  | 
|---|
| 630 | static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) | 
|---|
| 631 | { | 
|---|
| 632 | u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); | 
|---|
| 633 | const struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 634 |  | 
|---|
| 635 | if (!ecn_beacon) | 
|---|
| 636 | return false; | 
|---|
| 637 |  | 
|---|
| 638 | return tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: tp->accecn_opt_tstamp) * ecn_beacon >= | 
|---|
| 639 | (tp->srtt_us >> 3); | 
|---|
| 640 | } | 
|---|
| 641 |  | 
|---|
#endif /* _TCP_ECN_H */
|---|
| 643 |  | 
|---|