| 1 | // SPDX-License-Identifier: GPL-2.0-only | 
|---|
| 2 | /* | 
|---|
| 3 | * INET		An implementation of the TCP/IP protocol suite for the LINUX | 
|---|
| 4 | *		operating system.  INET is implemented using the  BSD Socket | 
|---|
| 5 | *		interface as the means of communication with the user level. | 
|---|
| 6 | * | 
|---|
| 7 | *		Implementation of the Transmission Control Protocol(TCP). | 
|---|
| 8 | * | 
|---|
| 9 | * Authors:	Ross Biro | 
|---|
| 10 | *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 
|---|
| 11 | *		Mark Evans, <evansmp@uhura.aston.ac.uk> | 
|---|
| 12 | *		Corey Minyard <wf-rch!minyard@relay.EU.net> | 
|---|
| 13 | *		Florian La Roche, <flla@stud.uni-sb.de> | 
|---|
| 14 | *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu> | 
|---|
| 15 | *		Linus Torvalds, <torvalds@cs.helsinki.fi> | 
|---|
| 16 | *		Alan Cox, <gw4pts@gw4pts.ampr.org> | 
|---|
| 17 | *		Matthew Dillon, <dillon@apollo.west.oic.com> | 
|---|
| 18 | *		Arnt Gulbrandsen, <agulbra@nvg.unit.no> | 
|---|
| 19 | *		Jorge Cwik, <jorge@laser.satlink.net> | 
|---|
| 20 | */ | 
|---|
| 21 |  | 
|---|
| 22 | #include <net/tcp.h> | 
|---|
| 23 | #include <net/tcp_ecn.h> | 
|---|
| 24 | #include <net/xfrm.h> | 
|---|
| 25 | #include <net/busy_poll.h> | 
|---|
| 26 | #include <net/rstreason.h> | 
|---|
| 27 | #include <net/psp.h> | 
|---|
| 28 |  | 
|---|
| 29 | static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) | 
|---|
| 30 | { | 
|---|
| 31 | if (seq == s_win) | 
|---|
| 32 | return true; | 
|---|
| 33 | if (after(end_seq, s_win) && before(seq1: seq, seq2: e_win)) | 
|---|
| 34 | return true; | 
|---|
| 35 | return seq == e_win && seq == end_seq; | 
|---|
| 36 | } | 
|---|
| 37 |  | 
|---|
| 38 | static enum tcp_tw_status | 
|---|
| 39 | tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, | 
|---|
| 40 | const struct sk_buff *skb, int mib_idx) | 
|---|
| 41 | { | 
|---|
| 42 | struct tcp_timewait_sock *tcptw = tcp_twsk(sk: (struct sock *)tw); | 
|---|
| 43 |  | 
|---|
| 44 | if (!tcp_oow_rate_limited(net: twsk_net(twsk: tw), skb, mib_idx, | 
|---|
| 45 | last_oow_ack_time: &tcptw->tw_last_oow_ack_time)) { | 
|---|
| 46 | /* Send ACK. Note, we do not put the bucket, | 
|---|
| 47 | * it will be released by caller. | 
|---|
| 48 | */ | 
|---|
| 49 | return TCP_TW_ACK_OOW; | 
|---|
| 50 | } | 
|---|
| 51 |  | 
|---|
| 52 | /* We are rate-limiting, so just release the tw sock and drop skb. */ | 
|---|
| 53 | inet_twsk_put(tw); | 
|---|
| 54 | return TCP_TW_SUCCESS; | 
|---|
| 55 | } | 
|---|
| 56 |  | 
|---|
| 57 | static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, | 
|---|
| 58 | u32 rcv_nxt) | 
|---|
| 59 | { | 
|---|
| 60 | #ifdef CONFIG_TCP_AO | 
|---|
| 61 | struct tcp_ao_info *ao; | 
|---|
| 62 |  | 
|---|
| 63 | ao = rcu_dereference(tcptw->ao_info); | 
|---|
| 64 | if (unlikely(ao && seq < rcv_nxt)) | 
|---|
| 65 | WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1); | 
|---|
| 66 | #endif | 
|---|
| 67 | WRITE_ONCE(tcptw->tw_rcv_nxt, seq); | 
|---|
| 68 | } | 
|---|
| 69 |  | 
|---|
| 70 | /* | 
|---|
| 71 | * * Main purpose of TIME-WAIT state is to close connection gracefully, | 
|---|
| 72 | *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN | 
|---|
| 73 | *   (and, probably, tail of data) and one or more our ACKs are lost. | 
|---|
| 74 | * * What is TIME-WAIT timeout? It is associated with maximal packet | 
|---|
| 75 | *   lifetime in the internet, which results in wrong conclusion, that | 
|---|
| 76 | *   it is set to catch "old duplicate segments" wandering out of their path. | 
|---|
| 77 | *   It is not quite correct. This timeout is calculated so that it exceeds | 
|---|
| 78 | *   maximal retransmission timeout enough to allow to lose one (or more) | 
|---|
| 79 | *   segments sent by peer and our ACKs. This time may be calculated from RTO. | 
|---|
| 80 | * * When TIME-WAIT socket receives RST, it means that another end | 
|---|
| 81 | *   finally closed and we are allowed to kill TIME-WAIT too. | 
|---|
| 82 | * * Second purpose of TIME-WAIT is catching old duplicate segments. | 
|---|
| 83 | *   Well, certainly it is pure paranoia, but if we load TIME-WAIT | 
|---|
| 84 | *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. | 
|---|
| 85 | * * If we invented some more clever way to catch duplicates | 
|---|
| 86 | *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. | 
|---|
| 87 | * | 
|---|
| 88 | * The algorithm below is based on FORMAL INTERPRETATION of RFCs. | 
|---|
| 89 | * When you compare it to RFCs, please, read section SEGMENT ARRIVES | 
|---|
| 90 | * from the very beginning. | 
|---|
| 91 | * | 
|---|
| 92 | * NOTE. With recycling (and later with fin-wait-2) TW bucket | 
|---|
| 93 | * is _not_ stateless. It means, that strictly speaking we must | 
|---|
| 94 | * spinlock it. I do not want! Well, probability of misbehaviour | 
|---|
| 95 | * is ridiculously low and, seems, we could use some mb() tricks | 
|---|
| 96 | * to avoid misread sequence numbers, states etc.  --ANK | 
|---|
| 97 | * | 
|---|
| 98 | * We don't need to initialize tmp_out.sack_ok as we don't use the results | 
|---|
| 99 | */ | 
|---|
| 100 | enum tcp_tw_status | 
|---|
| 101 | tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, | 
|---|
| 102 | const struct tcphdr *th, u32 *tw_isn, | 
|---|
| 103 | enum skb_drop_reason *drop_reason) | 
|---|
| 104 | { | 
|---|
| 105 | struct tcp_timewait_sock *tcptw = tcp_twsk(sk: (struct sock *)tw); | 
|---|
| 106 | u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); | 
|---|
| 107 | struct tcp_options_received tmp_opt; | 
|---|
| 108 | enum skb_drop_reason psp_drop; | 
|---|
| 109 | bool paws_reject = false; | 
|---|
| 110 | int ts_recent_stamp; | 
|---|
| 111 |  | 
|---|
| 112 | /* Instead of dropping immediately, wait to see what value is | 
|---|
| 113 | * returned. We will accept a non psp-encapsulated syn in the | 
|---|
| 114 | * case where TCP_TW_SYN is returned. | 
|---|
| 115 | */ | 
|---|
| 116 | psp_drop = psp_twsk_rx_policy_check(tw, skb); | 
|---|
| 117 |  | 
|---|
| 118 | tmp_opt.saw_tstamp = 0; | 
|---|
| 119 | ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); | 
|---|
| 120 | if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { | 
|---|
| 121 | tcp_parse_options(net: twsk_net(twsk: tw), skb, opt_rx: &tmp_opt, estab: 0, NULL); | 
|---|
| 122 |  | 
|---|
| 123 | if (tmp_opt.saw_tstamp) { | 
|---|
| 124 | if (tmp_opt.rcv_tsecr) | 
|---|
| 125 | tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; | 
|---|
| 126 | tmp_opt.ts_recent	= READ_ONCE(tcptw->tw_ts_recent); | 
|---|
| 127 | tmp_opt.ts_recent_stamp	= ts_recent_stamp; | 
|---|
| 128 | paws_reject = tcp_paws_reject(rx_opt: &tmp_opt, rst: th->rst); | 
|---|
| 129 | } | 
|---|
| 130 | } | 
|---|
| 131 |  | 
|---|
| 132 | if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { | 
|---|
| 133 | /* Just repeat all the checks of tcp_rcv_state_process() */ | 
|---|
| 134 |  | 
|---|
| 135 | if (psp_drop) | 
|---|
| 136 | goto out_put; | 
|---|
| 137 |  | 
|---|
| 138 | /* Out of window, send ACK */ | 
|---|
| 139 | if (paws_reject || | 
|---|
| 140 | !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 
|---|
| 141 | s_win: rcv_nxt, | 
|---|
| 142 | e_win: rcv_nxt + tcptw->tw_rcv_wnd)) | 
|---|
| 143 | return tcp_timewait_check_oow_rate_limit( | 
|---|
| 144 | tw, skb, mib_idx: LINUX_MIB_TCPACKSKIPPEDFINWAIT2); | 
|---|
| 145 |  | 
|---|
| 146 | if (th->rst) | 
|---|
| 147 | goto kill; | 
|---|
| 148 |  | 
|---|
| 149 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, seq2: rcv_nxt)) | 
|---|
| 150 | return TCP_TW_RST; | 
|---|
| 151 |  | 
|---|
| 152 | /* Dup ACK? */ | 
|---|
| 153 | if (!th->ack || | 
|---|
| 154 | !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) || | 
|---|
| 155 | TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { | 
|---|
| 156 | inet_twsk_put(tw); | 
|---|
| 157 | return TCP_TW_SUCCESS; | 
|---|
| 158 | } | 
|---|
| 159 |  | 
|---|
| 160 | /* New data or FIN. If new data arrive after half-duplex close, | 
|---|
| 161 | * reset. | 
|---|
| 162 | */ | 
|---|
| 163 | if (!th->fin || | 
|---|
| 164 | TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1) | 
|---|
| 165 | return TCP_TW_RST; | 
|---|
| 166 |  | 
|---|
| 167 | /* FIN arrived, enter true time-wait state. */ | 
|---|
| 168 | WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT); | 
|---|
| 169 | twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq, | 
|---|
| 170 | rcv_nxt); | 
|---|
| 171 |  | 
|---|
| 172 | if (tmp_opt.saw_tstamp) { | 
|---|
| 173 | u64 ts = tcp_clock_ms(); | 
|---|
| 174 |  | 
|---|
| 175 | WRITE_ONCE(tw->tw_entry_stamp, ts); | 
|---|
| 176 | WRITE_ONCE(tcptw->tw_ts_recent_stamp, | 
|---|
| 177 | div_u64(ts, MSEC_PER_SEC)); | 
|---|
| 178 | WRITE_ONCE(tcptw->tw_ts_recent, | 
|---|
| 179 | tmp_opt.rcv_tsval); | 
|---|
| 180 | } | 
|---|
| 181 |  | 
|---|
| 182 | inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); | 
|---|
| 183 | return TCP_TW_ACK; | 
|---|
| 184 | } | 
|---|
| 185 |  | 
|---|
| 186 | /* | 
|---|
| 187 | *	Now real TIME-WAIT state. | 
|---|
| 188 | * | 
|---|
| 189 | *	RFC 1122: | 
|---|
| 190 | *	"When a connection is [...] on TIME-WAIT state [...] | 
|---|
| 191 | *	[a TCP] MAY accept a new SYN from the remote TCP to | 
|---|
| 192 | *	reopen the connection directly, if it: | 
|---|
| 193 | * | 
|---|
| 194 | *	(1)  assigns its initial sequence number for the new | 
|---|
| 195 | *	connection to be larger than the largest sequence | 
|---|
| 196 | *	number it used on the previous connection incarnation, | 
|---|
| 197 | *	and | 
|---|
| 198 | * | 
|---|
| 199 | *	(2)  returns to TIME-WAIT state if the SYN turns out | 
|---|
| 200 | *	to be an old duplicate". | 
|---|
| 201 | */ | 
|---|
| 202 |  | 
|---|
| 203 | if (!paws_reject && | 
|---|
| 204 | (TCP_SKB_CB(skb)->seq == rcv_nxt && | 
|---|
| 205 | (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { | 
|---|
| 206 | /* In window segment, it may be only reset or bare ack. */ | 
|---|
| 207 |  | 
|---|
| 208 | if (psp_drop) | 
|---|
| 209 | goto out_put; | 
|---|
| 210 |  | 
|---|
| 211 | if (th->rst) { | 
|---|
| 212 | /* This is TIME_WAIT assassination, in two flavors. | 
|---|
| 213 | * Oh well... nobody has a sufficient solution to this | 
|---|
| 214 | * protocol bug yet. | 
|---|
| 215 | */ | 
|---|
| 216 | if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { | 
|---|
| 217 | kill: | 
|---|
| 218 | inet_twsk_deschedule_put(tw); | 
|---|
| 219 | return TCP_TW_SUCCESS; | 
|---|
| 220 | } | 
|---|
| 221 | } else { | 
|---|
| 222 | inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); | 
|---|
| 223 | } | 
|---|
| 224 |  | 
|---|
| 225 | if (tmp_opt.saw_tstamp) { | 
|---|
| 226 | WRITE_ONCE(tcptw->tw_ts_recent, | 
|---|
| 227 | tmp_opt.rcv_tsval); | 
|---|
| 228 | WRITE_ONCE(tcptw->tw_ts_recent_stamp, | 
|---|
| 229 | ktime_get_seconds()); | 
|---|
| 230 | } | 
|---|
| 231 |  | 
|---|
| 232 | inet_twsk_put(tw); | 
|---|
| 233 | return TCP_TW_SUCCESS; | 
|---|
| 234 | } | 
|---|
| 235 |  | 
|---|
| 236 | /* Out of window segment. | 
|---|
| 237 |  | 
|---|
| 238 | All the segments are ACKed immediately. | 
|---|
| 239 |  | 
|---|
| 240 | The only exception is new SYN. We accept it, if it is | 
|---|
| 241 | not old duplicate and we are not in danger to be killed | 
|---|
| 242 | by delayed old duplicates. RFC check is that it has | 
|---|
| 243 | newer sequence number works at rates <40Mbit/sec. | 
|---|
| 244 | However, if paws works, it is reliable AND even more, | 
|---|
| 245 | we even may relax silly seq space cutoff. | 
|---|
| 246 |  | 
|---|
| 247 | RED-PEN: we violate main RFC requirement, if this SYN will appear | 
|---|
| 248 | old duplicate (i.e. we receive RST in reply to SYN-ACK), | 
|---|
| 249 | we must return socket to time-wait state. It is not good, | 
|---|
| 250 | but not fatal yet. | 
|---|
| 251 | */ | 
|---|
| 252 |  | 
|---|
| 253 | if (th->syn && !th->rst && !th->ack && !paws_reject && | 
|---|
| 254 | (after(TCP_SKB_CB(skb)->seq, rcv_nxt) || | 
|---|
| 255 | (tmp_opt.saw_tstamp && | 
|---|
| 256 | (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { | 
|---|
| 257 | u32 isn = tcptw->tw_snd_nxt + 65535 + 2; | 
|---|
| 258 | if (isn == 0) | 
|---|
| 259 | isn++; | 
|---|
| 260 | *tw_isn = isn; | 
|---|
| 261 | return TCP_TW_SYN; | 
|---|
| 262 | } | 
|---|
| 263 |  | 
|---|
| 264 | if (psp_drop) | 
|---|
| 265 | goto out_put; | 
|---|
| 266 |  | 
|---|
| 267 | if (paws_reject) { | 
|---|
| 268 | *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; | 
|---|
| 269 | __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); | 
|---|
| 270 | } | 
|---|
| 271 |  | 
|---|
| 272 | if (!th->rst) { | 
|---|
| 273 | /* In this case we must reset the TIMEWAIT timer. | 
|---|
| 274 | * | 
|---|
| 275 | * If it is ACKless SYN it may be both old duplicate | 
|---|
| 276 | * and new good SYN with random sequence number <rcv_nxt. | 
|---|
| 277 | * Do not reschedule in the last case. | 
|---|
| 278 | */ | 
|---|
| 279 | if (paws_reject || th->ack) | 
|---|
| 280 | inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); | 
|---|
| 281 |  | 
|---|
| 282 | return tcp_timewait_check_oow_rate_limit( | 
|---|
| 283 | tw, skb, mib_idx: LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); | 
|---|
| 284 | } | 
|---|
| 285 |  | 
|---|
| 286 | out_put: | 
|---|
| 287 | inet_twsk_put(tw); | 
|---|
| 288 | return TCP_TW_SUCCESS; | 
|---|
| 289 | } | 
|---|
| 290 | EXPORT_IPV6_MOD(tcp_timewait_state_process); | 
|---|
| 291 |  | 
|---|
/*
 * Initialise the MD5 key slot of a freshly created timewait socket.
 *
 * Without CONFIG_TCP_MD5SIG this is a no-op.  Otherwise tw_md5_key is
 * cleared and, if the tcp_md5_needed static branch is enabled and @sk has a
 * key for its own peer, a private copy of that key is attached to @tcptw.
 * If the copy cannot be allocated the key is silently left NULL; if the
 * static key cannot be incremented the copy is freed again after a warning.
 */
static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
{
#ifdef CONFIG_TCP_MD5SIG
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;

	/*
	 * The timewait bucket does not have the key DB from the
	 * sock structure. We just make a quick copy of the
	 * md5 key being used (if indeed we are using one)
	 * so the timewait ack generating code has the key.
	 */
	tcptw->tw_md5_key = NULL;
	if (!static_branch_unlikely(&tcp_md5_needed.key))
		return;

	key = tp->af_specific->md5_lookup(sk, sk);
	if (key) {
		tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
		if (!tcptw->tw_md5_key)
			return;
		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
			goto out_free;
		tcp_md5_add_sigpool();
	}
	return;
out_free:
	WARN_ON_ONCE(1);
	kfree(tcptw->tw_md5_key);
	tcptw->tw_md5_key = NULL;
#endif
}
|---|
| 324 |  | 
|---|
| 325 | /* | 
|---|
| 326 | * Move a socket to time-wait or dead fin-wait-2 state. | 
|---|
| 327 | */ | 
|---|
| 328 | void tcp_time_wait(struct sock *sk, int state, int timeo) | 
|---|
| 329 | { | 
|---|
| 330 | const struct inet_connection_sock *icsk = inet_csk(sk); | 
|---|
| 331 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 332 | struct net *net = sock_net(sk); | 
|---|
| 333 | struct inet_timewait_sock *tw; | 
|---|
| 334 |  | 
|---|
| 335 | tw = inet_twsk_alloc(sk, dr: &net->ipv4.tcp_death_row, state); | 
|---|
| 336 |  | 
|---|
| 337 | if (tw) { | 
|---|
| 338 | struct tcp_timewait_sock *tcptw = tcp_twsk(sk: (struct sock *)tw); | 
|---|
| 339 | const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); | 
|---|
| 340 |  | 
|---|
| 341 | tw->tw_transparent	= inet_test_bit(TRANSPARENT, sk); | 
|---|
| 342 | tw->tw_mark		= sk->sk_mark; | 
|---|
| 343 | tw->tw_priority		= READ_ONCE(sk->sk_priority); | 
|---|
| 344 | tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale; | 
|---|
| 345 | /* refreshed when we enter true TIME-WAIT state */ | 
|---|
| 346 | tw->tw_entry_stamp	= tcp_time_stamp_ms(tp); | 
|---|
| 347 | tcptw->tw_rcv_nxt	= tp->rcv_nxt; | 
|---|
| 348 | tcptw->tw_snd_nxt	= tp->snd_nxt; | 
|---|
| 349 | tcptw->tw_rcv_wnd	= tcp_receive_window(tp); | 
|---|
| 350 | tcptw->tw_ts_recent	= tp->rx_opt.ts_recent; | 
|---|
| 351 | tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; | 
|---|
| 352 | tcptw->tw_ts_offset	= tp->tsoffset; | 
|---|
| 353 | tw->tw_usec_ts		= tp->tcp_usec_ts; | 
|---|
| 354 | tcptw->tw_last_oow_ack_time = 0; | 
|---|
| 355 | tcptw->tw_tx_delay	= tp->tcp_tx_delay; | 
|---|
| 356 | tw->tw_txhash		= sk->sk_txhash; | 
|---|
| 357 | tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping; | 
|---|
| 358 | #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING | 
|---|
| 359 | tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping; | 
|---|
| 360 | #endif | 
|---|
| 361 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 362 | if (tw->tw_family == PF_INET6) { | 
|---|
| 363 | struct ipv6_pinfo *np = inet6_sk(sk: sk); | 
|---|
| 364 |  | 
|---|
| 365 | tw->tw_v6_daddr = sk->sk_v6_daddr; | 
|---|
| 366 | tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; | 
|---|
| 367 | tw->tw_tclass = np->tclass; | 
|---|
| 368 | tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); | 
|---|
| 369 | tw->tw_ipv6only = sk->sk_ipv6only; | 
|---|
| 370 | } | 
|---|
| 371 | #endif | 
|---|
| 372 |  | 
|---|
| 373 | tcp_time_wait_init(sk, tcptw); | 
|---|
| 374 | tcp_ao_time_wait(tcptw, tp); | 
|---|
| 375 |  | 
|---|
| 376 | /* Get the TIME_WAIT timeout firing. */ | 
|---|
| 377 | if (timeo < rto) | 
|---|
| 378 | timeo = rto; | 
|---|
| 379 |  | 
|---|
| 380 | if (state == TCP_TIME_WAIT) | 
|---|
| 381 | timeo = TCP_TIMEWAIT_LEN; | 
|---|
| 382 |  | 
|---|
| 383 | /* Linkage updates. | 
|---|
| 384 | * Note that access to tw after this point is illegal. | 
|---|
| 385 | */ | 
|---|
| 386 | inet_twsk_hashdance_schedule(tw, sk, hashinfo: net->ipv4.tcp_death_row.hashinfo, timeo); | 
|---|
| 387 | } else { | 
|---|
| 388 | /* Sorry, if we're out of memory, just CLOSE this | 
|---|
| 389 | * socket up.  We've got bigger problems than | 
|---|
| 390 | * non-graceful socket closings. | 
|---|
| 391 | */ | 
|---|
| 392 | NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); | 
|---|
| 393 | } | 
|---|
| 394 |  | 
|---|
| 395 | tcp_update_metrics(sk); | 
|---|
| 396 | tcp_done(sk); | 
|---|
| 397 | } | 
|---|
| 398 | EXPORT_SYMBOL(tcp_time_wait); | 
|---|
| 399 |  | 
|---|
| 400 | void tcp_twsk_destructor(struct sock *sk) | 
|---|
| 401 | { | 
|---|
| 402 | #ifdef CONFIG_TCP_MD5SIG | 
|---|
| 403 | if (static_branch_unlikely(&tcp_md5_needed.key)) { | 
|---|
| 404 | struct tcp_timewait_sock *twsk = tcp_twsk(sk); | 
|---|
| 405 |  | 
|---|
| 406 | if (twsk->tw_md5_key) { | 
|---|
| 407 | kfree(objp: twsk->tw_md5_key); | 
|---|
| 408 | static_branch_slow_dec_deferred(&tcp_md5_needed); | 
|---|
| 409 | tcp_md5_release_sigpool(); | 
|---|
| 410 | } | 
|---|
| 411 | } | 
|---|
| 412 | #endif | 
|---|
| 413 | tcp_ao_destroy_sock(sk, twsk: true); | 
|---|
| 414 | psp_twsk_assoc_free(tw: inet_twsk(sk)); | 
|---|
| 415 | } | 
|---|
| 416 |  | 
|---|
| 417 | void tcp_twsk_purge(struct list_head *net_exit_list) | 
|---|
| 418 | { | 
|---|
| 419 | bool purged_once = false; | 
|---|
| 420 | struct net *net; | 
|---|
| 421 |  | 
|---|
| 422 | list_for_each_entry(net, net_exit_list, exit_list) { | 
|---|
| 423 | if (net->ipv4.tcp_death_row.hashinfo->pernet) { | 
|---|
| 424 | /* Even if tw_refcount == 1, we must clean up kernel reqsk */ | 
|---|
| 425 | inet_twsk_purge(hashinfo: net->ipv4.tcp_death_row.hashinfo); | 
|---|
| 426 | } else if (!purged_once) { | 
|---|
| 427 | inet_twsk_purge(hashinfo: &tcp_hashinfo); | 
|---|
| 428 | purged_once = true; | 
|---|
| 429 | } | 
|---|
| 430 | } | 
|---|
| 431 | } | 
|---|
| 432 |  | 
|---|
| 433 | /* Warning : This function is called without sk_listener being locked. | 
|---|
| 434 | * Be sure to read socket fields once, as their value could change under us. | 
|---|
| 435 | */ | 
|---|
| 436 | void tcp_openreq_init_rwin(struct request_sock *req, | 
|---|
| 437 | const struct sock *sk_listener, | 
|---|
| 438 | const struct dst_entry *dst) | 
|---|
| 439 | { | 
|---|
| 440 | struct inet_request_sock *ireq = inet_rsk(sk: req); | 
|---|
| 441 | const struct tcp_sock *tp = tcp_sk(sk_listener); | 
|---|
| 442 | int full_space = tcp_full_space(sk: sk_listener); | 
|---|
| 443 | u32 window_clamp; | 
|---|
| 444 | __u8 rcv_wscale; | 
|---|
| 445 | u32 rcv_wnd; | 
|---|
| 446 | int mss; | 
|---|
| 447 |  | 
|---|
| 448 | mss = tcp_mss_clamp(tp, mss: dst_metric_advmss(dst)); | 
|---|
| 449 | window_clamp = READ_ONCE(tp->window_clamp); | 
|---|
| 450 | /* Set this up on the first call only */ | 
|---|
| 451 | req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); | 
|---|
| 452 |  | 
|---|
| 453 | /* limit the window selection if the user enforce a smaller rx buffer */ | 
|---|
| 454 | if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && | 
|---|
| 455 | (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) | 
|---|
| 456 | req->rsk_window_clamp = full_space; | 
|---|
| 457 |  | 
|---|
| 458 | rcv_wnd = tcp_rwnd_init_bpf(sk: (struct sock *)req); | 
|---|
| 459 | if (rcv_wnd == 0) | 
|---|
| 460 | rcv_wnd = dst_metric(dst, RTAX_INITRWND); | 
|---|
| 461 | else if (full_space < rcv_wnd * mss) | 
|---|
| 462 | full_space = rcv_wnd * mss; | 
|---|
| 463 |  | 
|---|
| 464 | /* tcp_full_space because it is guaranteed to be the first packet */ | 
|---|
| 465 | tcp_select_initial_window(sk: sk_listener, space: full_space, | 
|---|
| 466 | mss: mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), | 
|---|
| 467 | rcv_wnd: &req->rsk_rcv_wnd, | 
|---|
| 468 | window_clamp: &req->rsk_window_clamp, | 
|---|
| 469 | wscale_ok: ireq->wscale_ok, | 
|---|
| 470 | rcv_wscale: &rcv_wscale, | 
|---|
| 471 | init_rcv_wnd: rcv_wnd); | 
|---|
| 472 | ireq->rcv_wscale = rcv_wscale; | 
|---|
| 473 | } | 
|---|
| 474 |  | 
|---|
| 475 | static void tcp_ecn_openreq_child(struct sock *sk, | 
|---|
| 476 | const struct request_sock *req, | 
|---|
| 477 | const struct sk_buff *skb) | 
|---|
| 478 | { | 
|---|
| 479 | const struct tcp_request_sock *treq = tcp_rsk(req); | 
|---|
| 480 | struct tcp_sock *tp = tcp_sk(sk); | 
|---|
| 481 |  | 
|---|
| 482 | if (treq->accecn_ok) { | 
|---|
| 483 | tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); | 
|---|
| 484 | tp->syn_ect_snt = treq->syn_ect_snt; | 
|---|
| 485 | tcp_accecn_third_ack(sk, skb, sent_ect: treq->syn_ect_snt); | 
|---|
| 486 | tp->saw_accecn_opt = treq->saw_accecn_opt; | 
|---|
| 487 | tp->prev_ecnfield = treq->syn_ect_rcv; | 
|---|
| 488 | tp->accecn_opt_demand = 1; | 
|---|
| 489 | tcp_ecn_received_counters_payload(sk, skb); | 
|---|
| 490 | } else { | 
|---|
| 491 | tcp_ecn_mode_set(tp, mode: inet_rsk(sk: req)->ecn_ok ? | 
|---|
| 492 | TCP_ECN_MODE_RFC3168 : | 
|---|
| 493 | TCP_ECN_DISABLED); | 
|---|
| 494 | } | 
|---|
| 495 | } | 
|---|
| 496 |  | 
|---|
| 497 | void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) | 
|---|
| 498 | { | 
|---|
| 499 | struct inet_connection_sock *icsk = inet_csk(sk); | 
|---|
| 500 | u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); | 
|---|
| 501 | bool ca_got_dst = false; | 
|---|
| 502 |  | 
|---|
| 503 | if (ca_key != TCP_CA_UNSPEC) { | 
|---|
| 504 | const struct tcp_congestion_ops *ca; | 
|---|
| 505 |  | 
|---|
| 506 | rcu_read_lock(); | 
|---|
| 507 | ca = tcp_ca_find_key(key: ca_key); | 
|---|
| 508 | if (likely(ca && bpf_try_module_get(ca, ca->owner))) { | 
|---|
| 509 | icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); | 
|---|
| 510 | icsk->icsk_ca_ops = ca; | 
|---|
| 511 | ca_got_dst = true; | 
|---|
| 512 | } | 
|---|
| 513 | rcu_read_unlock(); | 
|---|
| 514 | } | 
|---|
| 515 |  | 
|---|
| 516 | /* If no valid choice made yet, assign current system default ca. */ | 
|---|
| 517 | if (!ca_got_dst && | 
|---|
| 518 | (!icsk->icsk_ca_setsockopt || | 
|---|
| 519 | !bpf_try_module_get(data: icsk->icsk_ca_ops, owner: icsk->icsk_ca_ops->owner))) | 
|---|
| 520 | tcp_assign_congestion_control(sk); | 
|---|
| 521 |  | 
|---|
| 522 | tcp_set_ca_state(sk, ca_state: TCP_CA_Open); | 
|---|
| 523 | } | 
|---|
| 524 | EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); | 
|---|
| 525 |  | 
|---|
| 526 | static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, | 
|---|
| 527 | struct request_sock *req, | 
|---|
| 528 | struct tcp_sock *newtp) | 
|---|
| 529 | { | 
|---|
| 530 | #if IS_ENABLED(CONFIG_SMC) | 
|---|
| 531 | struct inet_request_sock *ireq; | 
|---|
| 532 |  | 
|---|
| 533 | if (static_branch_unlikely(&tcp_have_smc)) { | 
|---|
| 534 | ireq = inet_rsk(req); | 
|---|
| 535 | if (oldtp->syn_smc && !ireq->smc_ok) | 
|---|
| 536 | newtp->syn_smc = 0; | 
|---|
| 537 | } | 
|---|
| 538 | #endif | 
|---|
| 539 | } | 
|---|
| 540 |  | 
|---|
| 541 | /* This is not only more efficient than what we used to do, it eliminates | 
|---|
| 542 | * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM | 
|---|
| 543 | * | 
|---|
| 544 | * Actually, we could lots of memory writes here. tp of listening | 
|---|
| 545 | * socket contains all necessary default parameters. | 
|---|
| 546 | */ | 
|---|
| 547 | struct sock *tcp_create_openreq_child(const struct sock *sk, | 
|---|
| 548 | struct request_sock *req, | 
|---|
| 549 | struct sk_buff *skb) | 
|---|
| 550 | { | 
|---|
| 551 | struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); | 
|---|
| 552 | const struct inet_request_sock *ireq = inet_rsk(sk: req); | 
|---|
| 553 | struct tcp_request_sock *treq = tcp_rsk(req); | 
|---|
| 554 | struct inet_connection_sock *newicsk; | 
|---|
| 555 | const struct tcp_sock *oldtp; | 
|---|
| 556 | struct tcp_sock *newtp; | 
|---|
| 557 | u32 seq; | 
|---|
| 558 |  | 
|---|
| 559 | if (!newsk) | 
|---|
| 560 | return NULL; | 
|---|
| 561 |  | 
|---|
| 562 | newicsk = inet_csk(newsk); | 
|---|
| 563 | newtp = tcp_sk(newsk); | 
|---|
| 564 | oldtp = tcp_sk(sk); | 
|---|
| 565 |  | 
|---|
| 566 | smc_check_reset_syn_req(oldtp, req, newtp); | 
|---|
| 567 |  | 
|---|
| 568 | /* Now setup tcp_sock */ | 
|---|
| 569 | newtp->pred_flags = 0; | 
|---|
| 570 |  | 
|---|
| 571 | seq = treq->rcv_isn + 1; | 
|---|
| 572 | newtp->rcv_wup = seq; | 
|---|
| 573 | WRITE_ONCE(newtp->copied_seq, seq); | 
|---|
| 574 | WRITE_ONCE(newtp->rcv_nxt, seq); | 
|---|
| 575 | newtp->segs_in = 1; | 
|---|
| 576 |  | 
|---|
| 577 | seq = treq->snt_isn + 1; | 
|---|
| 578 | newtp->snd_sml = newtp->snd_una = seq; | 
|---|
| 579 | WRITE_ONCE(newtp->snd_nxt, seq); | 
|---|
| 580 | newtp->snd_up = seq; | 
|---|
| 581 |  | 
|---|
| 582 | INIT_LIST_HEAD(list: &newtp->tsq_node); | 
|---|
| 583 | INIT_LIST_HEAD(list: &newtp->tsorted_sent_queue); | 
|---|
| 584 |  | 
|---|
| 585 | tcp_init_wl(tp: newtp, seq: treq->rcv_isn); | 
|---|
| 586 |  | 
|---|
| 587 | minmax_reset(m: &newtp->rtt_min, tcp_jiffies32, meas: ~0U); | 
|---|
| 588 | newicsk->icsk_ack.lrcvtime = tcp_jiffies32; | 
|---|
| 589 |  | 
|---|
| 590 | newtp->lsndtime = tcp_jiffies32; | 
|---|
| 591 | newsk->sk_txhash = READ_ONCE(treq->txhash); | 
|---|
| 592 | newtp->total_retrans = req->num_retrans; | 
|---|
| 593 |  | 
|---|
| 594 | tcp_init_xmit_timers(newsk); | 
|---|
| 595 | WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); | 
|---|
| 596 |  | 
|---|
| 597 | if (sock_flag(sk: newsk, flag: SOCK_KEEPOPEN)) | 
|---|
| 598 | tcp_reset_keepalive_timer(sk: newsk, timeout: keepalive_time_when(tp: newtp)); | 
|---|
| 599 |  | 
|---|
| 600 | newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; | 
|---|
| 601 | newtp->rx_opt.sack_ok = ireq->sack_ok; | 
|---|
| 602 | newtp->window_clamp = req->rsk_window_clamp; | 
|---|
| 603 | newtp->rcv_ssthresh = req->rsk_rcv_wnd; | 
|---|
| 604 | newtp->rcv_wnd = req->rsk_rcv_wnd; | 
|---|
| 605 | newtp->rx_opt.wscale_ok = ireq->wscale_ok; | 
|---|
| 606 | if (newtp->rx_opt.wscale_ok) { | 
|---|
| 607 | newtp->rx_opt.snd_wscale = ireq->snd_wscale; | 
|---|
| 608 | newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; | 
|---|
| 609 | } else { | 
|---|
| 610 | newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; | 
|---|
| 611 | newtp->window_clamp = min(newtp->window_clamp, 65535U); | 
|---|
| 612 | } | 
|---|
| 613 | newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; | 
|---|
| 614 | newtp->max_window = newtp->snd_wnd; | 
|---|
| 615 |  | 
|---|
| 616 | if (newtp->rx_opt.tstamp_ok) { | 
|---|
| 617 | newtp->tcp_usec_ts = treq->req_usec_ts; | 
|---|
| 618 | newtp->rx_opt.ts_recent = req->ts_recent; | 
|---|
| 619 | newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); | 
|---|
| 620 | newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; | 
|---|
| 621 | } else { | 
|---|
| 622 | newtp->tcp_usec_ts = 0; | 
|---|
| 623 | newtp->rx_opt.ts_recent_stamp = 0; | 
|---|
| 624 | newtp->tcp_header_len = sizeof(struct tcphdr); | 
|---|
| 625 | } | 
|---|
| 626 | if (req->num_timeout) { | 
|---|
| 627 | newtp->total_rto = req->num_timeout; | 
|---|
| 628 | newtp->undo_marker = treq->snt_isn; | 
|---|
| 629 | if (newtp->tcp_usec_ts) { | 
|---|
| 630 | newtp->retrans_stamp = treq->snt_synack; | 
|---|
| 631 | newtp->total_rto_time = (u32)(tcp_clock_us() - | 
|---|
| 632 | newtp->retrans_stamp) / USEC_PER_MSEC; | 
|---|
| 633 | } else { | 
|---|
| 634 | newtp->retrans_stamp = div_u64(dividend: treq->snt_synack, | 
|---|
| 635 | USEC_PER_SEC / TCP_TS_HZ); | 
|---|
| 636 | newtp->total_rto_time = tcp_clock_ms() - | 
|---|
| 637 | newtp->retrans_stamp; | 
|---|
| 638 | } | 
|---|
| 639 | newtp->total_rto_recoveries = 1; | 
|---|
| 640 | } | 
|---|
| 641 | newtp->tsoffset = treq->ts_off; | 
|---|
| 642 | #ifdef CONFIG_TCP_MD5SIG | 
|---|
| 643 | newtp->md5sig_info = NULL;	/*XXX*/ | 
|---|
| 644 | #endif | 
|---|
| 645 | #ifdef CONFIG_TCP_AO | 
|---|
| 646 | newtp->ao_info = NULL; | 
|---|
| 647 |  | 
|---|
| 648 | if (tcp_rsk_used_ao(req)) { | 
|---|
| 649 | struct tcp_ao_key *ao_key; | 
|---|
| 650 |  | 
|---|
| 651 | ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1); | 
|---|
| 652 | if (ao_key) | 
|---|
| 653 | newtp->tcp_header_len += tcp_ao_len_aligned(ao_key); | 
|---|
| 654 | } | 
|---|
| 655 | #endif | 
|---|
| 656 | if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) | 
|---|
| 657 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; | 
|---|
| 658 | newtp->rx_opt.mss_clamp = req->mss; | 
|---|
| 659 | tcp_ecn_openreq_child(sk: newsk, req, skb); | 
|---|
| 660 | newtp->fastopen_req = NULL; | 
|---|
| 661 | RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); | 
|---|
| 662 |  | 
|---|
| 663 | newtp->bpf_chg_cc_inprogress = 0; | 
|---|
| 664 | tcp_bpf_clone(sk, newsk); | 
|---|
| 665 |  | 
|---|
| 666 | __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); | 
|---|
| 667 |  | 
|---|
| 668 | xa_init_flags(xa: &newsk->sk_user_frags, XA_FLAGS_ALLOC1); | 
|---|
| 669 |  | 
|---|
| 670 | return newsk; | 
|---|
| 671 | } | 
|---|
| 672 | EXPORT_SYMBOL(tcp_create_openreq_child); | 
|---|
| 673 |  | 
|---|
| 674 | /* | 
|---|
| 675 | * Process an incoming packet for SYN_RECV sockets represented as a | 
|---|
| 676 | * request_sock. Normally sk is the listener socket but for TFO it | 
|---|
| 677 | * points to the child socket. | 
|---|
| 678 | * | 
|---|
| 679 | * XXX (TFO) - The current impl contains a special check for ack | 
|---|
| 680 | * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? | 
|---|
| 681 | * | 
|---|
| 682 | * We don't need to initialize tmp_opt.sack_ok as we don't use the results | 
|---|
| 683 | * | 
|---|
| 684 | * Note: If @fastopen is true, this can be called from process context. | 
|---|
| 685 | *       Otherwise, this is from BH context. | 
|---|
| 686 | */ | 
|---|
| 687 |  | 
|---|
| 688 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | 
|---|
| 689 | struct request_sock *req, | 
|---|
| 690 | bool fastopen, bool *req_stolen, | 
|---|
| 691 | enum skb_drop_reason *drop_reason) | 
|---|
| 692 | { | 
|---|
| 693 | struct tcp_options_received tmp_opt; | 
|---|
| 694 | struct sock *child; | 
|---|
| 695 | const struct tcphdr *th = tcp_hdr(skb); | 
|---|
| 696 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 
|---|
| 697 | bool tsecr_reject = false; | 
|---|
| 698 | bool paws_reject = false; | 
|---|
| 699 | bool own_req; | 
|---|
| 700 |  | 
|---|
| 701 | tmp_opt.saw_tstamp = 0; | 
|---|
| 702 | tmp_opt.accecn = 0; | 
|---|
| 703 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 
|---|
| 704 | tcp_parse_options(net: sock_net(sk), skb, opt_rx: &tmp_opt, estab: 0, NULL); | 
|---|
| 705 |  | 
|---|
| 706 | if (tmp_opt.saw_tstamp) { | 
|---|
| 707 | tmp_opt.ts_recent = req->ts_recent; | 
|---|
| 708 | if (tmp_opt.rcv_tsecr) { | 
|---|
| 709 | if (inet_rsk(sk: req)->tstamp_ok && !fastopen) | 
|---|
| 710 | tsecr_reject = !between(seq1: tmp_opt.rcv_tsecr, | 
|---|
| 711 | seq2: tcp_rsk(req)->snt_tsval_first, | 
|---|
| 712 | READ_ONCE(tcp_rsk(req)->snt_tsval_last)); | 
|---|
| 713 | tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; | 
|---|
| 714 | } | 
|---|
| 715 | /* We do not store true stamp, but it is not required, | 
|---|
| 716 | * it can be estimated (approximately) | 
|---|
| 717 | * from another data. | 
|---|
| 718 | */ | 
|---|
| 719 | tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ; | 
|---|
| 720 | paws_reject = tcp_paws_reject(rx_opt: &tmp_opt, rst: th->rst); | 
|---|
| 721 | } | 
|---|
| 722 | } | 
|---|
| 723 |  | 
|---|
| 724 | /* Check for pure retransmitted SYN. */ | 
|---|
| 725 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && | 
|---|
| 726 | flg == TCP_FLAG_SYN && | 
|---|
| 727 | !paws_reject) { | 
|---|
| 728 | /* | 
|---|
| 729 | * RFC793 draws (Incorrectly! It was fixed in RFC1122) | 
|---|
| 730 | * this case on figure 6 and figure 8, but formal | 
|---|
| 731 | * protocol description says NOTHING. | 
|---|
| 732 | * To be more exact, it says that we should send ACK, | 
|---|
| 733 | * because this segment (at least, if it has no data) | 
|---|
| 734 | * is out of window. | 
|---|
| 735 | * | 
|---|
| 736 | *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT | 
|---|
| 737 | *  describe SYN-RECV state. All the description | 
|---|
| 738 | *  is wrong, we cannot believe to it and should | 
|---|
| 739 | *  rely only on common sense and implementation | 
|---|
| 740 | *  experience. | 
|---|
| 741 | * | 
|---|
| 742 | * Enforce "SYN-ACK" according to figure 8, figure 6 | 
|---|
| 743 | * of RFC793, fixed by RFC1122. | 
|---|
| 744 | * | 
|---|
| 745 | * Note that even if there is new data in the SYN packet | 
|---|
| 746 | * they will be thrown away too. | 
|---|
| 747 | * | 
|---|
| 748 | * Reset timer after retransmitting SYNACK, similar to | 
|---|
| 749 | * the idea of fast retransmit in recovery. | 
|---|
| 750 | */ | 
|---|
| 751 | if (!tcp_oow_rate_limited(net: sock_net(sk), skb, | 
|---|
| 752 | mib_idx: LINUX_MIB_TCPACKSKIPPEDSYNRECV, | 
|---|
| 753 | last_oow_ack_time: &tcp_rsk(req)->last_oow_ack_time) && | 
|---|
| 754 |  | 
|---|
| 755 | !tcp_rtx_synack(sk, req)) { | 
|---|
| 756 | unsigned long expires = jiffies; | 
|---|
| 757 |  | 
|---|
| 758 | expires += reqsk_timeout(req, TCP_RTO_MAX); | 
|---|
| 759 | if (!fastopen) | 
|---|
| 760 | mod_timer_pending(timer: &req->rsk_timer, expires); | 
|---|
| 761 | else | 
|---|
| 762 | req->rsk_timer.expires = expires; | 
|---|
| 763 | } | 
|---|
| 764 | return NULL; | 
|---|
| 765 | } | 
|---|
| 766 |  | 
|---|
| 767 | /* Further reproduces section "SEGMENT ARRIVES" | 
|---|
| 768 | for state SYN-RECEIVED of RFC793. | 
|---|
| 769 | It is broken, however, it does not work only | 
|---|
| 770 | when SYNs are crossed. | 
|---|
| 771 |  | 
|---|
| 772 | You would think that SYN crossing is impossible here, since | 
|---|
| 773 | we should have a SYN_SENT socket (from connect()) on our end, | 
|---|
| 774 | but this is not true if the crossed SYNs were sent to both | 
|---|
| 775 | ends by a malicious third party.  We must defend against this, | 
|---|
| 776 | and to do that we first verify the ACK (as per RFC793, page | 
|---|
| 777 | 36) and reset if it is invalid.  Is this a true full defense? | 
|---|
| 778 | To convince ourselves, let us consider a way in which the ACK | 
|---|
| 779 | test can still pass in this 'malicious crossed SYNs' case. | 
|---|
| 780 | Malicious sender sends identical SYNs (and thus identical sequence | 
|---|
| 781 | numbers) to both A and B: | 
|---|
| 782 |  | 
|---|
| 783 | A: gets SYN, seq=7 | 
|---|
| 784 | B: gets SYN, seq=7 | 
|---|
| 785 |  | 
|---|
| 786 | By our good fortune, both A and B select the same initial | 
|---|
| 787 | send sequence number of seven :-) | 
|---|
| 788 |  | 
|---|
| 789 | A: sends SYN|ACK, seq=7, ack_seq=8 | 
|---|
| 790 | B: sends SYN|ACK, seq=7, ack_seq=8 | 
|---|
| 791 |  | 
|---|
| 792 | So we are now A eating this SYN|ACK, ACK test passes.  So | 
|---|
| 793 | does sequence test, SYN is truncated, and thus we consider | 
|---|
| 794 | it a bare ACK. | 
|---|
| 795 |  | 
|---|
| 796 | If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this | 
|---|
| 797 | bare ACK.  Otherwise, we create an established connection.  Both | 
|---|
| 798 | ends (listening sockets) accept the new incoming connection and try | 
|---|
| 799 | to talk to each other. 8-) | 
|---|
| 800 |  | 
|---|
| 801 | Note: This case is both harmless, and rare.  Possibility is about the | 
|---|
| 802 | same as us discovering intelligent life on another plant tomorrow. | 
|---|
| 803 |  | 
|---|
| 804 | But generally, we should (RFC lies!) to accept ACK | 
|---|
| 805 | from SYNACK both here and in tcp_rcv_state_process(). | 
|---|
| 806 | tcp_rcv_state_process() does not, hence, we do not too. | 
|---|
| 807 |  | 
|---|
| 808 | Note that the case is absolutely generic: | 
|---|
| 809 | we cannot optimize anything here without | 
|---|
| 810 | violating protocol. All the checks must be made | 
|---|
| 811 | before attempt to create socket. | 
|---|
| 812 | */ | 
|---|
| 813 |  | 
|---|
| 814 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... | 
|---|
| 815 | *                  and the incoming segment acknowledges something not yet | 
|---|
| 816 | *                  sent (the segment carries an unacceptable ACK) ... | 
|---|
| 817 | *                  a reset is sent." | 
|---|
| 818 | * | 
|---|
| 819 | * Invalid ACK: reset will be sent by listening socket. | 
|---|
| 820 | * Note that the ACK validity check for a Fast Open socket is done | 
|---|
| 821 | * elsewhere and is checked directly against the child socket rather | 
|---|
| 822 | * than req because user data may have been sent out. | 
|---|
| 823 | */ | 
|---|
| 824 | if ((flg & TCP_FLAG_ACK) && !fastopen && | 
|---|
| 825 | (TCP_SKB_CB(skb)->ack_seq != | 
|---|
| 826 | tcp_rsk(req)->snt_isn + 1)) | 
|---|
| 827 | return sk; | 
|---|
| 828 |  | 
|---|
| 829 | /* RFC793: "first check sequence number". */ | 
|---|
| 830 |  | 
|---|
| 831 | if (paws_reject || tsecr_reject || | 
|---|
| 832 | !tcp_in_window(TCP_SKB_CB(skb)->seq, | 
|---|
| 833 | TCP_SKB_CB(skb)->end_seq, | 
|---|
| 834 | s_win: tcp_rsk(req)->rcv_nxt, | 
|---|
| 835 | e_win: tcp_rsk(req)->rcv_nxt + | 
|---|
| 836 | tcp_synack_window(req))) { | 
|---|
| 837 | /* Out of window: send ACK and drop. */ | 
|---|
| 838 | if (!(flg & TCP_FLAG_RST) && | 
|---|
| 839 | !tcp_oow_rate_limited(net: sock_net(sk), skb, | 
|---|
| 840 | mib_idx: LINUX_MIB_TCPACKSKIPPEDSYNRECV, | 
|---|
| 841 | last_oow_ack_time: &tcp_rsk(req)->last_oow_ack_time)) | 
|---|
| 842 | req->rsk_ops->send_ack(sk, skb, req); | 
|---|
| 843 | if (paws_reject) { | 
|---|
| 844 | SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); | 
|---|
| 845 | NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | 
|---|
| 846 | } else if (tsecr_reject) { | 
|---|
| 847 | SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); | 
|---|
| 848 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); | 
|---|
| 849 | } else { | 
|---|
| 850 | SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); | 
|---|
| 851 | } | 
|---|
| 852 | return NULL; | 
|---|
| 853 | } | 
|---|
| 854 |  | 
|---|
| 855 | /* In sequence, PAWS is OK. */ | 
|---|
| 856 |  | 
|---|
| 857 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { | 
|---|
| 858 | /* Truncate SYN, it is out of window starting | 
|---|
| 859 | at tcp_rsk(req)->rcv_isn + 1. */ | 
|---|
| 860 | flg &= ~TCP_FLAG_SYN; | 
|---|
| 861 | } | 
|---|
| 862 |  | 
|---|
| 863 | /* RFC793: "second check the RST bit" and | 
|---|
| 864 | *	   "fourth, check the SYN bit" | 
|---|
| 865 | */ | 
|---|
| 866 | if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { | 
|---|
| 867 | TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); | 
|---|
| 868 | goto embryonic_reset; | 
|---|
| 869 | } | 
|---|
| 870 |  | 
|---|
| 871 | /* ACK sequence verified above, just make sure ACK is | 
|---|
| 872 | * set.  If ACK not set, just silently drop the packet. | 
|---|
| 873 | * | 
|---|
| 874 | * XXX (TFO) - if we ever allow "data after SYN", the | 
|---|
| 875 | * following check needs to be removed. | 
|---|
| 876 | */ | 
|---|
| 877 | if (!(flg & TCP_FLAG_ACK)) | 
|---|
| 878 | return NULL; | 
|---|
| 879 |  | 
|---|
| 880 | if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && | 
|---|
| 881 | tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { | 
|---|
| 882 | u8 saw_opt = tcp_accecn_option_init(skb, opt_offset: tmp_opt.accecn); | 
|---|
| 883 |  | 
|---|
| 884 | tcp_rsk(req)->saw_accecn_opt = saw_opt; | 
|---|
| 885 | if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { | 
|---|
| 886 | u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; | 
|---|
| 887 |  | 
|---|
| 888 | tcp_rsk(req)->accecn_fail_mode |= fail_mode; | 
|---|
| 889 | } | 
|---|
| 890 | } | 
|---|
| 891 |  | 
|---|
| 892 | /* For Fast Open no more processing is needed (sk is the | 
|---|
| 893 | * child socket). | 
|---|
| 894 | */ | 
|---|
| 895 | if (fastopen) | 
|---|
| 896 | return sk; | 
|---|
| 897 |  | 
|---|
| 898 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ | 
|---|
| 899 | if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) && | 
|---|
| 900 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 
|---|
| 901 | inet_rsk(sk: req)->acked = 1; | 
|---|
| 902 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); | 
|---|
| 903 | return NULL; | 
|---|
| 904 | } | 
|---|
| 905 |  | 
|---|
| 906 | /* OK, ACK is valid, create big socket and | 
|---|
| 907 | * feed this segment to it. It will repeat all | 
|---|
| 908 | * the tests. THIS SEGMENT MUST MOVE SOCKET TO | 
|---|
| 909 | * ESTABLISHED STATE. If it will be dropped after | 
|---|
| 910 | * socket is created, wait for troubles. | 
|---|
| 911 | */ | 
|---|
| 912 | child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, | 
|---|
| 913 | req, &own_req); | 
|---|
| 914 | if (!child) | 
|---|
| 915 | goto listen_overflow; | 
|---|
| 916 |  | 
|---|
| 917 | if (own_req && tmp_opt.saw_tstamp && | 
|---|
| 918 | !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) | 
|---|
| 919 | tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; | 
|---|
| 920 |  | 
|---|
| 921 | if (own_req && rsk_drop_req(req)) { | 
|---|
| 922 | reqsk_queue_removed(queue: &inet_csk(req->rsk_listener)->icsk_accept_queue, req); | 
|---|
| 923 | inet_csk_reqsk_queue_drop_and_put(sk: req->rsk_listener, req); | 
|---|
| 924 | return child; | 
|---|
| 925 | } | 
|---|
| 926 |  | 
|---|
| 927 | sock_rps_save_rxhash(sk: child, skb); | 
|---|
| 928 | tcp_synack_rtt_meas(sk: child, req); | 
|---|
| 929 | *req_stolen = !own_req; | 
|---|
| 930 | return inet_csk_complete_hashdance(sk, child, req, own_req); | 
|---|
| 931 |  | 
|---|
| 932 | listen_overflow: | 
|---|
| 933 | SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); | 
|---|
| 934 | if (sk != req->rsk_listener) | 
|---|
| 935 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); | 
|---|
| 936 |  | 
|---|
| 937 | if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { | 
|---|
| 938 | inet_rsk(sk: req)->acked = 1; | 
|---|
| 939 | return NULL; | 
|---|
| 940 | } | 
|---|
| 941 |  | 
|---|
| 942 | embryonic_reset: | 
|---|
| 943 | if (!(flg & TCP_FLAG_RST)) { | 
|---|
| 944 | /* Received a bad SYN pkt - for TFO We try not to reset | 
|---|
| 945 | * the local connection unless it's really necessary to | 
|---|
| 946 | * avoid becoming vulnerable to outside attack aiming at | 
|---|
| 947 | * resetting legit local connections. | 
|---|
| 948 | */ | 
|---|
| 949 | req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN); | 
|---|
| 950 | } else if (fastopen) { /* received a valid RST pkt */ | 
|---|
| 951 | reqsk_fastopen_remove(sk, req, reset: true); | 
|---|
| 952 | tcp_reset(sk, skb); | 
|---|
| 953 | } | 
|---|
| 954 | if (!fastopen) { | 
|---|
| 955 | bool unlinked = inet_csk_reqsk_queue_drop(sk, req); | 
|---|
| 956 |  | 
|---|
| 957 | if (unlinked) | 
|---|
| 958 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | 
|---|
| 959 | *req_stolen = !unlinked; | 
|---|
| 960 | } | 
|---|
| 961 | return NULL; | 
|---|
| 962 | } | 
|---|
| 963 | EXPORT_IPV6_MOD(tcp_check_req); | 
|---|
| 964 |  | 
|---|
| 965 | /* | 
|---|
| 966 | * Queue segment on the new socket if the new socket is active, | 
|---|
| 967 | * otherwise we just shortcircuit this and continue with | 
|---|
| 968 | * the new socket. | 
|---|
| 969 | * | 
|---|
| 970 | * For the vast majority of cases child->sk_state will be TCP_SYN_RECV | 
|---|
| 971 | * when entering. But other states are possible due to a race condition | 
|---|
| 972 | * where after __inet_lookup_established() fails but before the listener | 
|---|
| 973 | * locked is obtained, other packets cause the same connection to | 
|---|
| 974 | * be created. | 
|---|
| 975 | */ | 
|---|
| 976 |  | 
|---|
| 977 | enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, | 
|---|
| 978 | struct sk_buff *skb) | 
|---|
| 979 | __releases(&((child)->sk_lock.slock)) | 
|---|
| 980 | { | 
|---|
| 981 | enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; | 
|---|
| 982 | int state = child->sk_state; | 
|---|
| 983 |  | 
|---|
| 984 | /* record sk_napi_id and sk_rx_queue_mapping of child. */ | 
|---|
| 985 | sk_mark_napi_id_set(sk: child, skb); | 
|---|
| 986 |  | 
|---|
| 987 | tcp_segs_in(tcp_sk(child), skb); | 
|---|
| 988 | if (!sock_owned_by_user(sk: child)) { | 
|---|
| 989 | reason = tcp_rcv_state_process(sk: child, skb); | 
|---|
| 990 | /* Wakeup parent, send SIGIO */ | 
|---|
| 991 | if (state == TCP_SYN_RECV && child->sk_state != state) | 
|---|
| 992 | parent->sk_data_ready(parent); | 
|---|
| 993 | } else { | 
|---|
| 994 | /* Alas, it is possible again, because we do lookup | 
|---|
| 995 | * in main socket hash table and lock on listening | 
|---|
| 996 | * socket does not protect us more. | 
|---|
| 997 | */ | 
|---|
| 998 | __sk_add_backlog(sk: child, skb); | 
|---|
| 999 | } | 
|---|
| 1000 |  | 
|---|
| 1001 | bh_unlock_sock(child); | 
|---|
| 1002 | sock_put(sk: child); | 
|---|
| 1003 | return reason; | 
|---|
| 1004 | } | 
|---|
| 1005 | EXPORT_IPV6_MOD(tcp_child_process); | 
|---|
| 1006 |  | 
|---|