tcp_fastopen.c source code [Linux/net/ipv4/tcp_fastopen.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/kernel.h>
3	#include <linux/tcp.h>
4	#include <linux/rcupdate.h>
5	#include <net/tcp.h>
6	#include <net/busy_poll.h>
7
8	void tcp_fastopen_init_key_once(struct net *net)
9	{
10	u8 key[TCP_FASTOPEN_KEY_LENGTH];
11	struct tcp_fastopen_context *ctxt;
12
13	rcu_read_lock();
14	ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
15	if (ctxt) {
16	rcu_read_unlock();
17	return;
18	}
19	rcu_read_unlock();
20
21	/ tcp_fastopen_reset_cipher publishes the new context*
22	* atomically, so we allow this race happening here.
23	*
24	* All call sites of tcp_fastopen_cookie_gen also check
25	* for a valid cookie, so this is an acceptable risk.
26	*/
27	get_random_bytes(buf: key, len: sizeof(key));
28	tcp_fastopen_reset_cipher(net, NULL, primary_key: key, NULL);
29	}
30
31	static void tcp_fastopen_ctx_free(struct rcu_head *head)
32	{
33	struct tcp_fastopen_context *ctx =
34	container_of(head, struct tcp_fastopen_context, rcu);
35
36	kfree_sensitive(objp: ctx);
37	}
38
39	void tcp_fastopen_destroy_cipher(struct sock *sk)
40	{
41	struct tcp_fastopen_context *ctx;
42
43	ctx = rcu_dereference_protected(
44	inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, `1`);
45	if (ctx)
46	call_rcu(head: &ctx->rcu, func: tcp_fastopen_ctx_free);
47	}
48
49	void tcp_fastopen_ctx_destroy(struct net *net)
50	{
51	struct tcp_fastopen_context *ctxt;
52
53	ctxt = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL));
54
55	if (ctxt)
56	call_rcu(head: &ctxt->rcu, func: tcp_fastopen_ctx_free);
57	}
58
59	int tcp_fastopen_reset_cipher(struct net net, struct* sock *sk,
60	void primary_key, void* *backup_key)
61	{
62	struct tcp_fastopen_context ctx, octx;
63	struct fastopen_queue *q;
64	int err = `0`;
65
66	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
67	if (!ctx) {
68	err = -ENOMEM;
69	goto out;
70	}
71
72	ctx->key[`0`].key[`0`] = get_unaligned_le64(p: primary_key);
73	ctx->key[`0`].key[`1`] = get_unaligned_le64(p: primary_key + `8`);
74	if (backup_key) {
75	ctx->key[`1`].key[`0`] = get_unaligned_le64(p: backup_key);
76	ctx->key[`1`].key[`1`] = get_unaligned_le64(p: backup_key + `8`);
77	ctx->num = `2`;
78	} else {
79	ctx->num = `1`;
80	}
81
82	if (sk) {
83	q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
84	octx = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(ctx)));
85	} else {
86	octx = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx,
87	RCU_INITIALIZER(ctx)));
88	}
89
90	if (octx)
91	call_rcu(head: &octx->rcu, func: tcp_fastopen_ctx_free);
92	out:
93	return err;
94	}
95
96	int tcp_fastopen_get_cipher(struct net net, struct* inet_connection_sock *icsk,
97	u64 *key)
98	{
99	struct tcp_fastopen_context *ctx;
100	int n_keys = `0`, i;
101
102	rcu_read_lock();
103	if (icsk)
104	ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
105	else
106	ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
107	if (ctx) {
108	n_keys = tcp_fastopen_context_len(ctx);
109	for (i = `0`; i < n_keys; i++) {
110	put_unaligned_le64(val: ctx->key[i].key[`0`], p: key + (i * `2`));
111	put_unaligned_le64(val: ctx->key[i].key[`1`], p: key + (i * `2`) + `1`);
112	}
113	}
114	rcu_read_unlock();
115
116	return n_keys;
117	}
118
119	static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
120	struct sk_buff *syn,
121	const siphash_key_t *key,
122	struct tcp_fastopen_cookie *foc)
123	{
124	BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
125
126	if (req->rsk_ops->family == AF_INET) {
127	const struct iphdr *iph = ip_hdr(skb: syn);
128
129	foc->val[`0`] = cpu_to_le64(siphash(&iph->saddr,
130	sizeof(iph->saddr) +
131	sizeof(iph->daddr),
132	key));
133	foc->len = TCP_FASTOPEN_COOKIE_SIZE;
134	return true;
135	}
136	#if IS_ENABLED(CONFIG_IPV6)
137	if (req->rsk_ops->family == AF_INET6) {
138	const struct ipv6hdr *ip6h = ipv6_hdr(skb: syn);
139
140	foc->val[`0`] = cpu_to_le64(siphash(&ip6h->saddr,
141	sizeof(ip6h->saddr) +
142	sizeof(ip6h->daddr),
143	key));
144	foc->len = TCP_FASTOPEN_COOKIE_SIZE;
145	return true;
146	}
147	#endif
148	return false;
149	}
150
151	/ Generate the fastopen cookie by applying SipHash to both the source and*
152	* destination addresses.
153	*/
154	static void tcp_fastopen_cookie_gen(struct sock *sk,
155	struct request_sock *req,
156	struct sk_buff *syn,
157	struct tcp_fastopen_cookie *foc)
158	{
159	struct tcp_fastopen_context *ctx;
160
161	rcu_read_lock();
162	ctx = tcp_fastopen_get_ctx(sk);
163	if (ctx)
164	__tcp_fastopen_cookie_gen_cipher(req, syn, key: &ctx->key[`0`], foc);
165	rcu_read_unlock();
166	}
167
168	/ If an incoming SYN or SYNACK frame contains a payload and/or FIN,*
169	* queue this additional data / FIN.
170	*/
171	void tcp_fastopen_add_skb(struct sock sk, struct* sk_buff *skb)
172	{
173	struct tcp_sock *tp = tcp_sk(sk);
174
175	if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
176	return;
177
178	skb = skb_clone(skb, GFP_ATOMIC);
179	if (!skb)
180	return;
181
182	tcp_cleanup_skb(skb);
183	/ segs_in has been initialized to 1 in tcp_create_openreq_child().*
184	* Hence, reset segs_in to 0 before calling tcp_segs_in()
185	* to avoid double counting. Also, tcp_segs_in() expects
186	* skb->len to include the tcp_hdrlen. Hence, it should
187	* be called before __skb_pull().
188	*/
189	tp->segs_in = `0`;
190	tcp_segs_in(tp, skb);
191	__skb_pull(skb, len: tcp_hdrlen(skb));
192	sk_forced_mem_schedule(sk, size: skb->truesize);
193	skb_set_owner_r(skb, sk);
194
195	TCP_SKB_CB(skb)->seq++;
196	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
197
198	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
199	tcp_add_receive_queue(sk, skb);
200	tp->syn_data_acked = `1`;
201
202	/ u64_stats_update_begin(&tp->syncp) not needed here,*
203	* as we certainly are not changing upper 32bit value (0)
204	*/
205	tp->bytes_received = skb->len;
206
207	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
208	tcp_fin(sk);
209	}
210
211	/ returns 0 - no key match, 1 for primary, 2 for backup /
212	static int tcp_fastopen_cookie_gen_check(struct sock *sk,
213	struct request_sock *req,
214	struct sk_buff *syn,
215	struct tcp_fastopen_cookie *orig,
216	struct tcp_fastopen_cookie *valid_foc)
217	{
218	struct tcp_fastopen_cookie search_foc = { .len = -`1` };
219	struct tcp_fastopen_cookie *foc = valid_foc;
220	struct tcp_fastopen_context *ctx;
221	int i, ret = `0`;
222
223	rcu_read_lock();
224	ctx = tcp_fastopen_get_ctx(sk);
225	if (!ctx)
226	goto out;
227	for (i = `0`; i < tcp_fastopen_context_len(ctx); i++) {
228	__tcp_fastopen_cookie_gen_cipher(req, syn, key: &ctx->key[i], foc);
229	if (tcp_fastopen_cookie_match(foc, orig)) {
230	ret = i + `1`;
231	goto out;
232	}
233	foc = &search_foc;
234	}
235	out:
236	rcu_read_unlock();
237	return ret;
238	}
239
240	static struct sock tcp_fastopen_create_child(struct* sock *sk,
241	struct sk_buff *skb,
242	struct request_sock *req)
243	{
244	struct tcp_sock *tp;
245	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
246	struct sock *child;
247	bool own_req;
248
249	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
250	NULL, &own_req);
251	if (!child)
252	return NULL;
253
254	spin_lock(lock: &queue->fastopenq.lock);
255	queue->fastopenq.qlen++;
256	spin_unlock(lock: &queue->fastopenq.lock);
257
258	/ Initialize the child socket. Have to fix some values to take*
259	* into account the child is a Fast Open socket and is created
260	* only out of the bits carried in the SYN packet.
261	*/
262	tp = tcp_sk(child);
263
264	rcu_assign_pointer(tp->fastopen_rsk, req);
265	tcp_rsk(req)->tfo_listener = true;
266
267	/ RFC1323: The window in SYN & SYN/ACK segments is never*
268	* scaled. So correct it appropriately.
269	*/
270	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
271	tp->max_window = tp->snd_wnd;
272
273	/ Activate the retrans timer so that SYNACK can be retransmitted.*
274	* The request socket is not added to the ehash
275	* because it's been added to the accept queue directly.
276	*/
277	req->timeout = tcp_timeout_init(sk: child);
278	tcp_reset_xmit_timer(sk: child, ICSK_TIME_RETRANS,
279	when: req->timeout, pace_delay: false);
280
281	refcount_set(r: &req->rsk_refcnt, n: `2`);
282
283	sk_mark_napi_id_set(sk: child, skb);
284
285	/ Now finish processing the fastopen child socket. /
286	tcp_init_transfer(sk: child, bpf_op: BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
287
288	tp->rcv_nxt = TCP_SKB_CB(skb)->seq + `1`;
289
290	tcp_fastopen_add_skb(sk: child, skb);
291
292	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
293	tp->rcv_wup = tp->rcv_nxt;
294	/ tcp_conn_request() is sending the SYNACK,*
295	* and queues the child into listener accept queue.
296	*/
297	return child;
298	}
299
300	static bool tcp_fastopen_queue_check(struct sock *sk)
301	{
302	struct fastopen_queue *fastopenq;
303	int max_qlen;
304
305	/ Make sure the listener has enabled fastopen, and we don't*
306	* exceed the max # of pending TFO requests allowed before trying
307	* to validating the cookie in order to avoid burning CPU cycles
308	* unnecessarily.
309	*
310	* XXX (TFO) - The implication of checking the max_qlen before
311	* processing a cookie request is that clients can't differentiate
312	* between qlen overflow causing Fast Open to be disabled
313	* temporarily vs a server not supporting Fast Open at all.
314	*/
315	fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
316	max_qlen = READ_ONCE(fastopenq->max_qlen);
317	if (max_qlen == `0`)
318	return false;
319
320	if (fastopenq->qlen >= max_qlen) {
321	struct request_sock *req1;
322	spin_lock(lock: &fastopenq->lock);
323	req1 = fastopenq->rskq_rst_head;
324	if (!req1 \|\| time_after(req1->rsk_timer.expires, jiffies)) {
325	__NET_INC_STATS(sock_net(sk),
326	LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
327	spin_unlock(lock: &fastopenq->lock);
328	return false;
329	}
330	fastopenq->rskq_rst_head = req1->dl_next;
331	fastopenq->qlen--;
332	spin_unlock(lock: &fastopenq->lock);
333	reqsk_put(req: req1);
334	}
335	return true;
336	}
337
338	static bool tcp_fastopen_no_cookie(const struct sock *sk,
339	const struct dst_entry *dst,
340	int flag)
341	{
342	return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) \|\|
343	tcp_sk(sk)->fastopen_no_cookie \|\|
344	(dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE));
345	}
346
347	/ Returns true if we should perform Fast Open on the SYN. The cookie (foc)*
348	* may be updated and return the client in the SYN-ACK later. E.g., Fast Open
349	* cookie request (foc->len == 0).
350	*/
351	struct sock tcp_try_fastopen(struct* sock sk, struct* sk_buff *skb,
352	struct request_sock *req,
353	struct tcp_fastopen_cookie *foc,
354	const struct dst_entry *dst)
355	{
356	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + `1`;
357	int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
358	struct tcp_fastopen_cookie valid_foc = { .len = -`1` };
359	struct sock *child;
360	int ret = `0`;
361
362	if (foc->len == `0`) / Client requests a cookie /
363	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
364
365	if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
366	(syn_data \|\| foc->len >= `0`) &&
367	tcp_fastopen_queue_check(sk))) {
368	foc->len = -`1`;
369	return NULL;
370	}
371
372	if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
373	goto fastopen;
374
375	if (foc->len == `0`) {
376	/ Client requests a cookie. /
377	tcp_fastopen_cookie_gen(sk, req, syn: skb, foc: &valid_foc);
378	} else if (foc->len > `0`) {
379	ret = tcp_fastopen_cookie_gen_check(sk, req, syn: skb, orig: foc,
380	valid_foc: &valid_foc);
381	if (!ret) {
382	NET_INC_STATS(sock_net(sk),
383	LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
384	} else {
385	/ Cookie is valid. Create a (full) child socket to*
386	* accept the data in SYN before returning a SYN-ACK to
387	* ack the data. If we fail to create the socket, fall
388	* back and ack the ISN only but includes the same
389	* cookie.
390	*
391	* Note: Data-less SYN with valid cookie is allowed to
392	* send data in SYN_RECV state.
393	*/
394	fastopen:
395	child = tcp_fastopen_create_child(sk, skb, req);
396	if (child) {
397	if (ret == `2`) {
398	valid_foc.exp = foc->exp;
399	*foc = valid_foc;
400	NET_INC_STATS(sock_net(sk),
401	LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
402	} else {
403	foc->len = -`1`;
404	}
405	NET_INC_STATS(sock_net(sk),
406	LINUX_MIB_TCPFASTOPENPASSIVE);
407	tcp_sk(child)->syn_fastopen_child = `1`;
408	return child;
409	}
410	NET_INC_STATS(sock_net(sk),
411	LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
412	}
413	}
414	valid_foc.exp = foc->exp;
415	*foc = valid_foc;
416	return NULL;
417	}
418
419	bool tcp_fastopen_cookie_check(struct sock sk, u16 mss,
420	struct tcp_fastopen_cookie *cookie)
421	{
422	const struct dst_entry *dst;
423
424	tcp_fastopen_cache_get(sk, mss, cookie);
425
426	/ Firewall blackhole issue check /
427	if (tcp_fastopen_active_should_disable(sk)) {
428	cookie->len = -`1`;
429	return false;
430	}
431
432	dst = __sk_dst_get(sk);
433
434	if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) {
435	cookie->len = -`1`;
436	return true;
437	}
438	if (cookie->len > `0`)
439	return true;
440	tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;
441	return false;
442	}
443
444	/ This function checks if we want to defer sending SYN until the first*
445	* write(). We defer under the following conditions:
446	* 1. fastopen_connect sockopt is set
447	* 2. we have a valid cookie
448	* Return value: return true if we want to defer until application writes data
449	* return false if we want to send out SYN immediately
450	*/
451	bool tcp_fastopen_defer_connect(struct sock sk, int* *err)
452	{
453	struct tcp_fastopen_cookie cookie = { .len = `0` };
454	struct tcp_sock *tp = tcp_sk(sk);
455	u16 mss;
456
457	if (tp->fastopen_connect && !tp->fastopen_req) {
458	if (tcp_fastopen_cookie_check(sk, mss: &mss, cookie: &cookie)) {
459	inet_set_bit(DEFER_CONNECT, sk);
460	return true;
461	}
462
463	/ Alloc fastopen_req in order for FO option to be included*
464	* in SYN
465	*/
466	tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
467	sk->sk_allocation);
468	if (tp->fastopen_req)
469	tp->fastopen_req->cookie = cookie;
470	else
471	*err = -ENOBUFS;
472	}
473	return false;
474	}
475	EXPORT_IPV6_MOD(tcp_fastopen_defer_connect);
476
/*
 * The following code block deals with middlebox issues with TFO:
 * Middlebox firewall issues can potentially cause the server's data to be
 * blackholed after a successful 3WHS using TFO.
 * The proposed solution is to disable active TFO globally under the
 * following circumstances:
 *   1. client side TFO socket receives out of order FIN
 *   2. client side TFO socket receives out of order RST
 *   3. client side TFO socket has timed out three times consecutively during
 *      or after handshake
 * We disable active side TFO globally for 1hr at first. Then if it
 * happens again, we disable it for 2h, then 4h, 8h, ...
 * And we reset the timeout back to 1hr when we see a successful active
 * TFO connection with data exchanges.
 */
492
493	/ Disable active TFO and record current jiffies and*
494	* tfo_active_disable_times
495	*/
496	void tcp_fastopen_active_disable(struct sock *sk)
497	{
498	struct net *net = sock_net(sk);
499
500	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout))
501	return;
502
503	/ Paired with READ_ONCE() in tcp_fastopen_active_should_disable() /
504	WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies);
505
506	/ Paired with smp_rmb() in tcp_fastopen_active_should_disable().*
507	* We want net->ipv4.tfo_active_disable_stamp to be updated first.
508	*/
509	smp_mb__before_atomic();
510	atomic_inc(v: &net->ipv4.tfo_active_disable_times);
511
512	NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
513	}
514
515	/ Calculate timeout for tfo active disable*
516	* Return true if we are still in the active TFO disable period
517	* Return false if timeout already expired and we should use active TFO
518	*/
519	bool tcp_fastopen_active_should_disable(struct sock *sk)
520	{
521	unsigned int tfo_bh_timeout =
522	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout);
523	unsigned long timeout;
524	int tfo_da_times;
525	int multiplier;
526
527	if (!tfo_bh_timeout)
528	return false;
529
530	tfo_da_times = atomic_read(v: &sock_net(sk)->ipv4.tfo_active_disable_times);
531	if (!tfo_da_times)
532	return false;
533
534	/ Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() /
535	smp_rmb();
536
537	/ Limit timeout to max: 2^6 * initial timeout /
538	multiplier = `1` << min(tfo_da_times - `1`, `6`);
539
540	/ Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). /
541	timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) +
542	multiplier * tfo_bh_timeout * HZ;
543	if (time_before(jiffies, timeout))
544	return true;
545
546	/ Mark check bit so we can check for successful active TFO*
547	* condition and reset tfo_active_disable_times
548	*/
549	tcp_sk(sk)->syn_fastopen_ch = `1`;
550	return false;
551	}
552
553	/ Disable active TFO if FIN is the only packet in the ofo queue*
554	* and no data is received.
555	* Also check if we can reset tfo_active_disable_times if data is
556	* received successfully on a marked active TFO sockets opened on
557	* a non-loopback interface
558	*/
559	void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
560	{
561	struct tcp_sock *tp = tcp_sk(sk);
562	struct net_device *dev;
563	struct dst_entry *dst;
564	struct sk_buff *skb;
565
566	if (!tp->syn_fastopen)
567	return;
568
569	if (!tp->data_segs_in) {
570	skb = skb_rb_first(&tp->out_of_order_queue);
571	if (skb && !skb_rb_next(skb)) {
572	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
573	tcp_fastopen_active_disable(sk);
574	return;
575	}
576	}
577	} else if (tp->syn_fastopen_ch &&
578	atomic_read(v: &sock_net(sk)->ipv4.tfo_active_disable_times)) {
579	rcu_read_lock();
580	dst = __sk_dst_get(sk);
581	dev = dst ? dst_dev_rcu(dst) : NULL;
582	if (!(dev && (dev->flags & IFF_LOOPBACK)))
583	atomic_set(v: &sock_net(sk)->ipv4.tfo_active_disable_times, i: `0`);
584	rcu_read_unlock();
585	}
586	}
587
588	void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
589	{
590	u32 timeouts = inet_csk(sk)->icsk_retransmits;
591	struct tcp_sock *tp = tcp_sk(sk);
592
593	/ Broken middle-boxes may black-hole Fast Open connection during or*
594	* even after the handshake. Be extremely conservative and pause
595	* Fast Open globally after hitting the third consecutive timeout or
596	* exceeding the configured timeout limit.
597	*/
598	if ((tp->syn_fastopen \|\| tp->syn_data \|\| tp->syn_data_acked) &&
599	(timeouts == `2` \|\| (timeouts < `2` && expired))) {
600	tcp_fastopen_active_disable(sk);
601	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
602	}
603	}
604

Browse the source code of Linux/net/ipv4/tcp_fastopen.c