1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support the IPV6_V6ONLY socket option,
 *	Alexey Kuznetsov		which allows both IPv4 and IPv6 sockets
 *					to bind to a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60#include <linux/sched.h>
61#include <linux/sock_diag.h>
62
63#include <net/aligned_data.h>
64#include <net/net_namespace.h>
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/tcp_ecn.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/inet_ecn.h>
73#include <net/timewait_sock.h>
74#include <net/xfrm.h>
75#include <net/secure_seq.h>
76#include <net/busy_poll.h>
77#include <net/rstreason.h>
78#include <net/psp.h>
79
80#include <linux/inet.h>
81#include <linux/ipv6.h>
82#include <linux/stddef.h>
83#include <linux/proc_fs.h>
84#include <linux/seq_file.h>
85#include <linux/inetdevice.h>
86#include <linux/btf_ids.h>
87#include <linux/skbuff_ref.h>
88
89#include <crypto/hash.h>
90#include <linux/scatterlist.h>
91
92#include <trace/events/tcp.h>
93
94#ifdef CONFIG_TCP_MD5SIG
95static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 __be32 daddr, __be32 saddr, const struct tcphdr *th);
97#endif
98
99struct inet_hashinfo tcp_hashinfo;
100
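/* Per-CPU kernel control sockets, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to emit stateless replies (RSTs and ACKs sent outside
 * full socket context). The nested-BH local lock serializes softirq users
 * of the per-CPU socket, which matters notably on PREEMPT_RT.
 */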
101static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
102 .bh_lock = INIT_LOCAL_LOCK(bh_lock),
103};
104
105static DEFINE_MUTEX(tcp_exit_batch_mutex);
106
107static u32 tcp_v4_init_seq(const struct sk_buff *skb)
108{
109 return secure_tcp_seq(saddr: ip_hdr(skb)->daddr,
110 daddr: ip_hdr(skb)->saddr,
111 sport: tcp_hdr(skb)->dest,
112 dport: tcp_hdr(skb)->source);
113}
114
115static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
116{
117 return secure_tcp_ts_off(net, saddr: ip_hdr(skb)->daddr, daddr: ip_hdr(skb)->saddr);
118}
119
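/* Decide whether a TIME-WAIT socket occupying the 4-tuple wanted for a new
 * outgoing connection may be reused. Returns 1 if the caller may proceed,
 * in which case a reference on the timewait socket has been taken and, in
 * the non-repair case, write_seq and the timestamp state have been seeded
 * from it; returns 0 otherwise.
 */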
120int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
121{
122 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
123 const struct inet_timewait_sock *tw = inet_twsk(sk: sktw);
124 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk: sktw);
125 struct tcp_sock *tp = tcp_sk(sk);
126 int ts_recent_stamp;
127 u32 reuse_thresh;
128
129 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
130 reuse = 0;
131
132 if (reuse == 2) {
133 /* Still does not detect *everything* that goes through
134 * lo, since we require a loopback src or dst address
135 * or direct binding to 'lo' interface.
136 */
137 bool loopback = false;
138 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
139 loopback = true;
140#if IS_ENABLED(CONFIG_IPV6)
141 if (tw->tw_family == AF_INET6) {
142 if (ipv6_addr_loopback(a: &tw->tw_v6_daddr) ||
143 ipv6_addr_v4mapped_loopback(a: &tw->tw_v6_daddr) ||
144 ipv6_addr_loopback(a: &tw->tw_v6_rcv_saddr) ||
145 ipv6_addr_v4mapped_loopback(a: &tw->tw_v6_rcv_saddr))
146 loopback = true;
147 } else
148#endif
149 {
150 if (ipv4_is_loopback(addr: tw->tw_daddr) ||
151 ipv4_is_loopback(addr: tw->tw_rcv_saddr))
152 loopback = true;
153 }
154 if (!loopback)
155 reuse = 0;
156 }
157
	/* With PAWS, it is safe from the viewpoint of data integrity.
	   Even without PAWS it is safe provided sequence spaces do not
	   overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: the timestamp cache is held
	   not per host but per port pair, and the TW bucket is used as the
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
169 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
170 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
171 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
172 if (ts_recent_stamp &&
173 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
174 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
175 * and releasing the bucket lock.
176 */
177 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
178 return 0;
179
180 /* In case of repair and re-using TIME-WAIT sockets we still
181 * want to be sure that it is safe as above but honor the
182 * sequence numbers and time stamps set as part of the repair
183 * process.
184 *
185 * Without this check re-using a TIME-WAIT socket with TCP
186 * repair would accumulate a -1 on the repair assigned
187 * sequence number. The first time it is reused the sequence
188 * is -1, the second time -2, etc. This fixes that issue
189 * without appearing to create any others.
190 */
191 if (likely(!tp->repair)) {
192 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
193
194 if (!seq)
195 seq = 1;
196 WRITE_ONCE(tp->write_seq, seq);
197 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
198 tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
199 }
200
201 return 1;
202 }
203
204 return 0;
205}
206EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
207
208static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
209 int addr_len)
210{
	/* This check is replicated from tcp_v4_connect() and is intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bound specified by the user in addr_len.
	 */
215 if (addr_len < sizeof(struct sockaddr_in))
216 return -EINVAL;
217
218 sock_owned_by_me(sk);
219
220 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
221}
222
223/* This will initiate an outgoing connection. */
224int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
225{
226 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
227 struct inet_timewait_death_row *tcp_death_row;
228 struct inet_sock *inet = inet_sk(sk);
229 struct tcp_sock *tp = tcp_sk(sk);
230 struct ip_options_rcu *inet_opt;
231 struct net *net = sock_net(sk);
232 __be16 orig_sport, orig_dport;
233 __be32 daddr, nexthop;
234 struct flowi4 *fl4;
235 struct rtable *rt;
236 int err;
237
238 if (addr_len < sizeof(struct sockaddr_in))
239 return -EINVAL;
240
241 if (usin->sin_family != AF_INET)
242 return -EAFNOSUPPORT;
243
244 nexthop = daddr = usin->sin_addr.s_addr;
245 inet_opt = rcu_dereference_protected(inet->inet_opt,
246 lockdep_sock_is_held(sk));
247 if (inet_opt && inet_opt->opt.srr) {
248 if (!daddr)
249 return -EINVAL;
250 nexthop = inet_opt->opt.faddr;
251 }
252
253 orig_sport = inet->inet_sport;
254 orig_dport = usin->sin_port;
255 fl4 = &inet->cork.fl.u.ip4;
256 rt = ip_route_connect(fl4, dst: nexthop, src: inet->inet_saddr,
257 oif: sk->sk_bound_dev_if, IPPROTO_TCP, sport: orig_sport,
258 dport: orig_dport, sk);
259 if (IS_ERR(ptr: rt)) {
260 err = PTR_ERR(ptr: rt);
261 if (err == -ENETUNREACH)
262 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
263 return err;
264 }
265
266 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
267 ip_rt_put(rt);
268 return -ENETUNREACH;
269 }
270
271 if (!inet_opt || !inet_opt->opt.srr)
272 daddr = fl4->daddr;
273
274 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
275
276 if (!inet->inet_saddr) {
277 err = inet_bhash2_update_saddr(sk, saddr: &fl4->saddr, AF_INET);
278 if (err) {
279 ip_rt_put(rt);
280 return err;
281 }
282 } else {
283 sk_rcv_saddr_set(sk, addr: inet->inet_saddr);
284 }
285
286 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
287 /* Reset inherited state */
288 tp->rx_opt.ts_recent = 0;
289 tp->rx_opt.ts_recent_stamp = 0;
290 if (likely(!tp->repair))
291 WRITE_ONCE(tp->write_seq, 0);
292 }
293
294 inet->inet_dport = usin->sin_port;
295 sk_daddr_set(sk, addr: daddr);
296
297 inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
298 if (inet_opt)
299 inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
300
301 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
302
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
308 tcp_set_state(sk, state: TCP_SYN_SENT);
309 err = inet_hash_connect(death_row: tcp_death_row, sk);
310 if (err)
311 goto failure;
312
313 sk_set_txhash(sk);
314
315 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
316 sport: inet->inet_sport, dport: inet->inet_dport, sk);
317 if (IS_ERR(ptr: rt)) {
318 err = PTR_ERR(ptr: rt);
319 rt = NULL;
320 goto failure;
321 }
322 tp->tcp_usec_ts = dst_tcp_usec_ts(dst: &rt->dst);
323 /* OK, now commit destination to socket. */
324 sk->sk_gso_type = SKB_GSO_TCPV4;
325 sk_setup_caps(sk, dst: &rt->dst);
326 rt = NULL;
327
328 if (likely(!tp->repair)) {
329 if (!tp->write_seq)
330 WRITE_ONCE(tp->write_seq,
331 secure_tcp_seq(inet->inet_saddr,
332 inet->inet_daddr,
333 inet->inet_sport,
334 usin->sin_port));
335 WRITE_ONCE(tp->tsoffset,
336 secure_tcp_ts_off(net, inet->inet_saddr,
337 inet->inet_daddr));
338 }
339
340 atomic_set(v: &inet->inet_id, i: get_random_u16());
341
342 if (tcp_fastopen_defer_connect(sk, err: &err))
343 return err;
344 if (err)
345 goto failure;
346
347 err = tcp_connect(sk);
348
349 if (err)
350 goto failure;
351
352 return 0;
353
354failure:
355 /*
356 * This unhashes the socket and releases the local port,
357 * if necessary.
358 */
359 tcp_set_state(sk, state: TCP_CLOSE);
360 inet_bhash2_reset_saddr(sk);
361 ip_rt_put(rt);
362 sk->sk_route_caps = 0;
363 inet->inet_dport = 0;
364 return err;
365}
366EXPORT_IPV6_MOD(tcp_v4_connect);
367
368/*
 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
372 */
373void tcp_v4_mtu_reduced(struct sock *sk)
374{
375 struct inet_sock *inet = inet_sk(sk);
376 struct dst_entry *dst;
377 u32 mtu;
378
379 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
380 return;
381 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
382 dst = inet_csk_update_pmtu(sk, mtu);
383 if (!dst)
384 return;
385
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
389 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
390 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
391
392 mtu = dst_mtu(dst);
393
394 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
395 ip_sk_accept_pmtu(sk) &&
396 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
397 tcp_sync_mss(sk, pmtu: mtu);
398
399 /* Resend the TCP packet because it's
400 * clear that the old packet has been
401 * dropped. This is the new "fast" path mtu
402 * discovery.
403 */
404 tcp_simple_retransmit(sk);
405 } /* else let the usual retransmit timer handle it */
406}
407EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
408
409static void do_redirect(struct sk_buff *skb, struct sock *sk)
410{
411 struct dst_entry *dst = __sk_dst_check(sk, cookie: 0);
412
413 if (dst)
414 dst->ops->redirect(dst, sk, skb);
415}
416
417
418/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
419void tcp_req_err(struct sock *sk, u32 seq, bool abort)
420{
421 struct request_sock *req = inet_reqsk(sk);
422 struct net *net = sock_net(sk);
423
424 /* ICMPs are not backlogged, hence we cannot get
425 * an established socket here.
426 */
427 if (seq != tcp_rsk(req)->snt_isn) {
428 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
429 } else if (abort) {
430 /*
431 * Still in SYN_RECV, just remove it silently.
432 * There is no good way to pass the error to the newly
433 * created socket, and POSIX does not want network
434 * errors returned from accept().
435 */
436 inet_csk_reqsk_queue_drop(sk: req->rsk_listener, req);
437 tcp_listendrop(sk: req->rsk_listener);
438 }
439 reqsk_put(req);
440}
441EXPORT_IPV6_MOD(tcp_req_err);
442
443/* TCP-LD (RFC 6069) logic */
444void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
445{
446 struct inet_connection_sock *icsk = inet_csk(sk);
447 struct tcp_sock *tp = tcp_sk(sk);
448 struct sk_buff *skb;
449 s32 remaining;
450 u32 delta_us;
451
452 if (sock_owned_by_user(sk))
453 return;
454
455 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
456 !icsk->icsk_backoff)
457 return;
458
459 skb = tcp_rtx_queue_head(sk);
460 if (WARN_ON_ONCE(!skb))
461 return;
462
463 icsk->icsk_backoff--;
464 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
465 icsk->icsk_rto = inet_csk_rto_backoff(icsk, max_when: tcp_rto_max(sk));
466
467 tcp_mstamp_refresh(tp);
468 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
469 remaining = icsk->icsk_rto - usecs_to_jiffies(u: delta_us);
470
471 if (remaining > 0) {
472 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, when: remaining, pace_delay: false);
473 } else {
474 /* RTO revert clocked out retransmission.
475 * Will retransmit now.
476 */
477 tcp_retransmit_timer(sk);
478 }
479}
480EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
481
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment,
 * header points to the first 8 bytes of the TCP header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket, the ICMP is just dropped,
 * and for some paths there is no check at all.
 * A more general error queue for queuing errors for later handling
 * would probably be better.
 */
497
498int tcp_v4_err(struct sk_buff *skb, u32 info)
499{
500 const struct iphdr *iph = (const struct iphdr *)skb->data;
501 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
502 struct net *net = dev_net_rcu(dev: skb->dev);
503 const int type = icmp_hdr(skb)->type;
504 const int code = icmp_hdr(skb)->code;
505 struct request_sock *fastopen;
506 struct tcp_sock *tp;
507 u32 seq, snd_una;
508 struct sock *sk;
509 int err;
510
511 sk = __inet_lookup_established(net, saddr: iph->daddr, sport: th->dest, daddr: iph->saddr,
512 ntohs(th->source), dif: inet_iif(skb), sdif: 0);
513 if (!sk) {
514 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
515 return -ENOENT;
516 }
517 if (sk->sk_state == TCP_TIME_WAIT) {
518 /* To increase the counter of ignored icmps for TCP-AO */
519 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
520 inet_twsk_put(tw: inet_twsk(sk));
521 return 0;
522 }
523 seq = ntohl(th->seq);
524 if (sk->sk_state == TCP_NEW_SYN_RECV) {
525 tcp_req_err(sk, seq, abort: type == ICMP_PARAMETERPROB ||
526 type == ICMP_TIME_EXCEEDED ||
527 (type == ICMP_DEST_UNREACH &&
528 (code == ICMP_NET_UNREACH ||
529 code == ICMP_HOST_UNREACH)));
530 return 0;
531 }
532
533 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
534 sock_put(sk);
535 return 0;
536 }
537
538 bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC 1191) special case:
	 * we can receive locally generated ICMP messages while the socket
	 * is held.
	 */
544 if (sock_owned_by_user(sk)) {
545 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
546 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
547 }
548 if (sk->sk_state == TCP_CLOSE)
549 goto out;
550
551 if (static_branch_unlikely(&ip4_min_ttl)) {
552 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
553 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
554 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
555 goto out;
556 }
557 }
558
559 tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
561 fastopen = rcu_dereference(tp->fastopen_rsk);
562 snd_una = fastopen ? tcp_rsk(req: fastopen)->snt_isn : tp->snd_una;
563 if (sk->sk_state != TCP_LISTEN &&
564 !between(seq1: seq, seq2: snd_una, seq3: tp->snd_nxt)) {
565 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
566 goto out;
567 }
568
569 switch (type) {
570 case ICMP_REDIRECT:
571 if (!sock_owned_by_user(sk))
572 do_redirect(skb, sk);
573 goto out;
574 case ICMP_SOURCE_QUENCH:
575 /* Just silently ignore these. */
576 goto out;
577 case ICMP_PARAMETERPROB:
578 err = EPROTO;
579 break;
580 case ICMP_DEST_UNREACH:
581 if (code > NR_ICMP_UNREACH)
582 goto out;
583
584 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes,
			 * so they should go through unfragmented).
			 */
589 if (sk->sk_state == TCP_LISTEN)
590 goto out;
591
592 WRITE_ONCE(tp->mtu_info, info);
593 if (!sock_owned_by_user(sk)) {
594 tcp_v4_mtu_reduced(sk);
595 } else {
596 if (!test_and_set_bit(nr: TCP_MTU_REDUCED_DEFERRED, addr: &sk->sk_tsq_flags))
597 sock_hold(sk);
598 }
599 goto out;
600 }
601
602 err = icmp_err_convert[code].errno;
		/* Check whether this ICMP message allows reverting the RTO
		 * backoff (see RFC 6069).
		 */
606 if (!fastopen &&
607 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
608 tcp_ld_RTO_revert(sk, seq);
609 break;
610 case ICMP_TIME_EXCEEDED:
611 err = EHOSTUNREACH;
612 break;
613 default:
614 goto out;
615 }
616
617 switch (sk->sk_state) {
618 case TCP_SYN_SENT:
619 case TCP_SYN_RECV:
620 /* Only in fast or simultaneous open. If a fast open socket is
621 * already accepted it is treated as a connected one below.
622 */
623 if (fastopen && !fastopen->sk)
624 break;
625
626 ip_icmp_error(sk, skb, err, port: th->dest, info, payload: (u8 *)th);
627
628 if (!sock_owned_by_user(sk))
629 tcp_done_with_error(sk, err);
630 else
631 WRITE_ONCE(sk->sk_err_soft, err);
632 goto out;
633 }
634
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
	 * obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors as ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */
650
651 if (!sock_owned_by_user(sk) &&
652 inet_test_bit(RECVERR, sk)) {
653 WRITE_ONCE(sk->sk_err, err);
654 sk_error_report(sk);
655 } else { /* Only an error on timeout */
656 WRITE_ONCE(sk->sk_err_soft, err);
657 }
658
659out:
660 bh_unlock_sock(sk);
661 sock_put(sk);
662 return 0;
663}
664
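/* Prepare an outgoing segment for checksum offload: seed th->check with the
 * pseudo-header checksum and record where the final checksum must be written,
 * so either the NIC or the software fallback can complete it.
 */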
665void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
666{
667 struct tcphdr *th = tcp_hdr(skb);
668
669 th->check = ~tcp_v4_check(len: skb->len, saddr, daddr, base: 0);
670 skb->csum_start = skb_transport_header(skb) - skb->head;
671 skb->csum_offset = offsetof(struct tcphdr, check);
672}
673
674/* This routine computes an IPv4 TCP checksum. */
675void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
676{
677 const struct inet_sock *inet = inet_sk(sk);
678
679 __tcp_v4_send_check(skb, saddr: inet->inet_saddr, daddr: inet->inet_daddr);
680}
681EXPORT_IPV6_MOD(tcp_v4_send_check);
682
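/* Number of 32-bit words of TCP option space available in a locally generated
 * reply (RST/ACK): MAX_TCP_OPTION_SPACE (40 bytes) expressed in __be32 units.
 */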
683#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
684
685static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
686 const struct tcp_ao_hdr *aoh,
687 struct ip_reply_arg *arg, struct tcphdr *reply,
688 __be32 reply_options[REPLY_OPTIONS_LEN])
689{
690#ifdef CONFIG_TCP_AO
691 int sdif = tcp_v4_sdif(skb);
692 int dif = inet_iif(skb);
693 int l3index = sdif ? dif : 0;
694 bool allocated_traffic_key;
695 struct tcp_ao_key *key;
696 char *traffic_key;
697 bool drop = true;
698 u32 ao_sne = 0;
699 u8 keyid;
700
701 rcu_read_lock();
702 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
703 &key, &traffic_key, &allocated_traffic_key,
704 &keyid, &ao_sne))
705 goto out;
706
707 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
708 (aoh->rnext_keyid << 8) | keyid);
709 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
710 reply->doff = arg->iov[0].iov_len / 4;
711
712 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
713 key, traffic_key,
714 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
715 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
716 reply, ao_sne))
717 goto out;
718 drop = false;
719out:
720 rcu_read_unlock();
721 if (allocated_traffic_key)
722 kfree(traffic_key);
723 return drop;
724#else
725 return true;
726#endif
727}
728
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
741
742static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
743 enum sk_rst_reason reason)
744{
745 const struct tcphdr *th = tcp_hdr(skb);
746 struct {
747 struct tcphdr th;
748 __be32 opt[REPLY_OPTIONS_LEN];
749 } rep;
750 const __u8 *md5_hash_location = NULL;
751 const struct tcp_ao_hdr *aoh;
752 struct ip_reply_arg arg;
753#ifdef CONFIG_TCP_MD5SIG
754 struct tcp_md5sig_key *key = NULL;
755 unsigned char newhash[16];
756 struct sock *sk1 = NULL;
757 int genhash;
758#endif
759 u64 transmit_time = 0;
760 struct sock *ctl_sk;
761 struct net *net;
762 u32 txhash = 0;
763
764 /* Never send a reset in response to a reset. */
765 if (th->rst)
766 return;
767
	/* If sk is not NULL, we did a successful lookup and the incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
771 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
772 return;
773
774 /* Swap the send and the receive. */
775 memset(s: &rep, c: 0, n: sizeof(rep));
776 rep.th.dest = th->source;
777 rep.th.source = th->dest;
778 rep.th.doff = sizeof(struct tcphdr) / 4;
779 rep.th.rst = 1;
780
781 if (th->ack) {
782 rep.th.seq = th->ack_seq;
783 } else {
784 rep.th.ack = 1;
785 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
786 skb->len - (th->doff << 2));
787 }
788
789 memset(s: &arg, c: 0, n: sizeof(arg));
790 arg.iov[0].iov_base = (unsigned char *)&rep;
791 arg.iov[0].iov_len = sizeof(rep.th);
792
793 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
794
795 /* Invalid TCP option size or twice included auth */
796 if (tcp_parse_auth_options(th: tcp_hdr(skb), md5_hash: &md5_hash_location, aoh: &aoh))
797 return;
798
799 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, arg: &arg, reply: &rep.th, reply_options: rep.opt))
800 return;
801
802#ifdef CONFIG_TCP_MD5SIG
803 rcu_read_lock();
804 if (sk && sk_fullsock(sk)) {
805 const union tcp_md5_addr *addr;
806 int l3index;
807
		/* If sdif is set, the packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
811 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
812 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
813 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
814 } else if (md5_hash_location) {
815 const union tcp_md5_addr *addr;
816 int sdif = tcp_v4_sdif(skb);
817 int dif = inet_iif(skb);
818 int l3index;
819
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the MD5 key through
		 * the listening socket. We are not loosening security here:
		 * the incoming packet is checked against the MD5 hash of the
		 * key we find, and no RST is generated if the hash doesn't
		 * match.
		 */
827 sk1 = __inet_lookup_listener(net, NULL, doff: 0, saddr: ip_hdr(skb)->saddr,
828 sport: th->source, daddr: ip_hdr(skb)->daddr,
829 ntohs(th->source), dif, sdif);
		/* don't send an RST if we can't find a key */
831 if (!sk1)
832 goto out;
833
		/* If sdif is set, the packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
837 l3index = sdif ? dif : 0;
838 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
839 key = tcp_md5_do_lookup(sk: sk1, l3index, addr, AF_INET);
840 if (!key)
841 goto out;
842
843
844 genhash = tcp_v4_md5_hash_skb(md5_hash: newhash, key, NULL, skb);
845 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
846 goto out;
847
848 }
849
850 if (key) {
851 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
852 (TCPOPT_NOP << 16) |
853 (TCPOPT_MD5SIG << 8) |
854 TCPOLEN_MD5SIG);
855 /* Update length and the length the header thinks exists */
856 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
857 rep.th.doff = arg.iov[0].iov_len / 4;
858
859 tcp_v4_md5_hash_hdr(md5_hash: (__u8 *) &rep.opt[1],
860 key, daddr: ip_hdr(skb)->saddr,
861 saddr: ip_hdr(skb)->daddr, th: &rep.th);
862 }
863#endif
864 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
865 if (rep.opt[0] == 0) {
866 __be32 mrst = mptcp_reset_option(skb);
867
868 if (mrst) {
869 rep.opt[0] = mrst;
870 arg.iov[0].iov_len += sizeof(mrst);
871 rep.th.doff = arg.iov[0].iov_len / 4;
872 }
873 }
874
875 arg.csum = csum_tcpudp_nofold(saddr: ip_hdr(skb)->daddr,
876 daddr: ip_hdr(skb)->saddr, /* XXX */
877 len: arg.iov[0].iov_len, IPPROTO_TCP, sum: 0);
878 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
879 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
880
	/* When the socket is gone, all binding information is lost and
	 * routing might fail. No choice here: if we chose to force the
	 * input interface, we would misroute in the case of an asymmetric
	 * route.
	 */
885 if (sk)
886 arg.bound_dev_if = sk->sk_bound_dev_if;
887
888 trace_tcp_send_reset(sk, skb__nullable: skb, reason);
889
890 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
891 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
892
893 /* ECN bits of TW reset are cleared */
894 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
895 arg.uid = sock_net_uid(net, sk: sk && sk_fullsock(sk) ? sk : NULL);
896 local_bh_disable();
897 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
898 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
899
900 sock_net_set(sk: ctl_sk, net);
901 if (sk) {
902 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
903 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
904 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
905 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
906 transmit_time = tcp_transmit_time(sk);
907 xfrm_sk_clone_policy(sk: ctl_sk, osk: sk);
908 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
909 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
910 } else {
911 ctl_sk->sk_mark = 0;
912 ctl_sk->sk_priority = 0;
913 }
914 ip_send_unicast_reply(sk: ctl_sk, orig_sk: sk,
915 skb, sopt: &TCP_SKB_CB(skb)->header.h4.opt,
916 daddr: ip_hdr(skb)->saddr, saddr: ip_hdr(skb)->daddr,
917 arg: &arg, len: arg.iov[0].iov_len,
918 transmit_time, txhash);
919
920 xfrm_sk_free_policy(sk: ctl_sk);
921 sock_net_set(sk: ctl_sk, net: &init_net);
922 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
923 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
924 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
925 local_bh_enable();
926
927#ifdef CONFIG_TCP_MD5SIG
928out:
929 rcu_read_unlock();
930#endif
931}
932
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is certainly ugly. What can I do?
 */
936
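/* Build and send a bare ACK on behalf of a socket we do not fully own
 * (TIME-WAIT or request sockets). The reply is assembled on the stack,
 * optionally signed with TCP MD5 or TCP-AO, and transmitted through the
 * per-CPU control socket.
 */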
937static void tcp_v4_send_ack(const struct sock *sk,
938 struct sk_buff *skb, u32 seq, u32 ack,
939 u32 win, u32 tsval, u32 tsecr, int oif,
940 struct tcp_key *key,
941 int reply_flags, u8 tos, u32 txhash)
942{
943 const struct tcphdr *th = tcp_hdr(skb);
944 struct {
945 struct tcphdr th;
946 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
947 } rep;
948 struct net *net = sock_net(sk);
949 struct ip_reply_arg arg;
950 struct sock *ctl_sk;
951 u64 transmit_time;
952
953 memset(s: &rep.th, c: 0, n: sizeof(struct tcphdr));
954 memset(s: &arg, c: 0, n: sizeof(arg));
955
956 arg.iov[0].iov_base = (unsigned char *)&rep;
957 arg.iov[0].iov_len = sizeof(rep.th);
958 if (tsecr) {
959 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
960 (TCPOPT_TIMESTAMP << 8) |
961 TCPOLEN_TIMESTAMP);
962 rep.opt[1] = htonl(tsval);
963 rep.opt[2] = htonl(tsecr);
964 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
965 }
966
967 /* Swap the send and the receive. */
968 rep.th.dest = th->source;
969 rep.th.source = th->dest;
970 rep.th.doff = arg.iov[0].iov_len / 4;
971 rep.th.seq = htonl(seq);
972 rep.th.ack_seq = htonl(ack);
973 rep.th.ack = 1;
974 rep.th.window = htons(win);
975
976#ifdef CONFIG_TCP_MD5SIG
977 if (tcp_key_is_md5(key)) {
978 int offset = (tsecr) ? 3 : 0;
979
980 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
981 (TCPOPT_NOP << 16) |
982 (TCPOPT_MD5SIG << 8) |
983 TCPOLEN_MD5SIG);
984 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
985 rep.th.doff = arg.iov[0].iov_len/4;
986
987 tcp_v4_md5_hash_hdr(md5_hash: (__u8 *) &rep.opt[offset],
988 key: key->md5_key, daddr: ip_hdr(skb)->saddr,
989 saddr: ip_hdr(skb)->daddr, th: &rep.th);
990 }
991#endif
992#ifdef CONFIG_TCP_AO
993 if (tcp_key_is_ao(key)) {
994 int offset = (tsecr) ? 3 : 0;
995
996 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
997 (tcp_ao_len(key->ao_key) << 16) |
998 (key->ao_key->sndid << 8) |
999 key->rcv_next);
1000 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
1001 rep.th.doff = arg.iov[0].iov_len / 4;
1002
1003 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1004 key->ao_key, key->traffic_key,
1005 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1006 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1007 &rep.th, key->sne);
1008 }
1009#endif
1010 arg.flags = reply_flags;
1011 arg.csum = csum_tcpudp_nofold(saddr: ip_hdr(skb)->daddr,
1012 daddr: ip_hdr(skb)->saddr, /* XXX */
1013 len: arg.iov[0].iov_len, IPPROTO_TCP, sum: 0);
1014 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1015 if (oif)
1016 arg.bound_dev_if = oif;
1017 arg.tos = tos;
1018 arg.uid = sock_net_uid(net, sk: sk_fullsock(sk) ? sk : NULL);
1019 local_bh_disable();
1020 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1021 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1022 sock_net_set(sk: ctl_sk, net);
1023 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1024 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1025 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1026 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1027 transmit_time = tcp_transmit_time(sk);
1028 ip_send_unicast_reply(sk: ctl_sk, orig_sk: sk,
1029 skb, sopt: &TCP_SKB_CB(skb)->header.h4.opt,
1030 daddr: ip_hdr(skb)->saddr, saddr: ip_hdr(skb)->daddr,
1031 arg: &arg, len: arg.iov[0].iov_len,
1032 transmit_time, txhash);
1033
1034 sock_net_set(sk: ctl_sk, net: &init_net);
1035 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1036 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1037 local_bh_enable();
1038}
1039
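/* Acknowledge a segment that arrived for a TIME-WAIT socket, as decided by
 * tcp_timewait_state_process(). Any MD5/AO signing state comes from the
 * timewait copy of the keys, and the timewait reference is dropped before
 * returning.
 */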
1040static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1041 enum tcp_tw_status tw_status)
1042{
1043 struct inet_timewait_sock *tw = inet_twsk(sk);
1044 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1045 struct tcp_key key = {};
1046 u8 tos = tw->tw_tos;
1047
	/* Clear the ECN bits only on TW ACKs for out-of-window data or
	 * PAWS-rejected segments, while leaving the ECN bits of other TW
	 * ACKs intact, so that those ACKs are not placed in a different
	 * service queue (Classic rather than L4S).
	 */
1052 if (tw_status == TCP_TW_ACK_OOW)
1053 tos &= ~INET_ECN_MASK;
1054
1055#ifdef CONFIG_TCP_AO
1056 struct tcp_ao_info *ao_info;
1057
1058 if (static_branch_unlikely(&tcp_ao_needed.key)) {
1059 /* FIXME: the segment to-be-acked is not verified yet */
1060 ao_info = rcu_dereference(tcptw->ao_info);
1061 if (ao_info) {
1062 const struct tcp_ao_hdr *aoh;
1063
1064 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1065 inet_twsk_put(tw);
1066 return;
1067 }
1068
1069 if (aoh)
1070 key.ao_key = tcp_ao_established_key(sk, ao_info,
1071 aoh->rnext_keyid, -1);
1072 }
1073 }
1074 if (key.ao_key) {
1075 struct tcp_ao_key *rnext_key;
1076
1077 key.traffic_key = snd_other_key(key.ao_key);
1078 key.sne = READ_ONCE(ao_info->snd_sne);
1079 rnext_key = READ_ONCE(ao_info->rnext_key);
1080 key.rcv_next = rnext_key->rcvid;
1081 key.type = TCP_KEY_AO;
1082#else
1083 if (0) {
1084#endif
1085 } else if (static_branch_tcp_md5()) {
1086 key.md5_key = tcp_twsk_md5_key(tcptw);
1087 if (key.md5_key)
1088 key.type = TCP_KEY_MD5;
1089 }
1090
1091 tcp_v4_send_ack(sk, skb,
1092 seq: tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1093 win: tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1094 tsval: tcp_tw_tsval(tcptw),
1095 READ_ONCE(tcptw->tw_ts_recent),
1096 oif: tw->tw_bound_dev_if, key: &key,
1097 reply_flags: tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1098 tos,
1099 txhash: tw->tw_txhash);
1100
1101 inet_twsk_put(tw);
1102}
1103
1104static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1105 struct request_sock *req)
1106{
1107 struct tcp_key key = {};
1108
1109 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1110 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1111 */
1112 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1113 tcp_sk(sk)->snd_nxt;
1114
1115#ifdef CONFIG_TCP_AO
1116 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1117 tcp_rsk_used_ao(req)) {
1118 const union tcp_md5_addr *addr;
1119 const struct tcp_ao_hdr *aoh;
1120 int l3index;
1121
1122 /* Invalid TCP option size or twice included auth */
1123 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1124 return;
1125 if (!aoh)
1126 return;
1127
1128 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1129 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1130 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1131 aoh->rnext_keyid, -1);
1132 if (unlikely(!key.ao_key)) {
1133 /* Send ACK with any matching MKT for the peer */
1134 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* The matching key disappeared (did the user remove
			 * it?); let the handshake time out.
			 */
1138 if (!key.ao_key) {
1139 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1140 addr,
1141 ntohs(tcp_hdr(skb)->source),
1142 &ip_hdr(skb)->daddr,
1143 ntohs(tcp_hdr(skb)->dest));
1144 return;
1145 }
1146 }
1147 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1148 if (!key.traffic_key)
1149 return;
1150
1151 key.type = TCP_KEY_AO;
1152 key.rcv_next = aoh->keyid;
1153 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1154#else
1155 if (0) {
1156#endif
1157 } else if (static_branch_tcp_md5()) {
1158 const union tcp_md5_addr *addr;
1159 int l3index;
1160
1161 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1162 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1163 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1164 if (key.md5_key)
1165 key.type = TCP_KEY_MD5;
1166 }
1167
	/* ECN bits are cleared here, as for TW ACKs of out-of-window or PAWS-rejected data */
1169 tcp_v4_send_ack(sk, skb, seq,
1170 ack: tcp_rsk(req)->rcv_nxt,
1171 win: tcp_synack_window(req) >> inet_rsk(sk: req)->rcv_wscale,
1172 tsval: tcp_rsk_tsval(treq: tcp_rsk(req)),
1173 tsecr: req->ts_recent,
1174 oif: 0, key: &key,
1175 reply_flags: inet_rsk(sk: req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1176 tos: ip_hdr(skb)->tos & ~INET_ECN_MASK,
1177 READ_ONCE(tcp_rsk(req)->txhash));
1178 if (tcp_key_is_ao(key: &key))
1179 kfree(objp: key.traffic_key);
1180}
1181
1182/*
1183 * Send a SYN-ACK after having received a SYN.
1184 * This still operates on a request_sock only, not on a big
1185 * socket.
1186 */
1187static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1188 struct flowi *fl,
1189 struct request_sock *req,
1190 struct tcp_fastopen_cookie *foc,
1191 enum tcp_synack_type synack_type,
1192 struct sk_buff *syn_skb)
1193{
1194 struct inet_request_sock *ireq = inet_rsk(sk: req);
1195 struct flowi4 fl4;
1196 int err = -1;
1197 struct sk_buff *skb;
1198 u8 tos;
1199
1200 /* First, grab a route. */
1201 if (!dst && (dst = inet_csk_route_req(sk, fl4: &fl4, req)) == NULL)
1202 return -1;
1203
1204 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1205
1206 if (skb) {
1207 tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1208 __tcp_v4_send_check(skb, saddr: ireq->ir_loc_addr, daddr: ireq->ir_rmt_addr);
1209
1210 tos = READ_ONCE(inet_sk(sk)->tos);
1211
1212 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1213 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1214 (tos & INET_ECN_MASK);
1215
1216 if (!INET_ECN_is_capable(dsfield: tos) &&
1217 tcp_bpf_ca_needs_ecn(sk: (struct sock *)req))
1218 tos |= INET_ECN_ECT_0;
1219
1220 rcu_read_lock();
1221 err = ip_build_and_send_pkt(skb, sk, saddr: ireq->ir_loc_addr,
1222 daddr: ireq->ir_rmt_addr,
1223 rcu_dereference(ireq->ireq_opt),
1224 tos);
1225 rcu_read_unlock();
1226 err = net_xmit_eval(err);
1227 }
1228
1229 return err;
1230}
1231
1232/*
1233 * IPv4 request_sock destructor.
1234 */
1235static void tcp_v4_reqsk_destructor(struct request_sock *req)
1236{
1237 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1238}
1239
1240#ifdef CONFIG_TCP_MD5SIG
1241/*
1242 * RFC2385 MD5 checksumming requires a mapping of
1243 * IP address->MD5 Key.
1244 * We need to maintain these in the sk structure.
1245 */
1246
1247DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1248EXPORT_IPV6_MOD(tcp_md5_needed);
1249
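/* Key specificity ordering for MD5 lookups: a key bound to an L3 master
 * device beats an unbound key, and between otherwise equal keys the one
 * with the longer prefix wins.
 */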
1250static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1251{
1252 if (!old)
1253 return true;
1254
1255 /* l3index always overrides non-l3index */
1256 if (old->l3index && new->l3index == 0)
1257 return false;
1258 if (old->l3index == 0 && new->l3index)
1259 return true;
1260
1261 return old->prefixlen < new->prefixlen;
1262}
1263
1264/* Find the Key structure for an address. */
1265struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1266 const union tcp_md5_addr *addr,
1267 int family, bool any_l3index)
1268{
1269 const struct tcp_sock *tp = tcp_sk(sk);
1270 struct tcp_md5sig_key *key;
1271 const struct tcp_md5sig_info *md5sig;
1272 __be32 mask;
1273 struct tcp_md5sig_key *best_match = NULL;
1274 bool match;
1275
1276 /* caller either holds rcu_read_lock() or socket lock */
1277 md5sig = rcu_dereference_check(tp->md5sig_info,
1278 lockdep_sock_is_held(sk));
1279 if (!md5sig)
1280 return NULL;
1281
1282 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1283 lockdep_sock_is_held(sk)) {
1284 if (key->family != family)
1285 continue;
1286 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1287 key->l3index != l3index)
1288 continue;
1289 if (family == AF_INET) {
1290 mask = inet_make_mask(logmask: key->prefixlen);
1291 match = (key->addr.a4.s_addr & mask) ==
1292 (addr->a4.s_addr & mask);
1293#if IS_ENABLED(CONFIG_IPV6)
1294 } else if (family == AF_INET6) {
1295 match = ipv6_prefix_equal(addr1: &key->addr.a6, addr2: &addr->a6,
1296 prefixlen: key->prefixlen);
1297#endif
1298 } else {
1299 match = false;
1300 }
1301
1302 if (match && better_md5_match(old: best_match, new: key))
1303 best_match = key;
1304 }
1305 return best_match;
1306}
1307EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1308
1309static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1310 const union tcp_md5_addr *addr,
1311 int family, u8 prefixlen,
1312 int l3index, u8 flags)
1313{
1314 const struct tcp_sock *tp = tcp_sk(sk);
1315 struct tcp_md5sig_key *key;
1316 unsigned int size = sizeof(struct in_addr);
1317 const struct tcp_md5sig_info *md5sig;
1318
1319 /* caller either holds rcu_read_lock() or socket lock */
1320 md5sig = rcu_dereference_check(tp->md5sig_info,
1321 lockdep_sock_is_held(sk));
1322 if (!md5sig)
1323 return NULL;
1324#if IS_ENABLED(CONFIG_IPV6)
1325 if (family == AF_INET6)
1326 size = sizeof(struct in6_addr);
1327#endif
1328 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1329 lockdep_sock_is_held(sk)) {
1330 if (key->family != family)
1331 continue;
1332 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1333 continue;
1334 if (key->l3index != l3index)
1335 continue;
1336 if (!memcmp(&key->addr, addr, size) &&
1337 key->prefixlen == prefixlen)
1338 return key;
1339 }
1340 return NULL;
1341}
1342
1343struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1344 const struct sock *addr_sk)
1345{
1346 const union tcp_md5_addr *addr;
1347 int l3index;
1348
1349 l3index = l3mdev_master_ifindex_by_index(net: sock_net(sk),
1350 ifindex: addr_sk->sk_bound_dev_if);
1351 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1352 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1353}
1354EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1355
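/* Lazily allocate the per-socket MD5 key list on the first key addition.
 * GSO is disabled here since each transmitted segment must carry its own
 * correctly computed MD5 signature, which segmentation offload cannot
 * preserve.
 */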
1356static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1357{
1358 struct tcp_sock *tp = tcp_sk(sk);
1359 struct tcp_md5sig_info *md5sig;
1360
1361 md5sig = kmalloc(sizeof(*md5sig), gfp);
1362 if (!md5sig)
1363 return -ENOMEM;
1364
1365 sk_gso_disable(sk);
1366 INIT_HLIST_HEAD(&md5sig->head);
1367 rcu_assign_pointer(tp->md5sig_info, md5sig);
1368 return 0;
1369}
1370
1371/* This can be called on a newly created socket, from other files */
1372static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1373 int family, u8 prefixlen, int l3index, u8 flags,
1374 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1375{
1376 /* Add Key to the list */
1377 struct tcp_md5sig_key *key;
1378 struct tcp_sock *tp = tcp_sk(sk);
1379 struct tcp_md5sig_info *md5sig;
1380
1381 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1382 if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() tells KCSAN that we do not care about
		 * key mismatches, since changing the MD5 key on live
		 * flows can lead to packet drops.
		 */
1389 data_race(memcpy(key->key, newkey, newkeylen));
1390
		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen
		 * value but the old key->key[]; this is the reason we use
		 * __GFP_ZERO at sock_kmalloc() time below these lines.
		 */
1396 WRITE_ONCE(key->keylen, newkeylen);
1397
1398 return 0;
1399 }
1400
1401 md5sig = rcu_dereference_protected(tp->md5sig_info,
1402 lockdep_sock_is_held(sk));
1403
1404 key = sock_kmalloc(sk, size: sizeof(*key), priority: gfp | __GFP_ZERO);
1405 if (!key)
1406 return -ENOMEM;
1407
1408 memcpy(to: key->key, from: newkey, len: newkeylen);
1409 key->keylen = newkeylen;
1410 key->family = family;
1411 key->prefixlen = prefixlen;
1412 key->l3index = l3index;
1413 key->flags = flags;
1414 memcpy(to: &key->addr, from: addr,
1415 len: (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1416 sizeof(struct in_addr));
1417 hlist_add_head_rcu(n: &key->node, h: &md5sig->head);
1418 return 0;
1419}
1420
1421int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1422 int family, u8 prefixlen, int l3index, u8 flags,
1423 const u8 *newkey, u8 newkeylen)
1424{
1425 struct tcp_sock *tp = tcp_sk(sk);
1426
1427 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1428 if (tcp_md5_alloc_sigpool())
1429 return -ENOMEM;
1430
1431 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1432 tcp_md5_release_sigpool();
1433 return -ENOMEM;
1434 }
1435
1436 if (!static_branch_inc(&tcp_md5_needed.key)) {
1437 struct tcp_md5sig_info *md5sig;
1438
1439 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1440 rcu_assign_pointer(tp->md5sig_info, NULL);
1441 kfree_rcu(md5sig, rcu);
1442 tcp_md5_release_sigpool();
1443 return -EUSERS;
1444 }
1445 }
1446
1447 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1448 newkey, newkeylen, GFP_KERNEL);
1449}
1450EXPORT_IPV6_MOD(tcp_md5_do_add);
1451
1452int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1453 int family, u8 prefixlen, int l3index,
1454 struct tcp_md5sig_key *key)
1455{
1456 struct tcp_sock *tp = tcp_sk(sk);
1457
1458 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1459 tcp_md5_add_sigpool();
1460
1461 if (tcp_md5sig_info_add(sk, gfp: sk_gfp_mask(sk, GFP_ATOMIC))) {
1462 tcp_md5_release_sigpool();
1463 return -ENOMEM;
1464 }
1465
1466 if (!static_key_fast_inc_not_disabled(key: &tcp_md5_needed.key.key)) {
1467 struct tcp_md5sig_info *md5sig;
1468
1469 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1470 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1471 rcu_assign_pointer(tp->md5sig_info, NULL);
1472 kfree_rcu(md5sig, rcu);
1473 tcp_md5_release_sigpool();
1474 return -EUSERS;
1475 }
1476 }
1477
1478 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1479 flags: key->flags, newkey: key->key, newkeylen: key->keylen,
1480 gfp: sk_gfp_mask(sk, GFP_ATOMIC));
1481}
1482EXPORT_IPV6_MOD(tcp_md5_key_copy);
1483
1484int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1485 u8 prefixlen, int l3index, u8 flags)
1486{
1487 struct tcp_md5sig_key *key;
1488
1489 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1490 if (!key)
1491 return -ENOENT;
1492 hlist_del_rcu(n: &key->node);
1493 atomic_sub(i: sizeof(*key), v: &sk->sk_omem_alloc);
1494 kfree_rcu(key, rcu);
1495 return 0;
1496}
1497EXPORT_IPV6_MOD(tcp_md5_do_del);
1498
1499void tcp_clear_md5_list(struct sock *sk)
1500{
1501 struct tcp_sock *tp = tcp_sk(sk);
1502 struct tcp_md5sig_key *key;
1503 struct hlist_node *n;
1504 struct tcp_md5sig_info *md5sig;
1505
1506 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1507
1508 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1509 hlist_del(n: &key->node);
1510 atomic_sub(i: sizeof(*key), v: &sk->sk_omem_alloc);
1511 kfree(objp: key);
1512 }
1513}
1514
1515static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1516 sockptr_t optval, int optlen)
1517{
1518 struct tcp_md5sig cmd;
1519 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1520 const union tcp_md5_addr *addr;
1521 u8 prefixlen = 32;
1522 int l3index = 0;
1523 bool l3flag;
1524 u8 flags;
1525
1526 if (optlen < sizeof(cmd))
1527 return -EINVAL;
1528
1529 if (copy_from_sockptr(dst: &cmd, src: optval, size: sizeof(cmd)))
1530 return -EFAULT;
1531
1532 if (sin->sin_family != AF_INET)
1533 return -EINVAL;
1534
1535 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1536 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1537
1538 if (optname == TCP_MD5SIG_EXT &&
1539 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1540 prefixlen = cmd.tcpm_prefixlen;
1541 if (prefixlen > 32)
1542 return -EINVAL;
1543 }
1544
1545 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1546 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1547 struct net_device *dev;
1548
1549 rcu_read_lock();
1550 dev = dev_get_by_index_rcu(net: sock_net(sk), ifindex: cmd.tcpm_ifindex);
1551 if (dev && netif_is_l3_master(dev))
1552 l3index = dev->ifindex;
1553
1554 rcu_read_unlock();
1555
		/* It is OK to test whether dev/l3index were set outside of
		 * RCU; right now the device MUST be an L3 master.
		 */
1559 if (!dev || !l3index)
1560 return -EINVAL;
1561 }
1562
1563 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1564
1565 if (!cmd.tcpm_keylen)
1566 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1567
1568 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1569 return -EINVAL;
1570
1571 /* Don't allow keys for peers that have a matching TCP-AO key.
1572 * See the comment in tcp_ao_add_cmd()
1573 */
1574 if (tcp_ao_required(sk, saddr: addr, AF_INET, l3index: l3flag ? l3index : -1, stat_inc: false))
1575 return -EKEYREJECTED;
1576
1577 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1578 newkey: cmd.tcpm_key, newkeylen: cmd.tcpm_keylen);
1579}
1580
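/* Feed the RFC 2385 pseudo-header (saddr, daddr, protocol, length) and the
 * TCP header with its checksum field zeroed into the MD5 hash state held in
 * the signing pool.
 */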
1581static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1582 __be32 daddr, __be32 saddr,
1583 const struct tcphdr *th, int nbytes)
1584{
1585 struct tcp4_pseudohdr *bp;
1586 struct scatterlist sg;
1587 struct tcphdr *_th;
1588
1589 bp = hp->scratch;
1590 bp->saddr = saddr;
1591 bp->daddr = daddr;
1592 bp->pad = 0;
1593 bp->protocol = IPPROTO_TCP;
1594 bp->len = cpu_to_be16(nbytes);
1595
1596 _th = (struct tcphdr *)(bp + 1);
1597 memcpy(to: _th, from: th, len: sizeof(*th));
1598 _th->check = 0;
1599
1600 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1601 ahash_request_set_crypt(req: hp->req, src: &sg, NULL,
1602 nbytes: sizeof(*bp) + sizeof(*th));
1603 return crypto_ahash_update(req: hp->req);
1604}
1605
1606static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1607 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1608{
1609 struct tcp_sigpool hp;
1610
1611 if (tcp_sigpool_start(id: tcp_md5_sigpool_id, c: &hp))
1612 goto clear_hash_nostart;
1613
1614 if (crypto_ahash_init(req: hp.req))
1615 goto clear_hash;
1616 if (tcp_v4_md5_hash_headers(hp: &hp, daddr, saddr, th, nbytes: th->doff << 2))
1617 goto clear_hash;
1618 if (tcp_md5_hash_key(hp: &hp, key))
1619 goto clear_hash;
1620 ahash_request_set_crypt(req: hp.req, NULL, result: md5_hash, nbytes: 0);
1621 if (crypto_ahash_final(req: hp.req))
1622 goto clear_hash;
1623
1624 tcp_sigpool_end(c: &hp);
1625 return 0;
1626
1627clear_hash:
1628 tcp_sigpool_end(c: &hp);
1629clear_hash_nostart:
1630 memset(s: md5_hash, c: 0, n: 16);
1631 return 1;
1632}
1633
1634int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1635 const struct sock *sk,
1636 const struct sk_buff *skb)
1637{
1638 const struct tcphdr *th = tcp_hdr(skb);
1639 struct tcp_sigpool hp;
1640 __be32 saddr, daddr;
1641
1642 if (sk) { /* valid for establish/request sockets */
1643 saddr = sk->sk_rcv_saddr;
1644 daddr = sk->sk_daddr;
1645 } else {
1646 const struct iphdr *iph = ip_hdr(skb);
1647 saddr = iph->saddr;
1648 daddr = iph->daddr;
1649 }
1650
1651 if (tcp_sigpool_start(id: tcp_md5_sigpool_id, c: &hp))
1652 goto clear_hash_nostart;
1653
1654 if (crypto_ahash_init(req: hp.req))
1655 goto clear_hash;
1656
1657 if (tcp_v4_md5_hash_headers(hp: &hp, daddr, saddr, th, nbytes: skb->len))
1658 goto clear_hash;
1659 if (tcp_sigpool_hash_skb_data(hp: &hp, skb, header_len: th->doff << 2))
1660 goto clear_hash;
1661 if (tcp_md5_hash_key(hp: &hp, key))
1662 goto clear_hash;
1663 ahash_request_set_crypt(req: hp.req, NULL, result: md5_hash, nbytes: 0);
1664 if (crypto_ahash_final(req: hp.req))
1665 goto clear_hash;
1666
1667 tcp_sigpool_end(c: &hp);
1668 return 0;
1669
1670clear_hash:
1671 tcp_sigpool_end(c: &hp);
1672clear_hash_nostart:
1673 memset(s: md5_hash, c: 0, n: 16);
1674 return 1;
1675}
1676EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1677
1678#endif
1679
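/* Fill in the IPv4-specific parts of a new request sock from the incoming
 * SYN: the address pair and any IP options that must be echoed back on the
 * SYN-ACK.
 */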
1680static void tcp_v4_init_req(struct request_sock *req,
1681 const struct sock *sk_listener,
1682 struct sk_buff *skb)
1683{
1684 struct inet_request_sock *ireq = inet_rsk(sk: req);
1685 struct net *net = sock_net(sk: sk_listener);
1686
1687 sk_rcv_saddr_set(sk: req_to_sk(req), addr: ip_hdr(skb)->daddr);
1688 sk_daddr_set(sk: req_to_sk(req), addr: ip_hdr(skb)->saddr);
1689 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1690}
1691
1692static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1693 struct sk_buff *skb,
1694 struct flowi *fl,
1695 struct request_sock *req,
1696 u32 tw_isn)
1697{
1698 tcp_v4_init_req(req, sk_listener: sk, skb);
1699
1700 if (security_inet_conn_request(sk, skb, req))
1701 return NULL;
1702
1703 return inet_csk_route_req(sk, fl4: &fl->u.ip4, req);
1704}
1705
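/* The two ops tables below glue the protocol-independent request-sock code
 * in tcp_input.c / tcp_output.c to the IPv4 specifics above: how to route,
 * sign, ack, reset and answer a connection request.
 */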
1706struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1707 .family = PF_INET,
1708 .obj_size = sizeof(struct tcp_request_sock),
1709 .send_ack = tcp_v4_reqsk_send_ack,
1710 .destructor = tcp_v4_reqsk_destructor,
1711 .send_reset = tcp_v4_send_reset,
1712 .syn_ack_timeout = tcp_syn_ack_timeout,
1713};
1714
1715const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1716 .mss_clamp = TCP_MSS_DEFAULT,
1717#ifdef CONFIG_TCP_MD5SIG
1718 .req_md5_lookup = tcp_v4_md5_lookup,
1719 .calc_md5_hash = tcp_v4_md5_hash_skb,
1720#endif
1721#ifdef CONFIG_TCP_AO
1722 .ao_lookup = tcp_v4_ao_lookup_rsk,
1723 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1724 .ao_synack_hash = tcp_v4_ao_synack_hash,
1725#endif
1726#ifdef CONFIG_SYN_COOKIES
1727 .cookie_init_seq = cookie_v4_init_sequence,
1728#endif
1729 .route_req = tcp_v4_route_req,
1730 .init_seq = tcp_v4_init_seq,
1731 .init_ts_off = tcp_v4_init_ts_off,
1732 .send_synack = tcp_v4_send_synack,
1733};
1734
1735int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1736{
	/* Never answer SYNs sent to broadcast or multicast addresses */
1738 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1739 goto drop;
1740
1741 return tcp_conn_request(rsk_ops: &tcp_request_sock_ops,
1742 af_ops: &tcp_request_sock_ipv4_ops, sk, skb);
1743
1744drop:
1745 tcp_listendrop(sk);
1746 return 0;
1747}
1748EXPORT_IPV6_MOD(tcp_v4_conn_request);
1749
1750
1751/*
 * The three-way handshake has completed and the connection request has
 * been validated - now create the new socket.
1754 */
1755struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1756 struct request_sock *req,
1757 struct dst_entry *dst,
1758 struct request_sock *req_unhash,
1759 bool *own_req)
1760{
1761 struct inet_request_sock *ireq;
1762 bool found_dup_sk = false;
1763 struct inet_sock *newinet;
1764 struct tcp_sock *newtp;
1765 struct sock *newsk;
1766#ifdef CONFIG_TCP_MD5SIG
1767 const union tcp_md5_addr *addr;
1768 struct tcp_md5sig_key *key;
1769 int l3index;
1770#endif
1771 struct ip_options_rcu *inet_opt;
1772
1773 if (sk_acceptq_is_full(sk))
1774 goto exit_overflow;
1775
1776 newsk = tcp_create_openreq_child(sk, req, skb);
1777 if (!newsk)
1778 goto exit_nonewsk;
1779
1780 newsk->sk_gso_type = SKB_GSO_TCPV4;
1781 inet_sk_rx_dst_set(sk: newsk, skb);
1782
1783 newtp = tcp_sk(newsk);
1784 newinet = inet_sk(newsk);
1785 ireq = inet_rsk(sk: req);
1786 inet_opt = rcu_dereference(ireq->ireq_opt);
1787 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1788 newinet->mc_index = inet_iif(skb);
1789 newinet->mc_ttl = ip_hdr(skb)->ttl;
1790 newinet->rcv_tos = ip_hdr(skb)->tos;
1791 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1792 if (inet_opt)
1793 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1794 atomic_set(v: &newinet->inet_id, i: get_random_u16());
1795
1796 /* Set ToS of the new socket based upon the value of incoming SYN.
1797 * ECT bits are set later in tcp_init_transfer().
1798 */
1799 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1800 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1801
1802 if (!dst) {
1803 dst = inet_csk_route_child_sock(sk, newsk, req);
1804 if (!dst)
1805 goto put_and_exit;
1806 } else {
		/* syncookie case: see end of cookie_v4_check() */
1808 }
1809 sk_setup_caps(sk: newsk, dst);
1810
1811 tcp_ca_openreq_child(sk: newsk, dst);
1812
1813 tcp_sync_mss(sk: newsk, pmtu: dst_mtu(dst));
1814 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), mss: dst_metric_advmss(dst));
1815
1816 tcp_initialize_rcv_mss(sk: newsk);
1817
1818#ifdef CONFIG_TCP_MD5SIG
1819 l3index = l3mdev_master_ifindex_by_index(net: sock_net(sk), ifindex: ireq->ir_iif);
1820 /* Copy over the MD5 key from the original socket */
1821 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1822 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1823 if (key && !tcp_rsk_used_ao(req)) {
1824 if (tcp_md5_key_copy(sk: newsk, addr, AF_INET, prefixlen: 32, l3index, key))
1825 goto put_and_exit;
1826 sk_gso_disable(sk: newsk);
1827 }
1828#endif
1829#ifdef CONFIG_TCP_AO
1830 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1831 goto put_and_exit; /* OOM, release back memory */
1832#endif
1833
1834 if (__inet_inherit_port(sk, child: newsk) < 0)
1835 goto put_and_exit;
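	/* Hash the child into the established table.  *own_req tells the
	 * caller whether we won the race against a concurrent insert of
	 * the same four-tuple, which can happen with syncookies.
	 */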
1836 *own_req = inet_ehash_nolisten(sk: newsk, osk: req_to_sk(req: req_unhash),
1837 found_dup_sk: &found_dup_sk);
1838 if (likely(*own_req)) {
1839 tcp_move_syn(tp: newtp, req);
1840 ireq->ireq_opt = NULL;
1841 } else {
1842 newinet->inet_opt = NULL;
1843
1844 if (!req_unhash && found_dup_sk) {
1845			/* This code path should only be executed in the
1846			 * syncookie case
1847 */
1848 bh_unlock_sock(newsk);
1849 sock_put(sk: newsk);
1850 newsk = NULL;
1851 }
1852 }
1853 return newsk;
1854
1855exit_overflow:
1856 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1857exit_nonewsk:
1858 dst_release(dst);
1859exit:
1860 tcp_listendrop(sk);
1861 return NULL;
1862put_and_exit:
1863 newinet->inet_opt = NULL;
1864 inet_csk_prepare_forced_close(sk: newsk);
1865 tcp_done(sk: newsk);
1866 goto exit;
1867}
1868EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1869
1870static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1871{
1872#ifdef CONFIG_SYN_COOKIES
1873 const struct tcphdr *th = tcp_hdr(skb);
1874
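	/* A non-SYN segment reaching a listener may be the ACK that
	 * completes a syncookie handshake; cookie_v4_check() validates the
	 * cookie and, if it is good, creates the child socket directly.
	 */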
1875 if (!th->syn)
1876 sk = cookie_v4_check(sk, skb);
1877#endif
1878 return sk;
1879}
1880
1881u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1882 struct tcphdr *th, u32 *cookie)
1883{
1884 u16 mss = 0;
1885#ifdef CONFIG_SYN_COOKIES
1886 mss = tcp_get_syncookie_mss(rsk_ops: &tcp_request_sock_ops,
1887 af_ops: &tcp_request_sock_ipv4_ops, sk, th);
1888 if (mss) {
1889 *cookie = __cookie_v4_init_sequence(iph, th, mssp: &mss);
1890 tcp_synq_overflow(sk);
1891 }
1892#endif
1893 return mss;
1894}
1895
1896INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1897 u32));
1898/* The socket must have its spinlock held when we get
1899 * here, unless it is a TCP_LISTEN socket.
1900 *
1901 * We have a potential double-lock case here, so even when
1902 * doing backlog processing we use the BH locking scheme.
1903 * This is because we cannot sleep with the original spinlock
1904 * held.
1905 */
1906int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1907{
1908 enum skb_drop_reason reason;
1909 struct sock *rsk;
1910
1911 reason = psp_sk_rx_policy_check(sk, skb);
1912 if (reason)
1913 goto err_discard;
1914
1915 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1916 struct dst_entry *dst;
1917
1918 dst = rcu_dereference_protected(sk->sk_rx_dst,
1919 lockdep_sock_is_held(sk));
1920
1921 sock_rps_save_rxhash(sk, skb);
1922 sk_mark_napi_id(sk, skb);
1923 if (dst) {
1924 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1925 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1926 dst, 0)) {
1927 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1928 dst_release(dst);
1929 }
1930 }
1931 tcp_rcv_established(sk, skb);
1932 return 0;
1933 }
1934
1935 if (tcp_checksum_complete(skb))
1936 goto csum_err;
1937
1938 if (sk->sk_state == TCP_LISTEN) {
1939 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1940
1941 if (!nsk)
1942 return 0;
1943 if (nsk != sk) {
1944 reason = tcp_child_process(parent: sk, child: nsk, skb);
1945 if (reason) {
1946 rsk = nsk;
1947 goto reset;
1948 }
1949 return 0;
1950 }
1951 } else
1952 sock_rps_save_rxhash(sk, skb);
1953
1954 reason = tcp_rcv_state_process(sk, skb);
1955 if (reason) {
1956 rsk = sk;
1957 goto reset;
1958 }
1959 return 0;
1960
1961reset:
1962 tcp_v4_send_reset(sk: rsk, skb, reason: sk_rst_convert_drop_reason(reason));
1963discard:
1964 sk_skb_reason_drop(sk, skb, reason);
1965 /* Be careful here. If this function gets more complicated and
1966 * gcc suffers from register pressure on the x86, sk (in %ebx)
1967 * might be destroyed here. This current version compiles correctly,
1968 * but you have been warned.
1969 */
1970 return 0;
1971
1972csum_err:
1973 reason = SKB_DROP_REASON_TCP_CSUM;
1974 trace_tcp_bad_csum(skb);
1975 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1976err_discard:
1977 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1978 goto discard;
1979}
1980EXPORT_SYMBOL(tcp_v4_do_rcv);
1981
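/* Early demux: look up an established socket before the routing decision,
 * so the socket and its cached rx dst can be attached to the skb and the
 * full lookup in tcp_v4_rcv() is avoided.
 */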
1982int tcp_v4_early_demux(struct sk_buff *skb)
1983{
1984 struct net *net = dev_net_rcu(dev: skb->dev);
1985 const struct iphdr *iph;
1986 const struct tcphdr *th;
1987 struct sock *sk;
1988
1989 if (skb->pkt_type != PACKET_HOST)
1990 return 0;
1991
1992 if (!pskb_may_pull(skb, len: skb_transport_offset(skb) + sizeof(struct tcphdr)))
1993 return 0;
1994
1995 iph = ip_hdr(skb);
1996 th = tcp_hdr(skb);
1997
1998 if (th->doff < sizeof(struct tcphdr) / 4)
1999 return 0;
2000
2001 sk = __inet_lookup_established(net, saddr: iph->saddr, sport: th->source,
2002 daddr: iph->daddr, ntohs(th->dest),
2003 dif: skb->skb_iif, sdif: inet_sdif(skb));
2004 if (sk) {
2005 skb->sk = sk;
2006 skb->destructor = sock_edemux;
2007 if (sk_fullsock(sk)) {
2008 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2009
2010 if (dst)
2011 dst = dst_check(dst, cookie: 0);
2012 if (dst &&
2013 sk->sk_rx_dst_ifindex == skb->skb_iif)
2014 skb_dst_set_noref(skb, dst);
2015 }
2016 }
2017 return 0;
2018}
2019
2020bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2021 enum skb_drop_reason *reason)
2022{
2023 u32 tail_gso_size, tail_gso_segs;
2024 struct skb_shared_info *shinfo;
2025 const struct tcphdr *th;
2026 struct tcphdr *thtail;
2027 struct sk_buff *tail;
2028 unsigned int hdrlen;
2029 bool fragstolen;
2030 u32 gso_segs;
2031 u32 gso_size;
2032 u64 limit;
2033 int delta;
2034 int err;
2035
2036 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2037 * we can fix skb->truesize to its real value to avoid future drops.
2038 * This is valid because skb is not yet charged to the socket.
2039	 * It has been noticed that pure SACK packets were sometimes dropped
2040	 * (if cooked by drivers without the copybreak feature).
2041 */
2042 skb_condense(skb);
2043
2044 tcp_cleanup_skb(skb);
2045
2046 if (unlikely(tcp_checksum_complete(skb))) {
2047 bh_unlock_sock(sk);
2048 trace_tcp_bad_csum(skb);
2049 *reason = SKB_DROP_REASON_TCP_CSUM;
2050 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2051 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2052 return true;
2053 }
2054
2055 /* Attempt coalescing to last skb in backlog, even if we are
2056 * above the limits.
2057 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2058 */
2059 th = (const struct tcphdr *)skb->data;
2060 hdrlen = th->doff * 4;
2061
2062 tail = sk->sk_backlog.tail;
2063 if (!tail)
2064 goto no_coalesce;
2065 thtail = (struct tcphdr *)tail->data;
2066
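	/* Coalesce only strictly contiguous segments that agree on the
	 * DSCP/ECN bits, carry no SYN/RST/URG, both have ACK set, match on
	 * the ECN-related flags and have identical TCP options; anything
	 * else is queued to the backlog as-is.
	 */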
2067 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2068 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2069 ((TCP_SKB_CB(tail)->tcp_flags |
2070 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2071 !((TCP_SKB_CB(tail)->tcp_flags &
2072 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2073 ((TCP_SKB_CB(tail)->tcp_flags ^
2074 TCP_SKB_CB(skb)->tcp_flags) &
2075 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2076 !tcp_skb_can_collapse_rx(to: tail, from: skb) ||
2077 thtail->doff != th->doff ||
2078 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
2079 /* prior to PSP Rx policy check, retain exact PSP metadata */
2080 psp_skb_coalesce_diff(one: tail, two: skb))
2081 goto no_coalesce;
2082
2083 __skb_pull(skb, len: hdrlen);
2084
2085 shinfo = skb_shinfo(skb);
2086 gso_size = shinfo->gso_size ?: skb->len;
2087 gso_segs = shinfo->gso_segs ?: 1;
2088
2089 shinfo = skb_shinfo(tail);
2090 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2091 tail_gso_segs = shinfo->gso_segs ?: 1;
2092
2093 if (skb_try_coalesce(to: tail, from: skb, fragstolen: &fragstolen, delta_truesize: &delta)) {
2094 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2095
2096 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2097 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2098 thtail->window = th->window;
2099 }
2100
2101 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2102 * thtail->fin, so that the fast path in tcp_rcv_established()
2103 * is not entered if we append a packet with a FIN.
2104 * SYN, RST, URG are not present.
2105 * ACK is set on both packets.
2106		 * PSH : the TCP stack does not really care about it,
2107		 * at least for 'GRO' packets.
2108 */
2109 thtail->fin |= th->fin;
2110 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2111
2112 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2113 TCP_SKB_CB(tail)->has_rxtstamp = true;
2114 tail->tstamp = skb->tstamp;
2115 skb_hwtstamps(skb: tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2116 }
2117
2118		/* Not as strict as GRO. We only need to carry the max mss value */
2119 shinfo->gso_size = max(gso_size, tail_gso_size);
2120 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2121
2122 sk->sk_backlog.len += delta;
2123 __NET_INC_STATS(sock_net(sk),
2124 LINUX_MIB_TCPBACKLOGCOALESCE);
2125 kfree_skb_partial(skb, head_stolen: fragstolen);
2126 return false;
2127 }
2128 __skb_push(skb, len: hdrlen);
2129
2130no_coalesce:
2131 /* sk->sk_backlog.len is reset only at the end of __release_sock().
2132 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2133 * sk_rcvbuf in normal conditions.
2134 */
2135 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2136
2137 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2138
2139	/* Only the socket owner can try to collapse/prune rx queues
2140	 * to reduce memory overhead, so add a little headroom here.
2141	 * Only a few socket backlogs are likely to be non-empty at the same time.
2142 */
2143 limit += 64 * 1024;
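	/* The overall budget is thus 2 * rcvbuf + sndbuf / 2 + 64KB; for
	 * example, a 128KB rcvbuf and a 16KB sndbuf yield roughly 328KB.
	 */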
2144
2145 limit = min_t(u64, limit, UINT_MAX);
2146
2147 err = sk_add_backlog(sk, skb, limit);
2148 if (unlikely(err)) {
2149 bh_unlock_sock(sk);
2150 if (err == -ENOMEM) {
2151 *reason = SKB_DROP_REASON_PFMEMALLOC;
2152 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2153 } else {
2154 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2155 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2156 }
2157 return true;
2158 }
2159 return false;
2160}
2161EXPORT_IPV6_MOD(tcp_add_backlog);
2162
2163int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
2164{
2165 struct tcphdr *th = (struct tcphdr *)skb->data;
2166
2167 return sk_filter_trim_cap(sk, skb, cap: th->doff * 4, reason);
2168}
2169EXPORT_IPV6_MOD(tcp_filter);
2170
2171static void tcp_v4_restore_cb(struct sk_buff *skb)
2172{
2173 memmove(IPCB(skb), src: &TCP_SKB_CB(skb)->header.h4,
2174 count: sizeof(struct inet_skb_parm));
2175}
2176
2177static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2178 const struct tcphdr *th)
2179{
2180	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2181	 * barrier() makes sure the compiler won't play aliasing games.
2182 */
2183 memmove(dest: &TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2184 count: sizeof(struct inet_skb_parm));
2185 barrier();
2186
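	/* SYN and FIN each consume one unit of sequence space, hence the
	 * th->syn + th->fin terms in the end_seq computation below.
	 */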
2187 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2188 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2189 skb->len - th->doff * 4);
2190 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2191 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2192 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2193 TCP_SKB_CB(skb)->sacked = 0;
2194 TCP_SKB_CB(skb)->has_rxtstamp =
2195 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2196}
2197
2198/*
2199 * From tcp_input.c
2200 */
2201
2202int tcp_v4_rcv(struct sk_buff *skb)
2203{
2204 struct net *net = dev_net_rcu(dev: skb->dev);
2205 enum skb_drop_reason drop_reason;
2206 enum tcp_tw_status tw_status;
2207 int sdif = inet_sdif(skb);
2208 int dif = inet_iif(skb);
2209 const struct iphdr *iph;
2210 const struct tcphdr *th;
2211 struct sock *sk = NULL;
2212 bool refcounted;
2213 int ret;
2214 u32 isn;
2215
2216 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2217 if (skb->pkt_type != PACKET_HOST)
2218 goto discard_it;
2219
2220 /* Count it even if it's bad */
2221 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2222
2223 if (!pskb_may_pull(skb, len: sizeof(struct tcphdr)))
2224 goto discard_it;
2225
2226 th = (const struct tcphdr *)skb->data;
2227
2228 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2229 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2230 goto bad_packet;
2231 }
2232 if (!pskb_may_pull(skb, len: th->doff * 4))
2233 goto discard_it;
2234
2235 /* An explanation is required here, I think.
2236 * Packet length and doff are validated by header prediction,
2237	 * provided the th->doff==0 case is eliminated.
2238 * So, we defer the checks. */
2239
2240 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2241 goto csum_error;
2242
2243 th = (const struct tcphdr *)skb->data;
2244 iph = ip_hdr(skb);
2245lookup:
2246 sk = __inet_lookup_skb(skb, doff: __tcp_hdrlen(th), sport: th->source,
2247 dport: th->dest, sdif, refcounted: &refcounted);
2248 if (!sk)
2249 goto no_tcp_socket;
2250
2251 if (sk->sk_state == TCP_TIME_WAIT)
2252 goto do_time_wait;
2253
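	/* A TCP_NEW_SYN_RECV entry is a request_sock: the segment is
	 * processed on behalf of its listener, and tcp_check_req() may
	 * complete the handshake and create the full child socket.
	 */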
2254 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2255 struct request_sock *req = inet_reqsk(sk);
2256 bool req_stolen = false;
2257 struct sock *nsk;
2258
2259 sk = req->rsk_listener;
2260 if (!xfrm4_policy_check(sk, dir: XFRM_POLICY_IN, skb))
2261 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2262 else
2263 drop_reason = tcp_inbound_hash(sk, req, skb,
2264 saddr: &iph->saddr, daddr: &iph->daddr,
2265 AF_INET, dif, sdif);
2266 if (unlikely(drop_reason)) {
2267 sk_drops_skbadd(sk, skb);
2268 reqsk_put(req);
2269 goto discard_it;
2270 }
2271 if (tcp_checksum_complete(skb)) {
2272 reqsk_put(req);
2273 goto csum_error;
2274 }
2275 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2276 nsk = reuseport_migrate_sock(sk, migrating_sk: req_to_sk(req), skb);
2277 if (!nsk) {
2278 inet_csk_reqsk_queue_drop_and_put(sk, req);
2279 goto lookup;
2280 }
2281 sk = nsk;
2282 /* reuseport_migrate_sock() has already held one sk_refcnt
2283 * before returning.
2284 */
2285 } else {
2286 /* We own a reference on the listener, increase it again
2287 * as we might lose it too soon.
2288 */
2289 sock_hold(sk);
2290 }
2291 refcounted = true;
2292 nsk = NULL;
2293 if (!tcp_filter(sk, skb, reason: &drop_reason)) {
2294 th = (const struct tcphdr *)skb->data;
2295 iph = ip_hdr(skb);
2296 tcp_v4_fill_cb(skb, iph, th);
2297 nsk = tcp_check_req(sk, skb, req, fastopen: false, lost_race: &req_stolen,
2298 drop_reason: &drop_reason);
2299 }
2300 if (!nsk) {
2301 reqsk_put(req);
2302 if (req_stolen) {
2303 /* Another cpu got exclusive access to req
2304 * and created a full blown socket.
2305 * Try to feed this packet to this socket
2306 * instead of discarding it.
2307 */
2308 tcp_v4_restore_cb(skb);
2309 sock_put(sk);
2310 goto lookup;
2311 }
2312 goto discard_and_relse;
2313 }
2314 nf_reset_ct(skb);
2315 if (nsk == sk) {
2316 reqsk_put(req);
2317 tcp_v4_restore_cb(skb);
2318 } else {
2319 drop_reason = tcp_child_process(parent: sk, child: nsk, skb);
2320 if (drop_reason) {
2321 enum sk_rst_reason rst_reason;
2322
2323 rst_reason = sk_rst_convert_drop_reason(reason: drop_reason);
2324 tcp_v4_send_reset(sk: nsk, skb, reason: rst_reason);
2325 goto discard_and_relse;
2326 }
2327 sock_put(sk);
2328 return 0;
2329 }
2330 }
2331
2332process:
2333 if (static_branch_unlikely(&ip4_min_ttl)) {
2334 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2335 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2336 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2337 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2338 goto discard_and_relse;
2339 }
2340 }
2341
2342 if (!xfrm4_policy_check(sk, dir: XFRM_POLICY_IN, skb)) {
2343 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2344 goto discard_and_relse;
2345 }
2346
2347 drop_reason = tcp_inbound_hash(sk, NULL, skb, saddr: &iph->saddr, daddr: &iph->daddr,
2348 AF_INET, dif, sdif);
2349 if (drop_reason)
2350 goto discard_and_relse;
2351
2352 nf_reset_ct(skb);
2353
2354 if (tcp_filter(sk, skb, reason: &drop_reason))
2355 goto discard_and_relse;
2356
2357 th = (const struct tcphdr *)skb->data;
2358 iph = ip_hdr(skb);
2359 tcp_v4_fill_cb(skb, iph, th);
2360
2361 skb->dev = NULL;
2362
2363 if (sk->sk_state == TCP_LISTEN) {
2364 ret = tcp_v4_do_rcv(sk, skb);
2365 goto put_and_return;
2366 }
2367
2368 sk_incoming_cpu_update(sk);
2369
2370 bh_lock_sock_nested(sk);
2371 tcp_segs_in(tcp_sk(sk), skb);
2372 ret = 0;
2373 if (!sock_owned_by_user(sk)) {
2374 ret = tcp_v4_do_rcv(sk, skb);
2375 } else {
2376 if (tcp_add_backlog(sk, skb, reason: &drop_reason))
2377 goto discard_and_relse;
2378 }
2379 bh_unlock_sock(sk);
2380
2381put_and_return:
2382 if (refcounted)
2383 sock_put(sk);
2384
2385 return ret;
2386
2387no_tcp_socket:
2388 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2389 if (!xfrm4_policy_check(NULL, dir: XFRM_POLICY_IN, skb))
2390 goto discard_it;
2391
2392 tcp_v4_fill_cb(skb, iph, th);
2393
2394 if (tcp_checksum_complete(skb)) {
2395csum_error:
2396 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2397 trace_tcp_bad_csum(skb);
2398 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2399bad_packet:
2400 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2401 } else {
2402 tcp_v4_send_reset(NULL, skb, reason: sk_rst_convert_drop_reason(reason: drop_reason));
2403 }
2404
2405discard_it:
2406 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2407 /* Discard frame. */
2408 sk_skb_reason_drop(sk, skb, reason: drop_reason);
2409 return 0;
2410
2411discard_and_relse:
2412 sk_drops_skbadd(sk, skb);
2413 if (refcounted)
2414 sock_put(sk);
2415 goto discard_it;
2416
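/* TIME_WAIT processing: depending on what tcp_timewait_state_process()
 * decides, the segment may revive the connection via a new SYN, trigger
 * an ACK or RST, or simply be dropped.
 */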
2417do_time_wait:
2418 if (!xfrm4_policy_check(NULL, dir: XFRM_POLICY_IN, skb)) {
2419 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2420 inet_twsk_put(tw: inet_twsk(sk));
2421 goto discard_it;
2422 }
2423
2424 tcp_v4_fill_cb(skb, iph, th);
2425
2426 if (tcp_checksum_complete(skb)) {
2427 inet_twsk_put(tw: inet_twsk(sk));
2428 goto csum_error;
2429 }
2430
2431 tw_status = tcp_timewait_state_process(tw: inet_twsk(sk), skb, th, tw_isn: &isn,
2432 drop_reason: &drop_reason);
2433 switch (tw_status) {
2434 case TCP_TW_SYN: {
2435 struct sock *sk2 = inet_lookup_listener(net, skb, doff: __tcp_hdrlen(th),
2436 saddr: iph->saddr, sport: th->source,
2437 daddr: iph->daddr, dport: th->dest,
2438 dif: inet_iif(skb),
2439 sdif);
2440 if (sk2) {
2441 inet_twsk_deschedule_put(tw: inet_twsk(sk));
2442 sk = sk2;
2443 tcp_v4_restore_cb(skb);
2444 refcounted = false;
2445 __this_cpu_write(tcp_tw_isn, isn);
2446 goto process;
2447 }
2448
2449 drop_reason = psp_twsk_rx_policy_check(tw: inet_twsk(sk), skb);
2450 if (drop_reason)
2451 break;
2452 }
2453 /* to ACK */
2454 fallthrough;
2455 case TCP_TW_ACK:
2456 case TCP_TW_ACK_OOW:
2457 tcp_v4_timewait_ack(sk, skb, tw_status);
2458 break;
2459 case TCP_TW_RST:
2460 tcp_v4_send_reset(sk, skb, reason: SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2461 inet_twsk_deschedule_put(tw: inet_twsk(sk));
2462 goto discard_it;
2463 case TCP_TW_SUCCESS:;
2464 }
2465 goto discard_it;
2466}
2467
2468static struct timewait_sock_ops tcp_timewait_sock_ops = {
2469 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2470};
2471
2472void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2473{
2474 struct dst_entry *dst = skb_dst(skb);
2475
2476 if (dst && dst_hold_safe(dst)) {
2477 rcu_assign_pointer(sk->sk_rx_dst, dst);
2478 sk->sk_rx_dst_ifindex = skb->skb_iif;
2479 }
2480}
2481EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2482
2483const struct inet_connection_sock_af_ops ipv4_specific = {
2484 .queue_xmit = ip_queue_xmit,
2485 .send_check = tcp_v4_send_check,
2486 .rebuild_header = inet_sk_rebuild_header,
2487 .sk_rx_dst_set = inet_sk_rx_dst_set,
2488 .conn_request = tcp_v4_conn_request,
2489 .syn_recv_sock = tcp_v4_syn_recv_sock,
2490 .net_header_len = sizeof(struct iphdr),
2491 .setsockopt = ip_setsockopt,
2492 .getsockopt = ip_getsockopt,
2493 .mtu_reduced = tcp_v4_mtu_reduced,
2494};
2495EXPORT_IPV6_MOD(ipv4_specific);
2496
2497#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2498static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2499#ifdef CONFIG_TCP_MD5SIG
2500 .md5_lookup = tcp_v4_md5_lookup,
2501 .calc_md5_hash = tcp_v4_md5_hash_skb,
2502 .md5_parse = tcp_v4_parse_md5_keys,
2503#endif
2504#ifdef CONFIG_TCP_AO
2505 .ao_lookup = tcp_v4_ao_lookup,
2506 .calc_ao_hash = tcp_v4_ao_hash_skb,
2507 .ao_parse = tcp_v4_parse_ao,
2508 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2509#endif
2510};
2511
2512static void tcp4_destruct_sock(struct sock *sk)
2513{
2514 tcp_md5_destruct_sock(sk);
2515 tcp_ao_destroy_sock(sk, twsk: false);
2516 inet_sock_destruct(sk);
2517}
2518#endif
2519
2520/* NOTE: A lot of things are set to zero explicitly by the call to
2521 * sk_alloc(), so they need not be done here.
2522 */
2523static int tcp_v4_init_sock(struct sock *sk)
2524{
2525 struct inet_connection_sock *icsk = inet_csk(sk);
2526
2527 tcp_init_sock(sk);
2528
2529 icsk->icsk_af_ops = &ipv4_specific;
2530
2531#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2532 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2533 sk->sk_destruct = tcp4_destruct_sock;
2534#endif
2535
2536 return 0;
2537}
2538
2539static void tcp_release_user_frags(struct sock *sk)
2540{
2541#ifdef CONFIG_PAGE_POOL
2542 unsigned long index;
2543 void *netmem;
2544
2545 xa_for_each(&sk->sk_user_frags, index, netmem)
2546 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2547#endif
2548}
2549
2550void tcp_v4_destroy_sock(struct sock *sk)
2551{
2552 struct tcp_sock *tp = tcp_sk(sk);
2553
2554 tcp_release_user_frags(sk);
2555
2556 xa_destroy(&sk->sk_user_frags);
2557
2558 trace_tcp_destroy_sock(sk);
2559
2560 tcp_clear_xmit_timers(sk);
2561
2562 tcp_cleanup_congestion_control(sk);
2563
2564 tcp_cleanup_ulp(sk);
2565
2566	/* Clean up the write buffer. */
2567 tcp_write_queue_purge(sk);
2568
2569 /* Check if we want to disable active TFO */
2570 tcp_fastopen_active_disable_ofo_check(sk);
2571
2572 /* Cleans up our, hopefully empty, out_of_order_queue. */
2573 skb_rbtree_purge(root: &tp->out_of_order_queue);
2574
2575 /* Clean up a referenced TCP bind bucket. */
2576 if (inet_csk(sk)->icsk_bind_hash)
2577 inet_put_port(sk);
2578
2579 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2580
2581 /* If socket is aborted during connect operation */
2582 tcp_free_fastopen_req(tp);
2583 tcp_fastopen_destroy_cipher(sk);
2584 tcp_saved_syn_free(tp);
2585
2586 sk_sockets_allocated_dec(sk);
2587}
2588EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2589
2590#ifdef CONFIG_PROC_FS
2591/* Proc filesystem TCP sock list dumping. */
2592
2593static unsigned short seq_file_family(const struct seq_file *seq);
2594
2595static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2596{
2597 unsigned short family = seq_file_family(seq);
2598
2599 /* AF_UNSPEC is used as a match all */
2600 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2601 net_eq(net1: sock_net(sk), net2: seq_file_net(seq)));
2602}
2603
2604/* Find a non-empty bucket (starting from st->bucket)
2605 * and return the first sk from it.
2606 */
2607static void *listening_get_first(struct seq_file *seq)
2608{
2609 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2610 struct tcp_iter_state *st = seq->private;
2611
2612 st->offset = 0;
2613 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2614 struct inet_listen_hashbucket *ilb2;
2615 struct hlist_nulls_node *node;
2616 struct sock *sk;
2617
2618 ilb2 = &hinfo->lhash2[st->bucket];
2619 if (hlist_nulls_empty(h: &ilb2->nulls_head))
2620 continue;
2621
2622 spin_lock(lock: &ilb2->lock);
2623 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2624 if (seq_sk_match(seq, sk))
2625 return sk;
2626 }
2627 spin_unlock(lock: &ilb2->lock);
2628 }
2629
2630 return NULL;
2631}
2632
2633/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2634 * If "cur" is the last one in the st->bucket,
2635 * call listening_get_first() to return the first sk of the next
2636 * non empty bucket.
2637 * non-empty bucket.
2638static void *listening_get_next(struct seq_file *seq, void *cur)
2639{
2640 struct tcp_iter_state *st = seq->private;
2641 struct inet_listen_hashbucket *ilb2;
2642 struct hlist_nulls_node *node;
2643 struct inet_hashinfo *hinfo;
2644 struct sock *sk = cur;
2645
2646 ++st->num;
2647 ++st->offset;
2648
2649 sk = sk_nulls_next(sk);
2650 sk_nulls_for_each_from(sk, node) {
2651 if (seq_sk_match(seq, sk))
2652 return sk;
2653 }
2654
2655 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2656 ilb2 = &hinfo->lhash2[st->bucket];
2657 spin_unlock(lock: &ilb2->lock);
2658 ++st->bucket;
2659 return listening_get_first(seq);
2660}
2661
2662static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2663{
2664 struct tcp_iter_state *st = seq->private;
2665 void *rc;
2666
2667 st->bucket = 0;
2668 st->offset = 0;
2669 rc = listening_get_first(seq);
2670
2671 while (rc && *pos) {
2672 rc = listening_get_next(seq, cur: rc);
2673 --*pos;
2674 }
2675 return rc;
2676}
2677
2678static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2679 const struct tcp_iter_state *st)
2680{
2681 return hlist_nulls_empty(h: &hinfo->ehash[st->bucket].chain);
2682}
2683
2684/*
2685 * Get first established socket starting from bucket given in st->bucket.
2686 * If st->bucket is zero, the very first socket in the hash is returned.
2687 */
2688static void *established_get_first(struct seq_file *seq)
2689{
2690 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2691 struct tcp_iter_state *st = seq->private;
2692
2693 st->offset = 0;
2694 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2695 struct sock *sk;
2696 struct hlist_nulls_node *node;
2697 spinlock_t *lock = inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket);
2698
2699 cond_resched();
2700
2701 /* Lockless fast path for the common case of empty buckets */
2702 if (empty_bucket(hinfo, st))
2703 continue;
2704
2705 spin_lock_bh(lock);
2706 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2707 if (seq_sk_match(seq, sk))
2708 return sk;
2709 }
2710 spin_unlock_bh(lock);
2711 }
2712
2713 return NULL;
2714}
2715
2716static void *established_get_next(struct seq_file *seq, void *cur)
2717{
2718 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2719 struct tcp_iter_state *st = seq->private;
2720 struct hlist_nulls_node *node;
2721 struct sock *sk = cur;
2722
2723 ++st->num;
2724 ++st->offset;
2725
2726 sk = sk_nulls_next(sk);
2727
2728 sk_nulls_for_each_from(sk, node) {
2729 if (seq_sk_match(seq, sk))
2730 return sk;
2731 }
2732
2733 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
2734 ++st->bucket;
2735 return established_get_first(seq);
2736}
2737
2738static void *established_get_idx(struct seq_file *seq, loff_t pos)
2739{
2740 struct tcp_iter_state *st = seq->private;
2741 void *rc;
2742
2743 st->bucket = 0;
2744 rc = established_get_first(seq);
2745
2746 while (rc && pos) {
2747 rc = established_get_next(seq, cur: rc);
2748 --pos;
2749 }
2750 return rc;
2751}
2752
2753static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2754{
2755 void *rc;
2756 struct tcp_iter_state *st = seq->private;
2757
2758 st->state = TCP_SEQ_STATE_LISTENING;
2759 rc = listening_get_idx(seq, pos: &pos);
2760
2761 if (!rc) {
2762 st->state = TCP_SEQ_STATE_ESTABLISHED;
2763 rc = established_get_idx(seq, pos);
2764 }
2765
2766 return rc;
2767}
2768
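/* Resume iteration at the bucket/offset recorded in st, so a seq_file
 * restart at the same *pos does not rescan the hash tables from the start.
 */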
2769static void *tcp_seek_last_pos(struct seq_file *seq)
2770{
2771 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2772 struct tcp_iter_state *st = seq->private;
2773 int bucket = st->bucket;
2774 int offset = st->offset;
2775 int orig_num = st->num;
2776 void *rc = NULL;
2777
2778 switch (st->state) {
2779 case TCP_SEQ_STATE_LISTENING:
2780 if (st->bucket > hinfo->lhash2_mask)
2781 break;
2782 rc = listening_get_first(seq);
2783 while (offset-- && rc && bucket == st->bucket)
2784 rc = listening_get_next(seq, cur: rc);
2785 if (rc)
2786 break;
2787 st->bucket = 0;
2788 st->state = TCP_SEQ_STATE_ESTABLISHED;
2789 fallthrough;
2790 case TCP_SEQ_STATE_ESTABLISHED:
2791 if (st->bucket > hinfo->ehash_mask)
2792 break;
2793 rc = established_get_first(seq);
2794 while (offset-- && rc && bucket == st->bucket)
2795 rc = established_get_next(seq, cur: rc);
2796 }
2797
2798 st->num = orig_num;
2799
2800 return rc;
2801}
2802
2803void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2804{
2805 struct tcp_iter_state *st = seq->private;
2806 void *rc;
2807
2808 if (*pos && *pos == st->last_pos) {
2809 rc = tcp_seek_last_pos(seq);
2810 if (rc)
2811 goto out;
2812 }
2813
2814 st->state = TCP_SEQ_STATE_LISTENING;
2815 st->num = 0;
2816 st->bucket = 0;
2817 st->offset = 0;
2818 rc = *pos ? tcp_get_idx(seq, pos: *pos - 1) : SEQ_START_TOKEN;
2819
2820out:
2821 st->last_pos = *pos;
2822 return rc;
2823}
2824EXPORT_IPV6_MOD(tcp_seq_start);
2825
2826void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2827{
2828 struct tcp_iter_state *st = seq->private;
2829 void *rc = NULL;
2830
2831 if (v == SEQ_START_TOKEN) {
2832 rc = tcp_get_idx(seq, pos: 0);
2833 goto out;
2834 }
2835
2836 switch (st->state) {
2837 case TCP_SEQ_STATE_LISTENING:
2838 rc = listening_get_next(seq, cur: v);
2839 if (!rc) {
2840 st->state = TCP_SEQ_STATE_ESTABLISHED;
2841 st->bucket = 0;
2842 st->offset = 0;
2843 rc = established_get_first(seq);
2844 }
2845 break;
2846 case TCP_SEQ_STATE_ESTABLISHED:
2847 rc = established_get_next(seq, cur: v);
2848 break;
2849 }
2850out:
2851 ++*pos;
2852 st->last_pos = *pos;
2853 return rc;
2854}
2855EXPORT_IPV6_MOD(tcp_seq_next);
2856
2857void tcp_seq_stop(struct seq_file *seq, void *v)
2858{
2859 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2860 struct tcp_iter_state *st = seq->private;
2861
2862 switch (st->state) {
2863 case TCP_SEQ_STATE_LISTENING:
2864 if (v != SEQ_START_TOKEN)
2865 spin_unlock(lock: &hinfo->lhash2[st->bucket].lock);
2866 break;
2867 case TCP_SEQ_STATE_ESTABLISHED:
2868 if (v)
2869 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
2870 break;
2871 }
2872}
2873EXPORT_IPV6_MOD(tcp_seq_stop);
2874
2875static void get_openreq4(const struct request_sock *req,
2876 struct seq_file *f, int i)
2877{
2878 const struct inet_request_sock *ireq = inet_rsk(sk: req);
2879 long delta = req->rsk_timer.expires - jiffies;
2880
2881 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X"
2882 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2883 i,
2884 ireq->ir_loc_addr,
2885 ireq->ir_num,
2886 ireq->ir_rmt_addr,
2887 ntohs(ireq->ir_rmt_port),
2888 TCP_SYN_RECV,
2889 0, 0, /* could print option size, but that is af dependent. */
2890 1, /* timers active (only the expire timer) */
2891 jiffies_delta_to_clock_t(delta),
2892 req->num_timeout,
2893 from_kuid_munged(to: seq_user_ns(seq: f),
2894 kuid: sk_uid(sk: req->rsk_listener)),
2895 0, /* non standard timer */
2896 0, /* open_requests have no inode */
2897 0,
2898 req);
2899}
2900
2901static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2902{
2903 int timer_active;
2904 unsigned long timer_expires;
2905 const struct tcp_sock *tp = tcp_sk(sk);
2906 const struct inet_connection_sock *icsk = inet_csk(sk);
2907 const struct inet_sock *inet = inet_sk(sk);
2908 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2909 __be32 dest = inet->inet_daddr;
2910 __be32 src = inet->inet_rcv_saddr;
2911 __u16 destp = ntohs(inet->inet_dport);
2912 __u16 srcp = ntohs(inet->inet_sport);
2913 u8 icsk_pending;
2914 int rx_queue;
2915 int state;
2916
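	/* timer_active encodes the pending timer for the "tr" column of
	 * /proc/net/tcp: 1 retransmit/loss probe, 2 keepalive,
	 * 4 zero window probe, 0 none.
	 */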
2917 icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2918 if (icsk_pending == ICSK_TIME_RETRANS ||
2919 icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2920 icsk_pending == ICSK_TIME_LOSS_PROBE) {
2921 timer_active = 1;
2922 timer_expires = icsk_timeout(icsk);
2923 } else if (icsk_pending == ICSK_TIME_PROBE0) {
2924 timer_active = 4;
2925 timer_expires = icsk_timeout(icsk);
2926 } else if (timer_pending(timer: &sk->sk_timer)) {
2927 timer_active = 2;
2928 timer_expires = sk->sk_timer.expires;
2929 } else {
2930 timer_active = 0;
2931 timer_expires = jiffies;
2932 }
2933
2934 state = inet_sk_state_load(sk);
2935 if (state == TCP_LISTEN)
2936 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2937 else
2938 /* Because we don't lock the socket,
2939 * we might find a transient negative value.
2940 */
2941 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2942 READ_ONCE(tp->copied_seq), 0);
2943
2944 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2945 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2946 i, src, srcp, dest, destp, state,
2947 READ_ONCE(tp->write_seq) - tp->snd_una,
2948 rx_queue,
2949 timer_active,
2950 jiffies_delta_to_clock_t(delta: timer_expires - jiffies),
2951 READ_ONCE(icsk->icsk_retransmits),
2952 from_kuid_munged(to: seq_user_ns(seq: f), kuid: sk_uid(sk)),
2953 READ_ONCE(icsk->icsk_probes_out),
2954 sock_i_ino(sk),
2955 refcount_read(r: &sk->sk_refcnt), sk,
2956 jiffies_to_clock_t(x: icsk->icsk_rto),
2957 jiffies_to_clock_t(x: icsk->icsk_ack.ato),
2958 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2959 tcp_snd_cwnd(tp),
2960 state == TCP_LISTEN ?
2961 fastopenq->max_qlen :
2962 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2963}
2964
2965static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2966 struct seq_file *f, int i)
2967{
2968 long delta = tw->tw_timer.expires - jiffies;
2969 __be32 dest, src;
2970 __u16 destp, srcp;
2971
2972 dest = tw->tw_daddr;
2973 src = tw->tw_rcv_saddr;
2974 destp = ntohs(tw->tw_dport);
2975 srcp = ntohs(tw->tw_sport);
2976
2977 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X"
2978 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2979 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2980 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2981 refcount_read(r: &tw->tw_refcnt), tw);
2982}
2983
2984#define TMPSZ 150
2985
2986static int tcp4_seq_show(struct seq_file *seq, void *v)
2987{
2988 struct tcp_iter_state *st;
2989 struct sock *sk = v;
2990
2991 seq_setwidth(m: seq, TMPSZ - 1);
2992 if (v == SEQ_START_TOKEN) {
2993 seq_puts(m: seq, s: " sl local_address rem_address st tx_queue "
2994 "rx_queue tr tm->when retrnsmt uid timeout "
2995 "inode");
2996 goto out;
2997 }
2998 st = seq->private;
2999
3000 if (sk->sk_state == TCP_TIME_WAIT)
3001 get_timewait4_sock(tw: v, f: seq, i: st->num);
3002 else if (sk->sk_state == TCP_NEW_SYN_RECV)
3003 get_openreq4(req: v, f: seq, i: st->num);
3004 else
3005 get_tcp4_sock(sk: v, f: seq, i: st->num);
3006out:
3007 seq_pad(m: seq, c: '\n');
3008 return 0;
3009}
3010
3011#ifdef CONFIG_BPF_SYSCALL
3012union bpf_tcp_iter_batch_item {
3013 struct sock *sk;
3014 __u64 cookie;
3015};
3016
3017struct bpf_tcp_iter_state {
3018 struct tcp_iter_state state;
3019 unsigned int cur_sk;
3020 unsigned int end_sk;
3021 unsigned int max_sk;
3022 union bpf_tcp_iter_batch_item *batch;
3023};
3024
3025struct bpf_iter__tcp {
3026 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3027 __bpf_md_ptr(struct sock_common *, sk_common);
3028 uid_t uid __aligned(8);
3029};
3030
3031static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3032 struct sock_common *sk_common, uid_t uid)
3033{
3034 struct bpf_iter__tcp ctx;
3035
3036 meta->seq_num--; /* skip SEQ_START_TOKEN */
3037 ctx.meta = meta;
3038 ctx.sk_common = sk_common;
3039 ctx.uid = uid;
3040 return bpf_iter_run_prog(prog, &ctx);
3041}
3042
3043static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3044{
3045 union bpf_tcp_iter_batch_item *item;
3046 unsigned int cur_sk = iter->cur_sk;
3047 __u64 cookie;
3048
3049 /* Remember the cookies of the sockets we haven't seen yet, so we can
3050 * pick up where we left off next time around.
3051 */
3052 while (cur_sk < iter->end_sk) {
3053 item = &iter->batch[cur_sk++];
3054 cookie = sock_gen_cookie(item->sk);
3055 sock_gen_put(item->sk);
3056 item->cookie = cookie;
3057 }
3058}
3059
3060static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3061 unsigned int new_batch_sz, gfp_t flags)
3062{
3063 union bpf_tcp_iter_batch_item *new_batch;
3064
3065 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3066 flags | __GFP_NOWARN);
3067 if (!new_batch)
3068 return -ENOMEM;
3069
3070 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3071 kvfree(iter->batch);
3072 iter->batch = new_batch;
3073 iter->max_sk = new_batch_sz;
3074
3075 return 0;
3076}
3077
3078static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3079 union bpf_tcp_iter_batch_item *cookies,
3080 int n_cookies)
3081{
3082 struct hlist_nulls_node *node;
3083 struct sock *sk;
3084 int i;
3085
3086 for (i = 0; i < n_cookies; i++) {
3087 sk = first_sk;
3088 sk_nulls_for_each_from(sk, node)
3089 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3090 return sk;
3091 }
3092
3093 return NULL;
3094}
3095
3096static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3097{
3098 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3099 struct bpf_tcp_iter_state *iter = seq->private;
3100 struct tcp_iter_state *st = &iter->state;
3101 unsigned int find_cookie = iter->cur_sk;
3102 unsigned int end_cookie = iter->end_sk;
3103 int resume_bucket = st->bucket;
3104 struct sock *sk;
3105
3106 if (end_cookie && find_cookie == end_cookie)
3107 ++st->bucket;
3108
3109 sk = listening_get_first(seq);
3110 iter->cur_sk = 0;
3111 iter->end_sk = 0;
3112
3113 if (sk && st->bucket == resume_bucket && end_cookie) {
3114 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3115 end_cookie - find_cookie);
3116 if (!sk) {
3117 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3118 ++st->bucket;
3119 sk = listening_get_first(seq);
3120 }
3121 }
3122
3123 return sk;
3124}
3125
3126static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3127{
3128 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3129 struct bpf_tcp_iter_state *iter = seq->private;
3130 struct tcp_iter_state *st = &iter->state;
3131 unsigned int find_cookie = iter->cur_sk;
3132 unsigned int end_cookie = iter->end_sk;
3133 int resume_bucket = st->bucket;
3134 struct sock *sk;
3135
3136 if (end_cookie && find_cookie == end_cookie)
3137 ++st->bucket;
3138
3139 sk = established_get_first(seq);
3140 iter->cur_sk = 0;
3141 iter->end_sk = 0;
3142
3143 if (sk && st->bucket == resume_bucket && end_cookie) {
3144 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3145 end_cookie - find_cookie);
3146 if (!sk) {
3147 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3148 ++st->bucket;
3149 sk = established_get_first(seq);
3150 }
3151 }
3152
3153 return sk;
3154}
3155
3156static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3157{
3158 struct bpf_tcp_iter_state *iter = seq->private;
3159 struct tcp_iter_state *st = &iter->state;
3160 struct sock *sk = NULL;
3161
3162 switch (st->state) {
3163 case TCP_SEQ_STATE_LISTENING:
3164 sk = bpf_iter_tcp_resume_listening(seq);
3165 if (sk)
3166 break;
3167 st->bucket = 0;
3168 st->state = TCP_SEQ_STATE_ESTABLISHED;
3169 fallthrough;
3170 case TCP_SEQ_STATE_ESTABLISHED:
3171 sk = bpf_iter_tcp_resume_established(seq);
3172 break;
3173 }
3174
3175 return sk;
3176}
3177
3178static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3179 struct sock **start_sk)
3180{
3181 struct bpf_tcp_iter_state *iter = seq->private;
3182 struct hlist_nulls_node *node;
3183 unsigned int expected = 1;
3184 struct sock *sk;
3185
3186 sock_hold(*start_sk);
3187 iter->batch[iter->end_sk++].sk = *start_sk;
3188
3189 sk = sk_nulls_next(*start_sk);
3190 *start_sk = NULL;
3191 sk_nulls_for_each_from(sk, node) {
3192 if (seq_sk_match(seq, sk)) {
3193 if (iter->end_sk < iter->max_sk) {
3194 sock_hold(sk);
3195 iter->batch[iter->end_sk++].sk = sk;
3196 } else if (!*start_sk) {
3197 /* Remember where we left off. */
3198 *start_sk = sk;
3199 }
3200 expected++;
3201 }
3202 }
3203
3204 return expected;
3205}
3206
3207static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3208 struct sock **start_sk)
3209{
3210 struct bpf_tcp_iter_state *iter = seq->private;
3211 struct hlist_nulls_node *node;
3212 unsigned int expected = 1;
3213 struct sock *sk;
3214
3215 sock_hold(*start_sk);
3216 iter->batch[iter->end_sk++].sk = *start_sk;
3217
3218 sk = sk_nulls_next(*start_sk);
3219 *start_sk = NULL;
3220 sk_nulls_for_each_from(sk, node) {
3221 if (seq_sk_match(seq, sk)) {
3222 if (iter->end_sk < iter->max_sk) {
3223 sock_hold(sk);
3224 iter->batch[iter->end_sk++].sk = sk;
3225 } else if (!*start_sk) {
3226 /* Remember where we left off. */
3227 *start_sk = sk;
3228 }
3229 expected++;
3230 }
3231 }
3232
3233 return expected;
3234}
3235
3236static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3237 struct sock **start_sk)
3238{
3239 struct bpf_tcp_iter_state *iter = seq->private;
3240 struct tcp_iter_state *st = &iter->state;
3241
3242 if (st->state == TCP_SEQ_STATE_LISTENING)
3243 return bpf_iter_tcp_listening_batch(seq, start_sk);
3244 else
3245 return bpf_iter_tcp_established_batch(seq, start_sk);
3246}
3247
3248static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3249{
3250 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3251 struct bpf_tcp_iter_state *iter = seq->private;
3252 struct tcp_iter_state *st = &iter->state;
3253
3254 if (st->state == TCP_SEQ_STATE_LISTENING)
3255 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3256 else
3257 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3258}
3259
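/* Pull the remaining sockets of the current bucket into iter->batch while
 * holding the bucket lock, growing the batch if needed, then drop the lock
 * so that seq_show() may sleep (e.g. in lock_sock()) while walking it.
 */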
3260static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3261{
3262 struct bpf_tcp_iter_state *iter = seq->private;
3263 unsigned int expected;
3264 struct sock *sk;
3265 int err;
3266
3267 sk = bpf_iter_tcp_resume(seq);
3268 if (!sk)
3269 return NULL; /* Done */
3270
3271 expected = bpf_iter_fill_batch(seq, &sk);
3272 if (likely(iter->end_sk == expected))
3273 goto done;
3274
3275 /* Batch size was too small. */
3276 bpf_iter_tcp_unlock_bucket(seq);
3277 bpf_iter_tcp_put_batch(iter);
3278 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3279 GFP_USER);
3280 if (err)
3281 return ERR_PTR(err);
3282
3283 sk = bpf_iter_tcp_resume(seq);
3284 if (!sk)
3285 return NULL; /* Done */
3286
3287 expected = bpf_iter_fill_batch(seq, &sk);
3288 if (likely(iter->end_sk == expected))
3289 goto done;
3290
3291 /* Batch size was still too small. Hold onto the lock while we try
3292 * again with a larger batch to make sure the current bucket's size
3293 * does not change in the meantime.
3294 */
3295 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3296 if (err) {
3297 bpf_iter_tcp_unlock_bucket(seq);
3298 return ERR_PTR(err);
3299 }
3300
3301 expected = bpf_iter_fill_batch(seq, &sk);
3302 WARN_ON_ONCE(iter->end_sk != expected);
3303done:
3304 bpf_iter_tcp_unlock_bucket(seq);
3305 return iter->batch[0].sk;
3306}
3307
3308static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3309{
3310 /* bpf iter does not support lseek, so it always
3311	 * continues from where it was stop()-ped.
3312 */
3313 if (*pos)
3314 return bpf_iter_tcp_batch(seq);
3315
3316 return SEQ_START_TOKEN;
3317}
3318
3319static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3320{
3321 struct bpf_tcp_iter_state *iter = seq->private;
3322 struct tcp_iter_state *st = &iter->state;
3323 struct sock *sk;
3324
3325	/* Whenever seq_next() is called, seq_show() has finished with the
3326	 * sk at iter->cur_sk, so advance to the next sk in
3327	 * the batch.
3328 */
3329 if (iter->cur_sk < iter->end_sk) {
3330 /* Keeping st->num consistent in tcp_iter_state.
3331 * bpf_iter_tcp does not use st->num.
3332 * meta.seq_num is used instead.
3333 */
3334 st->num++;
3335 sock_gen_put(iter->batch[iter->cur_sk++].sk);
3336 }
3337
3338 if (iter->cur_sk < iter->end_sk)
3339 sk = iter->batch[iter->cur_sk].sk;
3340 else
3341 sk = bpf_iter_tcp_batch(seq);
3342
3343 ++*pos;
3344 /* Keeping st->last_pos consistent in tcp_iter_state.
3345	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3346 */
3347 st->last_pos = *pos;
3348 return sk;
3349}
3350
3351static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3352{
3353 struct bpf_iter_meta meta;
3354 struct bpf_prog *prog;
3355 struct sock *sk = v;
3356 uid_t uid;
3357 int ret;
3358
3359 if (v == SEQ_START_TOKEN)
3360 return 0;
3361
3362 if (sk_fullsock(sk))
3363 lock_sock(sk);
3364
3365 if (unlikely(sk_unhashed(sk))) {
3366 ret = SEQ_SKIP;
3367 goto unlock;
3368 }
3369
3370 if (sk->sk_state == TCP_TIME_WAIT) {
3371 uid = 0;
3372 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3373 const struct request_sock *req = v;
3374
3375 uid = from_kuid_munged(seq_user_ns(seq),
3376 sk_uid(req->rsk_listener));
3377 } else {
3378 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3379 }
3380
3381 meta.seq = seq;
3382 prog = bpf_iter_get_info(&meta, false);
3383 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3384
3385unlock:
3386 if (sk_fullsock(sk))
3387 release_sock(sk);
3388 return ret;
3389
3390}
3391
3392static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3393{
3394 struct bpf_tcp_iter_state *iter = seq->private;
3395 struct bpf_iter_meta meta;
3396 struct bpf_prog *prog;
3397
3398 if (!v) {
3399 meta.seq = seq;
3400 prog = bpf_iter_get_info(&meta, true);
3401 if (prog)
3402 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3403 }
3404
3405 if (iter->cur_sk < iter->end_sk)
3406 bpf_iter_tcp_put_batch(iter);
3407}
3408
3409static const struct seq_operations bpf_iter_tcp_seq_ops = {
3410 .show = bpf_iter_tcp_seq_show,
3411 .start = bpf_iter_tcp_seq_start,
3412 .next = bpf_iter_tcp_seq_next,
3413 .stop = bpf_iter_tcp_seq_stop,
3414};
3415#endif
3416static unsigned short seq_file_family(const struct seq_file *seq)
3417{
3418 const struct tcp_seq_afinfo *afinfo;
3419
3420#ifdef CONFIG_BPF_SYSCALL
3421	/* Iterated from bpf_iter. Let the bpf prog filter instead. */
3422 if (seq->op == &bpf_iter_tcp_seq_ops)
3423 return AF_UNSPEC;
3424#endif
3425
3426 /* Iterated from proc fs */
3427 afinfo = pde_data(inode: file_inode(f: seq->file));
3428 return afinfo->family;
3429}
3430
3431static const struct seq_operations tcp4_seq_ops = {
3432 .show = tcp4_seq_show,
3433 .start = tcp_seq_start,
3434 .next = tcp_seq_next,
3435 .stop = tcp_seq_stop,
3436};
3437
3438static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3439 .family = AF_INET,
3440};
3441
3442static int __net_init tcp4_proc_init_net(struct net *net)
3443{
3444 if (!proc_create_net_data(name: "tcp", mode: 0444, parent: net->proc_net, ops: &tcp4_seq_ops,
3445 state_size: sizeof(struct tcp_iter_state), data: &tcp4_seq_afinfo))
3446 return -ENOMEM;
3447 return 0;
3448}
3449
3450static void __net_exit tcp4_proc_exit_net(struct net *net)
3451{
3452 remove_proc_entry("tcp", net->proc_net);
3453}
3454
3455static struct pernet_operations tcp4_net_ops = {
3456 .init = tcp4_proc_init_net,
3457 .exit = tcp4_proc_exit_net,
3458};
3459
3460int __init tcp4_proc_init(void)
3461{
3462 return register_pernet_subsys(&tcp4_net_ops);
3463}
3464
3465void tcp4_proc_exit(void)
3466{
3467 unregister_pernet_subsys(&tcp4_net_ops);
3468}
3469#endif /* CONFIG_PROC_FS */
3470
3471/* @wake is one when sk_stream_write_space() calls us.
3472 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3473 * This mimics the strategy used in sock_def_write_space().
3474 */
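/* With wake == 1 the test below reads (notsent_bytes * 2) < notsent_lowat,
 * i.e. EPOLLOUT is signalled only once the queued-but-unsent data drops
 * below half of tcp_notsent_lowat().
 */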
3475bool tcp_stream_memory_free(const struct sock *sk, int wake)
3476{
3477 const struct tcp_sock *tp = tcp_sk(sk);
3478 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3479 READ_ONCE(tp->snd_nxt);
3480
3481 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3482}
3483EXPORT_SYMBOL(tcp_stream_memory_free);
3484
3485struct proto tcp_prot = {
3486 .name = "TCP",
3487 .owner = THIS_MODULE,
3488 .close = tcp_close,
3489 .pre_connect = tcp_v4_pre_connect,
3490 .connect = tcp_v4_connect,
3491 .disconnect = tcp_disconnect,
3492 .accept = inet_csk_accept,
3493 .ioctl = tcp_ioctl,
3494 .init = tcp_v4_init_sock,
3495 .destroy = tcp_v4_destroy_sock,
3496 .shutdown = tcp_shutdown,
3497 .setsockopt = tcp_setsockopt,
3498 .getsockopt = tcp_getsockopt,
3499 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3500 .keepalive = tcp_set_keepalive,
3501 .recvmsg = tcp_recvmsg,
3502 .sendmsg = tcp_sendmsg,
3503 .splice_eof = tcp_splice_eof,
3504 .backlog_rcv = tcp_v4_do_rcv,
3505 .release_cb = tcp_release_cb,
3506 .hash = inet_hash,
3507 .unhash = inet_unhash,
3508 .get_port = inet_csk_get_port,
3509 .put_port = inet_put_port,
3510#ifdef CONFIG_BPF_SYSCALL
3511 .psock_update_sk_prot = tcp_bpf_update_proto,
3512#endif
3513 .enter_memory_pressure = tcp_enter_memory_pressure,
3514 .leave_memory_pressure = tcp_leave_memory_pressure,
3515 .stream_memory_free = tcp_stream_memory_free,
3516 .sockets_allocated = &tcp_sockets_allocated,
3517
3518 .memory_allocated = &net_aligned_data.tcp_memory_allocated,
3519 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3520
3521 .memory_pressure = &tcp_memory_pressure,
3522 .sysctl_mem = sysctl_tcp_mem,
3523 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3524 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3525 .max_header = MAX_TCP_HEADER,
3526 .obj_size = sizeof(struct tcp_sock),
3527 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3528 .twsk_prot = &tcp_timewait_sock_ops,
3529 .rsk_prot = &tcp_request_sock_ops,
3530 .h.hashinfo = NULL,
3531 .no_autobind = true,
3532 .diag_destroy = tcp_abort,
3533};
3534EXPORT_SYMBOL(tcp_prot);
3535
3536static void __net_exit tcp_sk_exit(struct net *net)
3537{
3538 if (net->ipv4.tcp_congestion_control)
3539 bpf_module_put(data: net->ipv4.tcp_congestion_control,
3540 owner: net->ipv4.tcp_congestion_control->owner);
3541}
3542
3543static void __net_init tcp_set_hashinfo(struct net *net)
3544{
3545 struct inet_hashinfo *hinfo;
3546 unsigned int ehash_entries;
3547 struct net *old_net;
3548
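	/* A child netns may get its own ehash, sized by the creating netns'
	 * tcp_child_ehash_entries sysctl; otherwise it falls back to the
	 * global tcp_hashinfo shared with init_net.
	 */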
3549 if (net_eq(net1: net, net2: &init_net))
3550 goto fallback;
3551
3552 old_net = current->nsproxy->net_ns;
3553 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3554 if (!ehash_entries)
3555 goto fallback;
3556
3557 ehash_entries = roundup_pow_of_two(ehash_entries);
3558 hinfo = inet_pernet_hashinfo_alloc(hashinfo: &tcp_hashinfo, ehash_entries);
3559 if (!hinfo) {
3560 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3561 "for a netns, fallback to the global one\n",
3562 ehash_entries);
3563fallback:
3564 hinfo = &tcp_hashinfo;
3565 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3566 }
3567
3568 net->ipv4.tcp_death_row.hashinfo = hinfo;
3569 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3570 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3571}
3572
3573static int __net_init tcp_sk_init(struct net *net)
3574{
3575 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3576 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3577 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3578 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3579
3580 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3581 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3582 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3583 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3584 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3585
3586 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3587 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3588 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3589
3590 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3591 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3592 net->ipv4.sysctl_tcp_syncookies = 1;
3593 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3594 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3595 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3596 net->ipv4.sysctl_tcp_orphan_retries = 0;
3597 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3598 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3599 net->ipv4.sysctl_tcp_tw_reuse = 2;
3600 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3601 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3602
3603 refcount_set(r: &net->ipv4.tcp_death_row.tw_refcount, n: 1);
3604 tcp_set_hashinfo(net);
3605
3606 net->ipv4.sysctl_tcp_sack = 1;
3607 net->ipv4.sysctl_tcp_window_scaling = 1;
3608 net->ipv4.sysctl_tcp_timestamps = 1;
3609 net->ipv4.sysctl_tcp_early_retrans = 3;
3610 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3611 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3612 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3613 net->ipv4.sysctl_tcp_max_reordering = 300;
3614 net->ipv4.sysctl_tcp_dsack = 1;
3615 net->ipv4.sysctl_tcp_app_win = 31;
3616 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3617 net->ipv4.sysctl_tcp_frto = 2;
3618 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3619 /* This limits the percentage of the congestion window which we
3620 * will allow a single TSO frame to consume. Building TSO frames
3621 * which are too large can cause TCP streams to be bursty.
3622 */
3623 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3624 /* Default TSQ limit of 4 MB */
3625 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3626
3627 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3628 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3629
3630 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3631 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3632 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3633 net->ipv4.sysctl_tcp_autocorking = 1;
3634 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3635 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3636 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3637 if (net != &init_net) {
3638 memcpy(to: net->ipv4.sysctl_tcp_rmem,
3639 from: init_net.ipv4.sysctl_tcp_rmem,
3640 len: sizeof(init_net.ipv4.sysctl_tcp_rmem));
3641 memcpy(to: net->ipv4.sysctl_tcp_wmem,
3642 from: init_net.ipv4.sysctl_tcp_wmem,
3643 len: sizeof(init_net.ipv4.sysctl_tcp_wmem));
3644 }
3645 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3646 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3647 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3648 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3649 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3650 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3651 atomic_set(v: &net->ipv4.tfo_active_disable_times, i: 0);
3652
3653 /* Set default values for PLB */
3654 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3655 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3656 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3657 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3658 /* Default congestion threshold for PLB to mark a round is 50% */
3659 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3660
3661 /* Reno is always built in */
3662 if (!net_eq(net1: net, net2: &init_net) &&
3663 bpf_try_module_get(data: init_net.ipv4.tcp_congestion_control,
3664 owner: init_net.ipv4.tcp_congestion_control->owner))
3665 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3666 else
3667 net->ipv4.tcp_congestion_control = &tcp_reno;
3668
3669 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3670 net->ipv4.sysctl_tcp_shrink_window = 0;
3671
3672 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3673 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3674 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3675
3676 return 0;
3677}
3678
3679static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3680{
3681 struct net *net;
3682
3683 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3684 * and failed setup_net error unwinding path are serialized.
3685 *
3686	 * Since tcp_twsk_purge() handles twsk in any dead netns, not just those in
3687	 * net_exit_list, the thread that dismantles a particular twsk must
3688	 * do so without another thread progressing to refcount_dec_and_test() of
3689	 * tcp_death_row.tw_refcount.
3690 */
3691 mutex_lock(lock: &tcp_exit_batch_mutex);
3692
3693 tcp_twsk_purge(net_exit_list);
3694
3695 list_for_each_entry(net, net_exit_list, exit_list) {
3696 inet_pernet_hashinfo_free(hashinfo: net->ipv4.tcp_death_row.hashinfo);
3697 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3698 tcp_fastopen_ctx_destroy(net);
3699 }
3700
3701 mutex_unlock(lock: &tcp_exit_batch_mutex);
3702}
3703
3704static struct pernet_operations __net_initdata tcp_sk_ops = {
3705 .init = tcp_sk_init,
3706 .exit = tcp_sk_exit,
3707 .exit_batch = tcp_sk_exit_batch,
3708};
3709
3710#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3711DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3712 struct sock_common *sk_common, uid_t uid)
3713
3714#define INIT_BATCH_SZ 16
3715
3716static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3717{
3718 struct bpf_tcp_iter_state *iter = priv_data;
3719 int err;
3720
3721 err = bpf_iter_init_seq_net(priv_data, aux);
3722 if (err)
3723 return err;
3724
3725 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3726 if (err) {
3727 bpf_iter_fini_seq_net(priv_data);
3728 return err;
3729 }
3730
3731 return 0;
3732}
3733
3734static void bpf_iter_fini_tcp(void *priv_data)
3735{
3736 struct bpf_tcp_iter_state *iter = priv_data;
3737
3738 bpf_iter_fini_seq_net(priv_data);
3739 kvfree(iter->batch);
3740}
3741
3742static const struct bpf_iter_seq_info tcp_seq_info = {
3743 .seq_ops = &bpf_iter_tcp_seq_ops,
3744 .init_seq_private = bpf_iter_init_tcp,
3745 .fini_seq_private = bpf_iter_fini_tcp,
3746 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3747};
3748
3749static const struct bpf_func_proto *
3750bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3751 const struct bpf_prog *prog)
3752{
3753 switch (func_id) {
3754 case BPF_FUNC_setsockopt:
3755 return &bpf_sk_setsockopt_proto;
3756 case BPF_FUNC_getsockopt:
3757 return &bpf_sk_getsockopt_proto;
3758 default:
3759 return NULL;
3760 }
3761}
3762
3763static struct bpf_iter_reg tcp_reg_info = {
3764 .target = "tcp",
3765 .ctx_arg_info_size = 1,
3766 .ctx_arg_info = {
3767 { offsetof(struct bpf_iter__tcp, sk_common),
3768 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3769 },
3770 .get_func_proto = bpf_iter_tcp_get_func_proto,
3771 .seq_info = &tcp_seq_info,
3772};
3773
3774static void __init bpf_iter_register(void)
3775{
3776 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3777 if (bpf_iter_reg_target(&tcp_reg_info))
3778 pr_warn("Warning: could not register bpf iterator tcp\n");
3779}
3780
3781#endif
3782
3783void __init tcp_v4_init(void)
3784{
3785 int cpu, res;
3786
3787 for_each_possible_cpu(cpu) {
3788 struct sock *sk;
3789
3790 res = inet_ctl_sock_create(sk: &sk, PF_INET, type: SOCK_RAW,
3791 IPPROTO_TCP, net: &init_net);
3792 if (res)
3793 panic(fmt: "Failed to create the TCP control socket.\n");
3794 sock_set_flag(sk, flag: SOCK_USE_WRITE_QUEUE);
3795
3796 /* Please enforce IP_DF and IPID==0 for RST and
3797 * ACK sent in SYN-RECV and TIME-WAIT state.
3798 */
3799 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3800
3801 sk->sk_clockid = CLOCK_MONOTONIC;
3802
3803 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3804 }
3805 if (register_pernet_subsys(&tcp_sk_ops))
3806 panic(fmt: "Failed to create the TCP control socket.\n");
3807
3808#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3809 bpf_iter_register();
3810#endif
3811}
3812