1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support the IPV6_V6ONLY socket option,
 *	Alexey Kuznetsov		which allows both IPv4 and IPv6 sockets
 *					to bind to a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60#include <linux/sched.h>
61#include <linux/sock_diag.h>
62
63#include <net/aligned_data.h>
64#include <net/net_namespace.h>
65#include <net/icmp.h>
66#include <net/inet_hashtables.h>
67#include <net/tcp.h>
68#include <net/tcp_ecn.h>
69#include <net/transp_v6.h>
70#include <net/ipv6.h>
71#include <net/inet_common.h>
72#include <net/inet_ecn.h>
73#include <net/timewait_sock.h>
74#include <net/xfrm.h>
75#include <net/secure_seq.h>
76#include <net/busy_poll.h>
77#include <net/rstreason.h>
78#include <net/psp.h>
79
80#include <linux/inet.h>
81#include <linux/ipv6.h>
82#include <linux/stddef.h>
83#include <linux/proc_fs.h>
84#include <linux/seq_file.h>
85#include <linux/inetdevice.h>
86#include <linux/btf_ids.h>
87#include <linux/skbuff_ref.h>
88
89#include <crypto/hash.h>
90#include <linux/scatterlist.h>
91
92#include <trace/events/tcp.h>
93
94#ifdef CONFIG_TCP_MD5SIG
95static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 __be32 daddr, __be32 saddr, const struct tcphdr *th);
97#endif
98
99struct inet_hashinfo tcp_hashinfo;
100
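/* Per-CPU kernel control sockets, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to emit stateless replies (RSTs and ACKs sent outside
 * full socket context). The nested-BH local lock serializes softirq users
 * of the per-CPU socket, which matters notably on PREEMPT_RT.
 */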
101static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
102 .bh_lock = INIT_LOCAL_LOCK(bh_lock),
103};
104
105static DEFINE_MUTEX(tcp_exit_batch_mutex);
106
107static u32 tcp_v4_init_seq(const struct sk_buff *skb)
108{
109 return secure_tcp_seq(saddr: ip_hdr(skb)->daddr,
110 daddr: ip_hdr(skb)->saddr,
111 sport: tcp_hdr(skb)->dest,
112 dport: tcp_hdr(skb)->source);
113}
114
115static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
116{
117 return secure_tcp_ts_off(net, saddr: ip_hdr(skb)->daddr, daddr: ip_hdr(skb)->saddr);
118}
119
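/* Decide whether a TIME-WAIT socket occupying the 4-tuple wanted for a new
 * outgoing connection may be reused. Returns 1 if the caller may proceed,
 * in which case a reference on the timewait socket has been taken and, in
 * the non-repair case, write_seq and the timestamp state have been seeded
 * from it; returns 0 otherwise.
 */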
120int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
121{
122 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
123 const struct inet_timewait_sock *tw = inet_twsk(sk: sktw);
124 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk: sktw);
125 struct tcp_sock *tp = tcp_sk(sk);
126 int ts_recent_stamp;
127 u32 reuse_thresh;
128
129 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
130 reuse = 0;
131
132 if (reuse == 2) {
133 /* Still does not detect *everything* that goes through
134 * lo, since we require a loopback src or dst address
135 * or direct binding to 'lo' interface.
136 */
137 bool loopback = false;
138 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
139 loopback = true;
140#if IS_ENABLED(CONFIG_IPV6)
141 if (tw->tw_family == AF_INET6) {
142 if (ipv6_addr_loopback(a: &tw->tw_v6_daddr) ||
143 ipv6_addr_v4mapped_loopback(a: &tw->tw_v6_daddr) ||
144 ipv6_addr_loopback(a: &tw->tw_v6_rcv_saddr) ||
145 ipv6_addr_v4mapped_loopback(a: &tw->tw_v6_rcv_saddr))
146 loopback = true;
147 } else
148#endif
149 {
150 if (ipv4_is_loopback(addr: tw->tw_daddr) ||
151 ipv4_is_loopback(addr: tw->tw_rcv_saddr))
152 loopback = true;
153 }
154 if (!loopback)
155 reuse = 0;
156 }
157
	/* With PAWS, it is safe from the viewpoint of data integrity.
	   Even without PAWS it is safe provided sequence spaces do not
	   overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's: the timestamp cache is held
	   not per host but per port pair, and the TW bucket is used as the
	   state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
169 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
170 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
171 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
172 if (ts_recent_stamp &&
173 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
174 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
175 * and releasing the bucket lock.
176 */
177 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
178 return 0;
179
180 /* In case of repair and re-using TIME-WAIT sockets we still
181 * want to be sure that it is safe as above but honor the
182 * sequence numbers and time stamps set as part of the repair
183 * process.
184 *
185 * Without this check re-using a TIME-WAIT socket with TCP
186 * repair would accumulate a -1 on the repair assigned
187 * sequence number. The first time it is reused the sequence
188 * is -1, the second time -2, etc. This fixes that issue
189 * without appearing to create any others.
190 */
191 if (likely(!tp->repair)) {
192 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
193
194 if (!seq)
195 seq = 1;
196 WRITE_ONCE(tp->write_seq, seq);
197 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
198 tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
199 }
200
201 return 1;
202 }
203
204 return 0;
205}
206EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
207
208static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
209 int addr_len)
210{
	/* This check is replicated from tcp_v4_connect() and is intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bound specified by the user in addr_len.
	 */
215 if (addr_len < sizeof(struct sockaddr_in))
216 return -EINVAL;
217
218 sock_owned_by_me(sk);
219
220 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
221}
222
223/* This will initiate an outgoing connection. */
224int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
225{
226 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
227 struct inet_timewait_death_row *tcp_death_row;
228 struct inet_sock *inet = inet_sk(sk);
229 struct tcp_sock *tp = tcp_sk(sk);
230 struct ip_options_rcu *inet_opt;
231 struct net *net = sock_net(sk);
232 __be16 orig_sport, orig_dport;
233 __be32 daddr, nexthop;
234 struct flowi4 *fl4;
235 struct rtable *rt;
236 int err;
237
238 if (addr_len < sizeof(struct sockaddr_in))
239 return -EINVAL;
240
241 if (usin->sin_family != AF_INET)
242 return -EAFNOSUPPORT;
243
244 nexthop = daddr = usin->sin_addr.s_addr;
245 inet_opt = rcu_dereference_protected(inet->inet_opt,
246 lockdep_sock_is_held(sk));
247 if (inet_opt && inet_opt->opt.srr) {
248 if (!daddr)
249 return -EINVAL;
250 nexthop = inet_opt->opt.faddr;
251 }
252
253 orig_sport = inet->inet_sport;
254 orig_dport = usin->sin_port;
255 fl4 = &inet->cork.fl.u.ip4;
256 rt = ip_route_connect(fl4, dst: nexthop, src: inet->inet_saddr,
257 oif: sk->sk_bound_dev_if, IPPROTO_TCP, sport: orig_sport,
258 dport: orig_dport, sk);
259 if (IS_ERR(ptr: rt)) {
260 err = PTR_ERR(ptr: rt);
261 if (err == -ENETUNREACH)
262 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
263 return err;
264 }
265
266 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
267 ip_rt_put(rt);
268 return -ENETUNREACH;
269 }
270
271 if (!inet_opt || !inet_opt->opt.srr)
272 daddr = fl4->daddr;
273
274 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
275
276 if (!inet->inet_saddr) {
277 err = inet_bhash2_update_saddr(sk, saddr: &fl4->saddr, AF_INET);
278 if (err) {
279 ip_rt_put(rt);
280 return err;
281 }
282 } else {
283 sk_rcv_saddr_set(sk, addr: inet->inet_saddr);
284 }
285
286 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
287 /* Reset inherited state */
288 tp->rx_opt.ts_recent = 0;
289 tp->rx_opt.ts_recent_stamp = 0;
290 if (likely(!tp->repair))
291 WRITE_ONCE(tp->write_seq, 0);
292 }
293
294 inet->inet_dport = usin->sin_port;
295 sk_daddr_set(sk, addr: daddr);
296
297 inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
298 if (inet_opt)
299 inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
300
301 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
302
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the
	 * hash tables and complete initialization after this.
	 */
308 tcp_set_state(sk, state: TCP_SYN_SENT);
309 err = inet_hash_connect(death_row: tcp_death_row, sk);
310 if (err)
311 goto failure;
312
313 sk_set_txhash(sk);
314
315 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
316 sport: inet->inet_sport, dport: inet->inet_dport, sk);
317 if (IS_ERR(ptr: rt)) {
318 err = PTR_ERR(ptr: rt);
319 rt = NULL;
320 goto failure;
321 }
322 tp->tcp_usec_ts = dst_tcp_usec_ts(dst: &rt->dst);
323 /* OK, now commit destination to socket. */
324 sk->sk_gso_type = SKB_GSO_TCPV4;
325 sk_setup_caps(sk, dst: &rt->dst);
326 rt = NULL;
327
328 if (likely(!tp->repair)) {
329 if (!tp->write_seq)
330 WRITE_ONCE(tp->write_seq,
331 secure_tcp_seq(inet->inet_saddr,
332 inet->inet_daddr,
333 inet->inet_sport,
334 usin->sin_port));
335 WRITE_ONCE(tp->tsoffset,
336 secure_tcp_ts_off(net, inet->inet_saddr,
337 inet->inet_daddr));
338 }
339
340 atomic_set(v: &inet->inet_id, i: get_random_u16());
341
342 if (tcp_fastopen_defer_connect(sk, err: &err))
343 return err;
344 if (err)
345 goto failure;
346
347 err = tcp_connect(sk);
348
349 if (err)
350 goto failure;
351
352 return 0;
353
354failure:
355 /*
356 * This unhashes the socket and releases the local port,
357 * if necessary.
358 */
359 tcp_set_state(sk, state: TCP_CLOSE);
360 inet_bhash2_reset_saddr(sk);
361 ip_rt_put(rt);
362 sk->sk_route_caps = 0;
363 inet->inet_dport = 0;
364 return err;
365}
366EXPORT_IPV6_MOD(tcp_v4_connect);
367
368/*
 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
372 */
373void tcp_v4_mtu_reduced(struct sock *sk)
374{
375 struct inet_sock *inet = inet_sk(sk);
376 struct dst_entry *dst;
377 u32 mtu;
378
379 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
380 return;
381 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
382 dst = inet_csk_update_pmtu(sk, mtu);
383 if (!dst)
384 return;
385
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
389 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
390 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
391
392 mtu = dst_mtu(dst);
393
394 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
395 ip_sk_accept_pmtu(sk) &&
396 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
397 tcp_sync_mss(sk, pmtu: mtu);
398
399 /* Resend the TCP packet because it's
400 * clear that the old packet has been
401 * dropped. This is the new "fast" path mtu
402 * discovery.
403 */
404 tcp_simple_retransmit(sk);
405 } /* else let the usual retransmit timer handle it */
406}
407EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
408
409static void do_redirect(struct sk_buff *skb, struct sock *sk)
410{
411 struct dst_entry *dst = __sk_dst_check(sk, cookie: 0);
412
413 if (dst)
414 dst->ops->redirect(dst, sk, skb);
415}
416
417
418/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
419void tcp_req_err(struct sock *sk, u32 seq, bool abort)
420{
421 struct request_sock *req = inet_reqsk(sk);
422 struct net *net = sock_net(sk);
423
424 /* ICMPs are not backlogged, hence we cannot get
425 * an established socket here.
426 */
427 if (seq != tcp_rsk(req)->snt_isn) {
428 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
429 } else if (abort) {
430 /*
431 * Still in SYN_RECV, just remove it silently.
432 * There is no good way to pass the error to the newly
433 * created socket, and POSIX does not want network
434 * errors returned from accept().
435 */
436 inet_csk_reqsk_queue_drop(sk: req->rsk_listener, req);
437 tcp_listendrop(sk: req->rsk_listener);
438 }
439 reqsk_put(req);
440}
441EXPORT_IPV6_MOD(tcp_req_err);
442
443/* TCP-LD (RFC 6069) logic */
444void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
445{
446 struct inet_connection_sock *icsk = inet_csk(sk);
447 struct tcp_sock *tp = tcp_sk(sk);
448 struct sk_buff *skb;
449 s32 remaining;
450 u32 delta_us;
451
452 if (sock_owned_by_user(sk))
453 return;
454
455 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
456 !icsk->icsk_backoff)
457 return;
458
459 skb = tcp_rtx_queue_head(sk);
460 if (WARN_ON_ONCE(!skb))
461 return;
462
463 icsk->icsk_backoff--;
464 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
465 icsk->icsk_rto = inet_csk_rto_backoff(icsk, max_when: tcp_rto_max(sk));
466
467 tcp_mstamp_refresh(tp);
468 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
469 remaining = icsk->icsk_rto - usecs_to_jiffies(u: delta_us);
470
471 if (remaining > 0) {
472 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, when: remaining, pace_delay: false);
473 } else {
474 /* RTO revert clocked out retransmission.
475 * Will retransmit now.
476 */
477 tcp_retransmit_timer(sk);
478 }
479}
480EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
481
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment,
 * header points to the first 8 bytes of the TCP header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket, the ICMP is just dropped,
 * and for some paths there is no check at all.
 * A more general error queue for queuing errors for later handling
 * would probably be better.
 */
497
498int tcp_v4_err(struct sk_buff *skb, u32 info)
499{
500 const struct iphdr *iph = (const struct iphdr *)skb->data;
501 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
502 struct net *net = dev_net_rcu(dev: skb->dev);
503 const int type = icmp_hdr(skb)->type;
504 const int code = icmp_hdr(skb)->code;
505 struct request_sock *fastopen;
506 struct tcp_sock *tp;
507 u32 seq, snd_una;
508 struct sock *sk;
509 int err;
510
511 sk = __inet_lookup_established(net, saddr: iph->daddr, sport: th->dest, daddr: iph->saddr,
512 ntohs(th->source), dif: inet_iif(skb), sdif: 0);
513 if (!sk) {
514 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
515 return -ENOENT;
516 }
517 if (sk->sk_state == TCP_TIME_WAIT) {
518 /* To increase the counter of ignored icmps for TCP-AO */
519 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
520 inet_twsk_put(tw: inet_twsk(sk));
521 return 0;
522 }
523 seq = ntohl(th->seq);
524 if (sk->sk_state == TCP_NEW_SYN_RECV) {
525 tcp_req_err(sk, seq, abort: type == ICMP_PARAMETERPROB ||
526 type == ICMP_TIME_EXCEEDED ||
527 (type == ICMP_DEST_UNREACH &&
528 (code == ICMP_NET_UNREACH ||
529 code == ICMP_HOST_UNREACH)));
530 return 0;
531 }
532
533 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
534 sock_put(sk);
535 return 0;
536 }
537
538 bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC 1191) special case:
	 * we can receive locally generated ICMP messages while the socket
	 * is held.
	 */
544 if (sock_owned_by_user(sk)) {
545 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
546 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
547 }
548 if (sk->sk_state == TCP_CLOSE)
549 goto out;
550
551 if (static_branch_unlikely(&ip4_min_ttl)) {
552 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
553 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
554 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
555 goto out;
556 }
557 }
558
559 tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
561 fastopen = rcu_dereference(tp->fastopen_rsk);
562 snd_una = fastopen ? tcp_rsk(req: fastopen)->snt_isn : tp->snd_una;
563 if (sk->sk_state != TCP_LISTEN &&
564 !between(seq1: seq, seq2: snd_una, seq3: tp->snd_nxt)) {
565 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
566 goto out;
567 }
568
569 switch (type) {
570 case ICMP_REDIRECT:
571 if (!sock_owned_by_user(sk))
572 do_redirect(skb, sk);
573 goto out;
574 case ICMP_SOURCE_QUENCH:
575 /* Just silently ignore these. */
576 goto out;
577 case ICMP_PARAMETERPROB:
578 err = EPROTO;
579 break;
580 case ICMP_DEST_UNREACH:
581 if (code > NR_ICMP_UNREACH)
582 goto out;
583
584 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes,
			 * so they should go through unfragmented).
			 */
589 if (sk->sk_state == TCP_LISTEN)
590 goto out;
591
592 WRITE_ONCE(tp->mtu_info, info);
593 if (!sock_owned_by_user(sk)) {
594 tcp_v4_mtu_reduced(sk);
595 } else {
596 if (!test_and_set_bit(nr: TCP_MTU_REDUCED_DEFERRED, addr: &sk->sk_tsq_flags))
597 sock_hold(sk);
598 }
599 goto out;
600 }
601
602 err = icmp_err_convert[code].errno;
		/* Check whether this ICMP message allows reverting the RTO
		 * backoff (see RFC 6069).
		 */
606 if (!fastopen &&
607 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
608 tcp_ld_RTO_revert(sk, seq);
609 break;
610 case ICMP_TIME_EXCEEDED:
611 err = EHOSTUNREACH;
612 break;
613 default:
614 goto out;
615 }
616
617 switch (sk->sk_state) {
618 case TCP_SYN_SENT:
619 case TCP_SYN_RECV:
620 /* Only in fast or simultaneous open. If a fast open socket is
621 * already accepted it is treated as a connected one below.
622 */
623 if (fastopen && !fastopen->sk)
624 break;
625
626 ip_icmp_error(sk, skb, err, port: th->dest, info, payload: (u8 *)th);
627
628 if (!sock_owned_by_user(sk))
629 tcp_done_with_error(sk, err);
630 else
631 WRITE_ONCE(sk->sk_err_soft, err);
632 goto out;
633 }
634
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
	 * obsoleted by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors as ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */
650
651 if (!sock_owned_by_user(sk) &&
652 inet_test_bit(RECVERR, sk)) {
653 WRITE_ONCE(sk->sk_err, err);
654 sk_error_report(sk);
655 } else { /* Only an error on timeout */
656 WRITE_ONCE(sk->sk_err_soft, err);
657 }
658
659out:
660 bh_unlock_sock(sk);
661 sock_put(sk);
662 return 0;
663}
664
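/* Prepare an outgoing segment for checksum offload: seed th->check with the
 * pseudo-header checksum and record where the final checksum must be written,
 * so either the NIC or the software fallback can complete it.
 */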
665void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
666{
667 struct tcphdr *th = tcp_hdr(skb);
668
669 th->check = ~tcp_v4_check(len: skb->len, saddr, daddr, base: 0);
670 skb->csum_start = skb_transport_header(skb) - skb->head;
671 skb->csum_offset = offsetof(struct tcphdr, check);
672}
673
674/* This routine computes an IPv4 TCP checksum. */
675void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
676{
677 const struct inet_sock *inet = inet_sk(sk);
678
679 __tcp_v4_send_check(skb, saddr: inet->inet_saddr, daddr: inet->inet_daddr);
680}
681EXPORT_IPV6_MOD(tcp_v4_send_check);
682
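/* Number of 32-bit words of TCP option space available in a locally generated
 * reply (RST/ACK): MAX_TCP_OPTION_SPACE (40 bytes) expressed in __be32 units.
 */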
683#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
684
685static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
686 const struct tcp_ao_hdr *aoh,
687 struct ip_reply_arg *arg, struct tcphdr *reply,
688 __be32 reply_options[REPLY_OPTIONS_LEN])
689{
690#ifdef CONFIG_TCP_AO
691 int sdif = tcp_v4_sdif(skb);
692 int dif = inet_iif(skb);
693 int l3index = sdif ? dif : 0;
694 bool allocated_traffic_key;
695 struct tcp_ao_key *key;
696 char *traffic_key;
697 bool drop = true;
698 u32 ao_sne = 0;
699 u8 keyid;
700
701 rcu_read_lock();
702 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
703 &key, &traffic_key, &allocated_traffic_key,
704 &keyid, &ao_sne))
705 goto out;
706
707 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
708 (aoh->rnext_keyid << 8) | keyid);
709 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
710 reply->doff = arg->iov[0].iov_len / 4;
711
712 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
713 key, traffic_key,
714 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
715 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
716 reply, ao_sne))
717 goto out;
718 drop = false;
719out:
720 rcu_read_unlock();
721 if (allocated_traffic_key)
722 kfree(traffic_key);
723 return drop;
724#else
725 return true;
726#endif
727}
728
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
741
742static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
743 enum sk_rst_reason reason)
744{
745 const struct tcphdr *th = tcp_hdr(skb);
746 struct {
747 struct tcphdr th;
748 __be32 opt[REPLY_OPTIONS_LEN];
749 } rep;
750 const __u8 *md5_hash_location = NULL;
751 const struct tcp_ao_hdr *aoh;
752 struct ip_reply_arg arg;
753#ifdef CONFIG_TCP_MD5SIG
754 struct tcp_md5sig_key *key = NULL;
755 unsigned char newhash[16];
756 struct sock *sk1 = NULL;
757 int genhash;
758#endif
759 u64 transmit_time = 0;
760 struct sock *ctl_sk;
761 struct net *net;
762 u32 txhash = 0;
763
764 /* Never send a reset in response to a reset. */
765 if (th->rst)
766 return;
767
	/* If sk is not NULL, we did a successful lookup and the incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
771 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
772 return;
773
774 /* Swap the send and the receive. */
775 memset(s: &rep, c: 0, n: sizeof(rep));
776 rep.th.dest = th->source;
777 rep.th.source = th->dest;
778 rep.th.doff = sizeof(struct tcphdr) / 4;
779 rep.th.rst = 1;
780
781 if (th->ack) {
782 rep.th.seq = th->ack_seq;
783 } else {
784 rep.th.ack = 1;
785 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
786 skb->len - (th->doff << 2));
787 }
788
789 memset(s: &arg, c: 0, n: sizeof(arg));
790 arg.iov[0].iov_base = (unsigned char *)&rep;
791 arg.iov[0].iov_len = sizeof(rep.th);
792
793 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
794
795 /* Invalid TCP option size or twice included auth */
796 if (tcp_parse_auth_options(th: tcp_hdr(skb), md5_hash: &md5_hash_location, aoh: &aoh))
797 return;
798
799 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, arg: &arg, reply: &rep.th, reply_options: rep.opt))
800 return;
801
802#ifdef CONFIG_TCP_MD5SIG
803 rcu_read_lock();
804 if (sk && sk_fullsock(sk)) {
805 const union tcp_md5_addr *addr;
806 int l3index;
807
		/* If sdif is set, the packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
811 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
812 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
813 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
814 } else if (md5_hash_location) {
815 const union tcp_md5_addr *addr;
816 int sdif = tcp_v4_sdif(skb);
817 int dif = inet_iif(skb);
818 int l3index;
819
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the MD5 key through
		 * the listening socket. We are not loosening security here:
		 * the incoming packet is checked against the MD5 hash of the
		 * key we find, and no RST is generated if the hash doesn't
		 * match.
		 */
827 sk1 = __inet_lookup_listener(net, NULL, doff: 0, saddr: ip_hdr(skb)->saddr,
828 sport: th->source, daddr: ip_hdr(skb)->daddr,
829 ntohs(th->source), dif, sdif);
		/* don't send an RST if we can't find a key */
831 if (!sk1)
832 goto out;
833
		/* If sdif is set, the packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
837 l3index = sdif ? dif : 0;
838 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
839 key = tcp_md5_do_lookup(sk: sk1, l3index, addr, AF_INET);
840 if (!key)
841 goto out;
842
843
844 genhash = tcp_v4_md5_hash_skb(md5_hash: newhash, key, NULL, skb);
845 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
846 goto out;
847
848 }
849
850 if (key) {
851 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
852 (TCPOPT_NOP << 16) |
853 (TCPOPT_MD5SIG << 8) |
854 TCPOLEN_MD5SIG);
855 /* Update length and the length the header thinks exists */
856 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
857 rep.th.doff = arg.iov[0].iov_len / 4;
858
859 tcp_v4_md5_hash_hdr(md5_hash: (__u8 *) &rep.opt[1],
860 key, daddr: ip_hdr(skb)->saddr,
861 saddr: ip_hdr(skb)->daddr, th: &rep.th);
862 }
863#endif
864 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
865 if (rep.opt[0] == 0) {
866 __be32 mrst = mptcp_reset_option(skb);
867
868 if (mrst) {
869 rep.opt[0] = mrst;
870 arg.iov[0].iov_len += sizeof(mrst);
871 rep.th.doff = arg.iov[0].iov_len / 4;
872 }
873 }
874
875 arg.csum = csum_tcpudp_nofold(saddr: ip_hdr(skb)->daddr,
876 daddr: ip_hdr(skb)->saddr, /* XXX */
877 len: arg.iov[0].iov_len, IPPROTO_TCP, sum: 0);
878 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
879 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
880
	/* When the socket is gone, all binding information is lost and
	 * routing might fail. No choice here: if we chose to force the
	 * input interface, we would misroute in the case of an asymmetric
	 * route.
	 */
885 if (sk)
886 arg.bound_dev_if = sk->sk_bound_dev_if;
887
888 trace_tcp_send_reset(sk, skb__nullable: skb, reason);
889
890 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
891 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
892
893 /* ECN bits of TW reset are cleared */
894 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
895 arg.uid = sock_net_uid(net, sk: sk && sk_fullsock(sk) ? sk : NULL);
896 local_bh_disable();
897 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
898 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
899
900 sock_net_set(sk: ctl_sk, net);
901 if (sk) {
902 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
903 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
904 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
905 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
906 transmit_time = tcp_transmit_time(sk);
907 xfrm_sk_clone_policy(sk: ctl_sk, osk: sk);
908 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
909 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
910 } else {
911 ctl_sk->sk_mark = 0;
912 ctl_sk->sk_priority = 0;
913 }
914 ip_send_unicast_reply(sk: ctl_sk, orig_sk: sk,
915 skb, sopt: &TCP_SKB_CB(skb)->header.h4.opt,
916 daddr: ip_hdr(skb)->saddr, saddr: ip_hdr(skb)->daddr,
917 arg: &arg, len: arg.iov[0].iov_len,
918 transmit_time, txhash);
919
920 xfrm_sk_free_policy(sk: ctl_sk);
921 sock_net_set(sk: ctl_sk, net: &init_net);
922 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
923 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
924 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
925 local_bh_enable();
926
927#ifdef CONFIG_TCP_MD5SIG
928out:
929 rcu_read_unlock();
930#endif
931}
932
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is certainly ugly. What can I do?
 */
936
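/* Build and send a bare ACK on behalf of a socket we do not fully own
 * (TIME-WAIT or request sockets). The reply is assembled on the stack,
 * optionally signed with TCP MD5 or TCP-AO, and transmitted through the
 * per-CPU control socket.
 */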
937static void tcp_v4_send_ack(const struct sock *sk,
938 struct sk_buff *skb, u32 seq, u32 ack,
939 u32 win, u32 tsval, u32 tsecr, int oif,
940 struct tcp_key *key,
941 int reply_flags, u8 tos, u32 txhash)
942{
943 const struct tcphdr *th = tcp_hdr(skb);
944 struct {
945 struct tcphdr th;
946 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
947 } rep;
948 struct net *net = sock_net(sk);
949 struct ip_reply_arg arg;
950 struct sock *ctl_sk;
951 u64 transmit_time;
952
953 memset(s: &rep.th, c: 0, n: sizeof(struct tcphdr));
954 memset(s: &arg, c: 0, n: sizeof(arg));
955
956 arg.iov[0].iov_base = (unsigned char *)&rep;
957 arg.iov[0].iov_len = sizeof(rep.th);
958 if (tsecr) {
959 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
960 (TCPOPT_TIMESTAMP << 8) |
961 TCPOLEN_TIMESTAMP);
962 rep.opt[1] = htonl(tsval);
963 rep.opt[2] = htonl(tsecr);
964 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
965 }
966
967 /* Swap the send and the receive. */
968 rep.th.dest = th->source;
969 rep.th.source = th->dest;
970 rep.th.doff = arg.iov[0].iov_len / 4;
971 rep.th.seq = htonl(seq);
972 rep.th.ack_seq = htonl(ack);
973 rep.th.ack = 1;
974 rep.th.window = htons(win);
975
976#ifdef CONFIG_TCP_MD5SIG
977 if (tcp_key_is_md5(key)) {
978 int offset = (tsecr) ? 3 : 0;
979
980 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
981 (TCPOPT_NOP << 16) |
982 (TCPOPT_MD5SIG << 8) |
983 TCPOLEN_MD5SIG);
984 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
985 rep.th.doff = arg.iov[0].iov_len/4;
986
987 tcp_v4_md5_hash_hdr(md5_hash: (__u8 *) &rep.opt[offset],
988 key: key->md5_key, daddr: ip_hdr(skb)->saddr,
989 saddr: ip_hdr(skb)->daddr, th: &rep.th);
990 }
991#endif
992#ifdef CONFIG_TCP_AO
993 if (tcp_key_is_ao(key)) {
994 int offset = (tsecr) ? 3 : 0;
995
996 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
997 (tcp_ao_len(key->ao_key) << 16) |
998 (key->ao_key->sndid << 8) |
999 key->rcv_next);
1000 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
1001 rep.th.doff = arg.iov[0].iov_len / 4;
1002
1003 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1004 key->ao_key, key->traffic_key,
1005 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1006 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1007 &rep.th, key->sne);
1008 }
1009#endif
1010 arg.flags = reply_flags;
1011 arg.csum = csum_tcpudp_nofold(saddr: ip_hdr(skb)->daddr,
1012 daddr: ip_hdr(skb)->saddr, /* XXX */
1013 len: arg.iov[0].iov_len, IPPROTO_TCP, sum: 0);
1014 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1015 if (oif)
1016 arg.bound_dev_if = oif;
1017 arg.tos = tos;
1018 arg.uid = sock_net_uid(net, sk: sk_fullsock(sk) ? sk : NULL);
1019 local_bh_disable();
1020 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1021 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1022 sock_net_set(sk: ctl_sk, net);
1023 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1024 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1025 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1026 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1027 transmit_time = tcp_transmit_time(sk);
1028 ip_send_unicast_reply(sk: ctl_sk, orig_sk: sk,
1029 skb, sopt: &TCP_SKB_CB(skb)->header.h4.opt,
1030 daddr: ip_hdr(skb)->saddr, saddr: ip_hdr(skb)->daddr,
1031 arg: &arg, len: arg.iov[0].iov_len,
1032 transmit_time, txhash);
1033
1034 sock_net_set(sk: ctl_sk, net: &init_net);
1035 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1036 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1037 local_bh_enable();
1038}
1039
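/* Acknowledge a segment that arrived for a TIME-WAIT socket, as decided by
 * tcp_timewait_state_process(). Any MD5/AO signing state comes from the
 * timewait copy of the keys, and the timewait reference is dropped before
 * returning.
 */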
1040static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1041 enum tcp_tw_status tw_status)
1042{
1043 struct inet_timewait_sock *tw = inet_twsk(sk);
1044 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1045 struct tcp_key key = {};
1046 u8 tos = tw->tw_tos;
1047
	/* Clear the ECN bits only on TW ACKs for out-of-window data or
	 * PAWS-rejected segments, while leaving the ECN bits of other TW
	 * ACKs intact, so that those ACKs are not placed in a different
	 * service queue (Classic rather than L4S).
	 */
1052 if (tw_status == TCP_TW_ACK_OOW)
1053 tos &= ~INET_ECN_MASK;
1054
1055#ifdef CONFIG_TCP_AO
1056 struct tcp_ao_info *ao_info;
1057
1058 if (static_branch_unlikely(&tcp_ao_needed.key)) {
1059 /* FIXME: the segment to-be-acked is not verified yet */
1060 ao_info = rcu_dereference(tcptw->ao_info);
1061 if (ao_info) {
1062 const struct tcp_ao_hdr *aoh;
1063
1064 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1065 inet_twsk_put(tw);
1066 return;
1067 }
1068
1069 if (aoh)
1070 key.ao_key = tcp_ao_established_key(sk, ao_info,
1071 aoh->rnext_keyid, -1);
1072 }
1073 }
1074 if (key.ao_key) {
1075 struct tcp_ao_key *rnext_key;
1076
1077 key.traffic_key = snd_other_key(key.ao_key);
1078 key.sne = READ_ONCE(ao_info->snd_sne);
1079 rnext_key = READ_ONCE(ao_info->rnext_key);
1080 key.rcv_next = rnext_key->rcvid;
1081 key.type = TCP_KEY_AO;
1082#else
1083 if (0) {
1084#endif
1085 } else if (static_branch_tcp_md5()) {
1086 key.md5_key = tcp_twsk_md5_key(tcptw);
1087 if (key.md5_key)
1088 key.type = TCP_KEY_MD5;
1089 }
1090
1091 tcp_v4_send_ack(sk, skb,
1092 seq: tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1093 win: tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1094 tsval: tcp_tw_tsval(tcptw),
1095 READ_ONCE(tcptw->tw_ts_recent),
1096 oif: tw->tw_bound_dev_if, key: &key,
1097 reply_flags: tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1098 tos,
1099 txhash: tw->tw_txhash);
1100
1101 inet_twsk_put(tw);
1102}
1103
1104static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1105 struct request_sock *req)
1106{
1107 struct tcp_key key = {};
1108
1109 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1110 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1111 */
1112 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1113 tcp_sk(sk)->snd_nxt;
1114
1115#ifdef CONFIG_TCP_AO
1116 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1117 tcp_rsk_used_ao(req)) {
1118 const union tcp_md5_addr *addr;
1119 const struct tcp_ao_hdr *aoh;
1120 int l3index;
1121
1122 /* Invalid TCP option size or twice included auth */
1123 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1124 return;
1125 if (!aoh)
1126 return;
1127
1128 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1129 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1130 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1131 aoh->rnext_keyid, -1);
1132 if (unlikely(!key.ao_key)) {
1133 /* Send ACK with any matching MKT for the peer */
1134 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* The matching key disappeared (did the user remove
			 * it?); let the handshake time out.
			 */
1138 if (!key.ao_key) {
1139 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1140 addr,
1141 ntohs(tcp_hdr(skb)->source),
1142 &ip_hdr(skb)->daddr,
1143 ntohs(tcp_hdr(skb)->dest));
1144 return;
1145 }
1146 }
1147 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1148 if (!key.traffic_key)
1149 return;
1150
1151 key.type = TCP_KEY_AO;
1152 key.rcv_next = aoh->keyid;
1153 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1154#else
1155 if (0) {
1156#endif
1157 } else if (static_branch_tcp_md5()) {
1158 const union tcp_md5_addr *addr;
1159 int l3index;
1160
1161 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1162 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1163 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1164 if (key.md5_key)
1165 key.type = TCP_KEY_MD5;
1166 }
1167
	/* ECN bits are cleared here, as for TW ACKs of out-of-window or PAWS-rejected data */
1169 tcp_v4_send_ack(sk, skb, seq,
1170 ack: tcp_rsk(req)->rcv_nxt,
1171 win: tcp_synack_window(req) >> inet_rsk(sk: req)->rcv_wscale,
1172 tsval: tcp_rsk_tsval(treq: tcp_rsk(req)),
1173 tsecr: req->ts_recent,
1174 oif: 0, key: &key,
1175 reply_flags: inet_rsk(sk: req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1176 tos: ip_hdr(skb)->tos & ~INET_ECN_MASK,
1177 READ_ONCE(tcp_rsk(req)->txhash));
1178 if (tcp_key_is_ao(key: &key))
1179 kfree(objp: key.traffic_key);
1180}
1181
1182/*
1183 * Send a SYN-ACK after having received a SYN.
1184 * This still operates on a request_sock only, not on a big
1185 * socket.
1186 */
1187static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1188 struct flowi *fl,
1189 struct request_sock *req,
1190 struct tcp_fastopen_cookie *foc,
1191 enum tcp_synack_type synack_type,
1192 struct sk_buff *syn_skb)
1193{
1194 struct inet_request_sock *ireq = inet_rsk(sk: req);
1195 struct flowi4 fl4;
1196 int err = -1;
1197 struct sk_buff *skb;
1198 u8 tos;
1199
1200 /* First, grab a route. */
1201 if (!dst && (dst = inet_csk_route_req(sk, fl4: &fl4, req)) == NULL)
1202 return -1;
1203
1204 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1205
1206 if (skb) {
1207 tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1208 __tcp_v4_send_check(skb, saddr: ireq->ir_loc_addr, daddr: ireq->ir_rmt_addr);
1209
1210 tos = READ_ONCE(inet_sk(sk)->tos);
1211
1212 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1213 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1214 (tos & INET_ECN_MASK);
1215
1216 if (!INET_ECN_is_capable(dsfield: tos) &&
1217 tcp_bpf_ca_needs_ecn(sk: (struct sock *)req))
1218 tos |= INET_ECN_ECT_0;
1219
1220 rcu_read_lock();
1221 err = ip_build_and_send_pkt(skb, sk, saddr: ireq->ir_loc_addr,
1222 daddr: ireq->ir_rmt_addr,
1223 rcu_dereference(ireq->ireq_opt),
1224 tos);
1225 rcu_read_unlock();
1226 err = net_xmit_eval(err);
1227 }
1228
1229 return err;
1230}
1231
1232/*
1233 * IPv4 request_sock destructor.
1234 */
1235static void tcp_v4_reqsk_destructor(struct request_sock *req)
1236{
1237 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1238}
1239
1240#ifdef CONFIG_TCP_MD5SIG
1241/*
1242 * RFC2385 MD5 checksumming requires a mapping of
1243 * IP address->MD5 Key.
1244 * We need to maintain these in the sk structure.
1245 */
1246
1247DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1248EXPORT_IPV6_MOD(tcp_md5_needed);
1249
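/* Key specificity ordering for MD5 lookups: a key bound to an L3 master
 * device beats an unbound key, and between otherwise equal keys the one
 * with the longer prefix wins.
 */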
1250static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1251{
1252 if (!old)
1253 return true;
1254
1255 /* l3index always overrides non-l3index */
1256 if (old->l3index && new->l3index == 0)
1257 return false;
1258 if (old->l3index == 0 && new->l3index)
1259 return true;
1260
1261 return old->prefixlen < new->prefixlen;
1262}
1263
1264/* Find the Key structure for an address. */
1265struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1266 const union tcp_md5_addr *addr,
1267 int family, bool any_l3index)
1268{
1269 const struct tcp_sock *tp = tcp_sk(sk);
1270 struct tcp_md5sig_key *key;
1271 const struct tcp_md5sig_info *md5sig;
1272 __be32 mask;
1273 struct tcp_md5sig_key *best_match = NULL;
1274 bool match;
1275
1276 /* caller either holds rcu_read_lock() or socket lock */
1277 md5sig = rcu_dereference_check(tp->md5sig_info,
1278 lockdep_sock_is_held(sk));
1279 if (!md5sig)
1280 return NULL;
1281
1282 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1283 lockdep_sock_is_held(sk)) {
1284 if (key->family != family)
1285 continue;
1286 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1287 key->l3index != l3index)
1288 continue;
1289 if (family == AF_INET) {
1290 mask = inet_make_mask(logmask: key->prefixlen);
1291 match = (key->addr.a4.s_addr & mask) ==
1292 (addr->a4.s_addr & mask);
1293#if IS_ENABLED(CONFIG_IPV6)
1294 } else if (family == AF_INET6) {
1295 match = ipv6_prefix_equal(addr1: &key->addr.a6, addr2: &addr->a6,
1296 prefixlen: key->prefixlen);
1297#endif
1298 } else {
1299 match = false;
1300 }
1301
1302 if (match && better_md5_match(old: best_match, new: key))
1303 best_match = key;
1304 }
1305 return best_match;
1306}
1307EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1308
1309static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1310 const union tcp_md5_addr *addr,
1311 int family, u8 prefixlen,
1312 int l3index, u8 flags)
1313{
1314 const struct tcp_sock *tp = tcp_sk(sk);
1315 struct tcp_md5sig_key *key;
1316 unsigned int size = sizeof(struct in_addr);
1317 const struct tcp_md5sig_info *md5sig;
1318
1319 /* caller either holds rcu_read_lock() or socket lock */
1320 md5sig = rcu_dereference_check(tp->md5sig_info,
1321 lockdep_sock_is_held(sk));
1322 if (!md5sig)
1323 return NULL;
1324#if IS_ENABLED(CONFIG_IPV6)
1325 if (family == AF_INET6)
1326 size = sizeof(struct in6_addr);
1327#endif
1328 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1329 lockdep_sock_is_held(sk)) {
1330 if (key->family != family)
1331 continue;
1332 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1333 continue;
1334 if (key->l3index != l3index)
1335 continue;
1336 if (!memcmp(&key->addr, addr, size) &&
1337 key->prefixlen == prefixlen)
1338 return key;
1339 }
1340 return NULL;
1341}
1342
1343struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1344 const struct sock *addr_sk)
1345{
1346 const union tcp_md5_addr *addr;
1347 int l3index;
1348
1349 l3index = l3mdev_master_ifindex_by_index(net: sock_net(sk),
1350 ifindex: addr_sk->sk_bound_dev_if);
1351 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1352 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1353}
1354EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1355
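/* Lazily allocate the per-socket MD5 key list on the first key addition.
 * GSO is disabled here since each transmitted segment must carry its own
 * correctly computed MD5 signature, which segmentation offload cannot
 * preserve.
 */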
1356static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1357{
1358 struct tcp_sock *tp = tcp_sk(sk);
1359 struct tcp_md5sig_info *md5sig;
1360
1361 md5sig = kmalloc(sizeof(*md5sig), gfp);
1362 if (!md5sig)
1363 return -ENOMEM;
1364
1365 sk_gso_disable(sk);
1366 INIT_HLIST_HEAD(&md5sig->head);
1367 rcu_assign_pointer(tp->md5sig_info, md5sig);
1368 return 0;
1369}
1370
1371/* This can be called on a newly created socket, from other files */
1372static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1373 int family, u8 prefixlen, int l3index, u8 flags,
1374 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1375{
1376 /* Add Key to the list */
1377 struct tcp_md5sig_key *key;
1378 struct tcp_sock *tp = tcp_sk(sk);
1379 struct tcp_md5sig_info *md5sig;
1380
1381 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1382 if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() tells KCSAN that we do not care about
		 * key mismatches, since changing the MD5 key on live
		 * flows can lead to packet drops.
		 */
1389 data_race(memcpy(key->key, newkey, newkeylen));
1390
		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch the new key->keylen
		 * value but the old key->key[]; this is the reason we use
		 * __GFP_ZERO at sock_kmalloc() time below these lines.
		 */
1396 WRITE_ONCE(key->keylen, newkeylen);
1397
1398 return 0;
1399 }
1400
1401 md5sig = rcu_dereference_protected(tp->md5sig_info,
1402 lockdep_sock_is_held(sk));
1403
1404 key = sock_kmalloc(sk, size: sizeof(*key), priority: gfp | __GFP_ZERO);
1405 if (!key)
1406 return -ENOMEM;
1407
1408 memcpy(to: key->key, from: newkey, len: newkeylen);
1409 key->keylen = newkeylen;
1410 key->family = family;
1411 key->prefixlen = prefixlen;
1412 key->l3index = l3index;
1413 key->flags = flags;
1414 memcpy(to: &key->addr, from: addr,
1415 len: (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1416 sizeof(struct in_addr));
1417 hlist_add_head_rcu(n: &key->node, h: &md5sig->head);
1418 return 0;
1419}
1420
1421int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1422 int family, u8 prefixlen, int l3index, u8 flags,
1423 const u8 *newkey, u8 newkeylen)
1424{
1425 struct tcp_sock *tp = tcp_sk(sk);
1426
1427 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1428 if (tcp_md5_alloc_sigpool())
1429 return -ENOMEM;
1430
1431 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1432 tcp_md5_release_sigpool();
1433 return -ENOMEM;
1434 }
1435
1436 if (!static_branch_inc(&tcp_md5_needed.key)) {
1437 struct tcp_md5sig_info *md5sig;
1438
1439 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1440 rcu_assign_pointer(tp->md5sig_info, NULL);
1441 kfree_rcu(md5sig, rcu);
1442 tcp_md5_release_sigpool();
1443 return -EUSERS;
1444 }
1445 }
1446
1447 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1448 newkey, newkeylen, GFP_KERNEL);
1449}
1450EXPORT_IPV6_MOD(tcp_md5_do_add);
1451
1452int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1453 int family, u8 prefixlen, int l3index,
1454 struct tcp_md5sig_key *key)
1455{
1456 struct tcp_sock *tp = tcp_sk(sk);
1457
1458 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1459 tcp_md5_add_sigpool();
1460
1461 if (tcp_md5sig_info_add(sk, gfp: sk_gfp_mask(sk, GFP_ATOMIC))) {
1462 tcp_md5_release_sigpool();
1463 return -ENOMEM;
1464 }
1465
1466 if (!static_key_fast_inc_not_disabled(key: &tcp_md5_needed.key.key)) {
1467 struct tcp_md5sig_info *md5sig;
1468
1469 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1470 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1471 rcu_assign_pointer(tp->md5sig_info, NULL);
1472 kfree_rcu(md5sig, rcu);
1473 tcp_md5_release_sigpool();
1474 return -EUSERS;
1475 }
1476 }
1477
1478 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1479 flags: key->flags, newkey: key->key, newkeylen: key->keylen,
1480 gfp: sk_gfp_mask(sk, GFP_ATOMIC));
1481}
1482EXPORT_IPV6_MOD(tcp_md5_key_copy);
1483
1484int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1485 u8 prefixlen, int l3index, u8 flags)
1486{
1487 struct tcp_md5sig_key *key;
1488
1489 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1490 if (!key)
1491 return -ENOENT;
1492 hlist_del_rcu(n: &key->node);
1493 atomic_sub(i: sizeof(*key), v: &sk->sk_omem_alloc);
1494 kfree_rcu(key, rcu);
1495 return 0;
1496}
1497EXPORT_IPV6_MOD(tcp_md5_do_del);
1498
1499void tcp_clear_md5_list(struct sock *sk)
1500{
1501 struct tcp_sock *tp = tcp_sk(sk);
1502 struct tcp_md5sig_key *key;
1503 struct hlist_node *n;
1504 struct tcp_md5sig_info *md5sig;
1505
1506 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1507
1508 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1509 hlist_del(n: &key->node);
1510 atomic_sub(i: sizeof(*key), v: &sk->sk_omem_alloc);
1511 kfree(objp: key);
1512 }
1513}
1514
1515static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1516 sockptr_t optval, int optlen)
1517{
1518 struct tcp_md5sig cmd;
1519 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1520 const union tcp_md5_addr *addr;
1521 u8 prefixlen = 32;
1522 int l3index = 0;
1523 bool l3flag;
1524 u8 flags;
1525
1526 if (optlen < sizeof(cmd))
1527 return -EINVAL;
1528
1529 if (copy_from_sockptr(dst: &cmd, src: optval, size: sizeof(cmd)))
1530 return -EFAULT;
1531
1532 if (sin->sin_family != AF_INET)
1533 return -EINVAL;
1534
1535 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1536 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1537
1538 if (optname == TCP_MD5SIG_EXT &&
1539 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1540 prefixlen = cmd.tcpm_prefixlen;
1541 if (prefixlen > 32)
1542 return -EINVAL;
1543 }
1544
1545 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1546 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1547 struct net_device *dev;
1548
1549 rcu_read_lock();
1550 dev = dev_get_by_index_rcu(net: sock_net(sk), ifindex: cmd.tcpm_ifindex);
1551 if (dev && netif_is_l3_master(dev))
1552 l3index = dev->ifindex;
1553
1554 rcu_read_unlock();
1555
		/* It is OK to test whether dev/l3index were set outside of
		 * RCU; right now the device MUST be an L3 master.
		 */
1559 if (!dev || !l3index)
1560 return -EINVAL;
1561 }
1562
1563 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1564
1565 if (!cmd.tcpm_keylen)
1566 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1567
1568 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1569 return -EINVAL;
1570
1571 /* Don't allow keys for peers that have a matching TCP-AO key.
1572 * See the comment in tcp_ao_add_cmd()
1573 */
1574 if (tcp_ao_required(sk, saddr: addr, AF_INET, l3index: l3flag ? l3index : -1, stat_inc: false))
1575 return -EKEYREJECTED;
1576
1577 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1578 newkey: cmd.tcpm_key, newkeylen: cmd.tcpm_keylen);
1579}
1580
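/* Feed the RFC 2385 pseudo-header (saddr, daddr, protocol, length) and the
 * TCP header with its checksum field zeroed into the MD5 hash state held in
 * the signing pool.
 */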
1581static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1582 __be32 daddr, __be32 saddr,
1583 const struct tcphdr *th, int nbytes)
1584{
1585 struct tcp4_pseudohdr *bp;
1586 struct scatterlist sg;
1587 struct tcphdr *_th;
1588
1589 bp = hp->scratch;
1590 bp->saddr = saddr;
1591 bp->daddr = daddr;
1592 bp->pad = 0;
1593 bp->protocol = IPPROTO_TCP;
1594 bp->len = cpu_to_be16(nbytes);
1595
1596 _th = (struct tcphdr *)(bp + 1);
1597 memcpy(to: _th, from: th, len: sizeof(*th));
1598 _th->check = 0;
1599
1600 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1601 ahash_request_set_crypt(req: hp->req, src: &sg, NULL,
1602 nbytes: sizeof(*bp) + sizeof(*th));
1603 return crypto_ahash_update(req: hp->req);
1604}
1605
1606static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1607 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1608{
1609 struct tcp_sigpool hp;
1610
1611 if (tcp_sigpool_start(id: tcp_md5_sigpool_id, c: &hp))
1612 goto clear_hash_nostart;
1613
1614 if (crypto_ahash_init(req: hp.req))
1615 goto clear_hash;
1616 if (tcp_v4_md5_hash_headers(hp: &hp, daddr, saddr, th, nbytes: th->doff << 2))
1617 goto clear_hash;
1618 if (tcp_md5_hash_key(hp: &hp, key))
1619 goto clear_hash;
1620 ahash_request_set_crypt(req: hp.req, NULL, result: md5_hash, nbytes: 0);
1621 if (crypto_ahash_final(req: hp.req))
1622 goto clear_hash;
1623
1624 tcp_sigpool_end(c: &hp);
1625 return 0;
1626
1627clear_hash:
1628 tcp_sigpool_end(c: &hp);
1629clear_hash_nostart:
1630 memset(s: md5_hash, c: 0, n: 16);
1631 return 1;
1632}
1633
1634int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1635 const struct sock *sk,
1636 const struct sk_buff *skb)
1637{
1638 const struct tcphdr *th = tcp_hdr(skb);
1639 struct tcp_sigpool hp;
1640 __be32 saddr, daddr;
1641
1642 if (sk) { /* valid for establish/request sockets */
1643 saddr = sk->sk_rcv_saddr;
1644 daddr = sk->sk_daddr;
1645 } else {
1646 const struct iphdr *iph = ip_hdr(skb);
1647 saddr = iph->saddr;
1648 daddr = iph->daddr;
1649 }
1650
1651 if (tcp_sigpool_start(id: tcp_md5_sigpool_id, c: &hp))
1652 goto clear_hash_nostart;
1653
1654 if (crypto_ahash_init(req: hp.req))
1655 goto clear_hash;
1656
1657 if (tcp_v4_md5_hash_headers(hp: &hp, daddr, saddr, th, nbytes: skb->len))
1658 goto clear_hash;
1659 if (tcp_sigpool_hash_skb_data(hp: &hp, skb, header_len: th->doff << 2))
1660 goto clear_hash;
1661 if (tcp_md5_hash_key(hp: &hp, key))
1662 goto clear_hash;
1663 ahash_request_set_crypt(req: hp.req, NULL, result: md5_hash, nbytes: 0);
1664 if (crypto_ahash_final(req: hp.req))
1665 goto clear_hash;
1666
1667 tcp_sigpool_end(c: &hp);
1668 return 0;
1669
1670clear_hash:
1671 tcp_sigpool_end(c: &hp);
1672clear_hash_nostart:
1673 memset(s: md5_hash, c: 0, n: 16);
1674 return 1;
1675}
1676EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1677
1678#endif
1679
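/* Fill in the IPv4-specific parts of a new request sock from the incoming
 * SYN: the address pair and any IP options that must be echoed back on the
 * SYN-ACK.
 */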
1680static void tcp_v4_init_req(struct request_sock *req,
1681 const struct sock *sk_listener,
1682 struct sk_buff *skb)
1683{
1684 struct inet_request_sock *ireq = inet_rsk(sk: req);
1685 struct net *net = sock_net(sk: sk_listener);
1686
1687 sk_rcv_saddr_set(sk: req_to_sk(req), addr: ip_hdr(skb)->daddr);
1688 sk_daddr_set(sk: req_to_sk(req), addr: ip_hdr(skb)->saddr);
1689 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1690}
1691
1692static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1693 struct sk_buff *skb,
1694 struct flowi *fl,
1695 struct request_sock *req,
1696 u32 tw_isn)
1697{
1698 tcp_v4_init_req(req, sk_listener: sk, skb);
1699
1700 if (security_inet_conn_request(sk, skb, req))
1701 return NULL;
1702
1703 return inet_csk_route_req(sk, fl4: &fl->u.ip4, req);
1704}
1705
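/* The two ops tables below glue the protocol-independent request-sock code
 * in tcp_input.c / tcp_output.c to the IPv4 specifics above: how to route,
 * sign, ack, reset and answer a connection request.
 */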
1706struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1707 .family = PF_INET,
1708 .obj_size = sizeof(struct tcp_request_sock),
1709 .send_ack = tcp_v4_reqsk_send_ack,
1710 .destructor = tcp_v4_reqsk_destructor,
1711 .send_reset = tcp_v4_send_reset,
1712 .syn_ack_timeout = tcp_syn_ack_timeout,
1713};
1714
1715const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1716 .mss_clamp = TCP_MSS_DEFAULT,
1717#ifdef CONFIG_TCP_MD5SIG
1718 .req_md5_lookup = tcp_v4_md5_lookup,
1719 .calc_md5_hash = tcp_v4_md5_hash_skb,
1720#endif
1721#ifdef CONFIG_TCP_AO
1722 .ao_lookup = tcp_v4_ao_lookup_rsk,
1723 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1724 .ao_synack_hash = tcp_v4_ao_synack_hash,
1725#endif
1726#ifdef CONFIG_SYN_COOKIES
1727 .cookie_init_seq = cookie_v4_init_sequence,
1728#endif
1729 .route_req = tcp_v4_route_req,
1730 .init_seq = tcp_v4_init_seq,
1731 .init_ts_off = tcp_v4_init_ts_off,
1732 .send_synack = tcp_v4_send_synack,
1733};
1734
1735int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1736{
	/* Never answer SYNs sent to broadcast or multicast addresses */
1738 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1739 goto drop;
1740
1741 return tcp_conn_request(rsk_ops: &tcp_request_sock_ops,
1742 af_ops: &tcp_request_sock_ipv4_ops, sk, skb);
1743
1744drop:
1745 tcp_listendrop(sk);
1746 return 0;
1747}
1748EXPORT_IPV6_MOD(tcp_v4_conn_request);
1749
1750
1751/*
 * The three-way handshake has completed and the connection request has
 * been validated - now create the new socket.
1754 */
1755struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1756 struct request_sock *req,
1757 struct dst_entry *dst,
1758 struct request_sock *req_unhash,
1759 bool *own_req)
1760{
1761 struct inet_request_sock *ireq;
1762 bool found_dup_sk = false;
1763 struct inet_sock *newinet;
1764 struct tcp_sock *newtp;
1765 struct sock *newsk;
1766#ifdef CONFIG_TCP_MD5SIG
1767 const union tcp_md5_addr *addr;
1768 struct tcp_md5sig_key *key;
1769 int l3index;
1770#endif
1771 struct ip_options_rcu *inet_opt;
1772
1773 if (sk_acceptq_is_full(sk))
1774 goto exit_overflow;
1775
1776 newsk = tcp_create_openreq_child(sk, req, skb);
1777 if (!newsk)
1778 goto exit_nonewsk;
1779
1780 newsk->sk_gso_type = SKB_GSO_TCPV4;
1781 inet_sk_rx_dst_set(sk: newsk, skb);
1782
1783 newtp = tcp_sk(newsk);
1784 newinet = inet_sk(newsk);
1785 ireq = inet_rsk(sk: req);
1786 inet_opt = rcu_dereference(ireq->ireq_opt);
1787 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1788 newinet->mc_index = inet_iif(skb);
1789 newinet->mc_ttl = ip_hdr(skb)->ttl;
1790 newinet->rcv_tos = ip_hdr(skb)->tos;
1791 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1792 if (inet_opt)
1793 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1794 atomic_set(v: &newinet->inet_id, i: get_random_u16());
1795
1796 /* Set ToS of the new socket based upon the value of incoming SYN.
1797 * ECT bits are set later in tcp_init_transfer().
1798 */
1799 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1800 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1801
1802 if (!dst) {
1803 dst = inet_csk_route_child_sock(sk, newsk, req);
1804 if (!dst)
1805 goto put_and_exit;
1806 } else {
		/* syncookie case: see end of cookie_v4_check() */
1808 }
1809 sk_setup_caps(sk: newsk, dst);
1810
1811 tcp_ca_openreq_child(sk: newsk, dst);
1812
1813 tcp_sync_mss(sk: newsk, pmtu: dst_mtu(dst));
1814 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), mss: dst_metric_advmss(dst));
1815
1816 tcp_initialize_rcv_mss(sk: newsk);
1817
1818#ifdef CONFIG_TCP_MD5SIG
1819 l3index = l3mdev_master_ifindex_by_index(net: sock_net(sk), ifindex: ireq->ir_iif);
1820 /* Copy over the MD5 key from the original socket */
1821 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1822 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1823 if (key && !tcp_rsk_used_ao(req)) {
1824 if (tcp_md5_key_copy(sk: newsk, addr, AF_INET, prefixlen: 32, l3index, key))
1825 goto put_and_exit;
1826 sk_gso_disable(sk: newsk);
1827 }
1828#endif
1829#ifdef CONFIG_TCP_AO
1830 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1831 goto put_and_exit; /* OOM, release back memory */
1832#endif
1833
1834 if (__inet_inherit_port(sk, child: newsk) < 0)
1835 goto put_and_exit;
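	/* Hash the child into the established table.  *own_req tells the
	 * caller whether we won the race against a concurrent insert of
	 * the same four-tuple, which can happen with syncookies.
	 */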
1836 *own_req = inet_ehash_nolisten(sk: newsk, osk: req_to_sk(req: req_unhash),
1837 found_dup_sk: &found_dup_sk);
1838 if (likely(*own_req)) {
1839 tcp_move_syn(tp: newtp, req);
1840 ireq->ireq_opt = NULL;
1841 } else {
1842 newinet->inet_opt = NULL;
1843
1844 if (!req_unhash && found_dup_sk) {
1845			/* This code path should only be executed in the
1846			 * syncookie case
1847 */
1848 bh_unlock_sock(newsk);
1849 sock_put(sk: newsk);
1850 newsk = NULL;
1851 }
1852 }
1853 return newsk;
1854
1855exit_overflow:
1856 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1857exit_nonewsk:
1858 dst_release(dst);
1859exit:
1860 tcp_listendrop(sk);
1861 return NULL;
1862put_and_exit:
1863 newinet->inet_opt = NULL;
1864 inet_csk_prepare_forced_close(sk: newsk);
1865 tcp_done(sk: newsk);
1866 goto exit;
1867}
1868EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1869
1870static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1871{
1872#ifdef CONFIG_SYN_COOKIES
1873 const struct tcphdr *th = tcp_hdr(skb);
1874
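	/* A non-SYN segment reaching a listener may be the ACK that
	 * completes a syncookie handshake; cookie_v4_check() validates the
	 * cookie and, if it is good, creates the child socket directly.
	 */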
1875 if (!th->syn)
1876 sk = cookie_v4_check(sk, skb);
1877#endif
1878 return sk;
1879}
1880
1881u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1882 struct tcphdr *th, u32 *cookie)
1883{
1884 u16 mss = 0;
1885#ifdef CONFIG_SYN_COOKIES
1886 mss = tcp_get_syncookie_mss(rsk_ops: &tcp_request_sock_ops,
1887 af_ops: &tcp_request_sock_ipv4_ops, sk, th);
1888 if (mss) {
1889 *cookie = __cookie_v4_init_sequence(iph, th, mssp: &mss);
1890 tcp_synq_overflow(sk);
1891 }
1892#endif
1893 return mss;
1894}
1895
1896INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1897 u32));
1898/* The socket must have its spinlock held when we get
1899 * here, unless it is a TCP_LISTEN socket.
1900 *
1901 * We have a potential double-lock case here, so even when
1902 * doing backlog processing we use the BH locking scheme.
1903 * This is because we cannot sleep with the original spinlock
1904 * held.
1905 */
1906int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1907{
1908 enum skb_drop_reason reason;
1909 struct sock *rsk;
1910
1911 reason = psp_sk_rx_policy_check(sk, skb);
1912 if (reason)
1913 goto err_discard;
1914
1915 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1916 struct dst_entry *dst;
1917
1918 dst = rcu_dereference_protected(sk->sk_rx_dst,
1919 lockdep_sock_is_held(sk));
1920
1921 sock_rps_save_rxhash(sk, skb);
1922 sk_mark_napi_id(sk, skb);
1923 if (dst) {
1924 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1925 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1926 dst, 0)) {
1927 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1928 dst_release(dst);
1929 }
1930 }
1931 tcp_rcv_established(sk, skb);
1932 return 0;
1933 }
1934
1935 if (tcp_checksum_complete(skb))
1936 goto csum_err;
1937
1938 if (sk->sk_state == TCP_LISTEN) {
1939 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1940
1941 if (!nsk)
1942 return 0;
1943 if (nsk != sk) {
1944 reason = tcp_child_process(parent: sk, child: nsk, skb);
1945 if (reason) {
1946 rsk = nsk;
1947 goto reset;
1948 }
1949 return 0;
1950 }
1951 } else
1952 sock_rps_save_rxhash(sk, skb);
1953
1954 reason = tcp_rcv_state_process(sk, skb);
1955 if (reason) {
1956 rsk = sk;
1957 goto reset;
1958 }
1959 return 0;
1960
1961reset:
1962 tcp_v4_send_reset(sk: rsk, skb, reason: sk_rst_convert_drop_reason(reason));
1963discard:
1964 sk_skb_reason_drop(sk, skb, reason);
1965 /* Be careful here. If this function gets more complicated and
1966 * gcc suffers from register pressure on the x86, sk (in %ebx)
1967 * might be destroyed here. This current version compiles correctly,
1968 * but you have been warned.
1969 */
1970 return 0;
1971
1972csum_err:
1973 reason = SKB_DROP_REASON_TCP_CSUM;
1974 trace_tcp_bad_csum(skb);
1975 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1976err_discard:
1977 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1978 goto discard;
1979}
1980EXPORT_SYMBOL(tcp_v4_do_rcv);
1981
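/* Early demux: look up an established socket before the routing decision,
 * so the socket and its cached rx dst can be attached to the skb and the
 * full lookup in tcp_v4_rcv() is avoided.
 */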
1982int tcp_v4_early_demux(struct sk_buff *skb)
1983{
1984 struct net *net = dev_net_rcu(dev: skb->dev);
1985 const struct iphdr *iph;
1986 const struct tcphdr *th;
1987 struct sock *sk;
1988
1989 if (skb->pkt_type != PACKET_HOST)
1990 return 0;
1991
1992 if (!pskb_may_pull(skb, len: skb_transport_offset(skb) + sizeof(struct tcphdr)))
1993 return 0;
1994
1995 iph = ip_hdr(skb);
1996 th = tcp_hdr(skb);
1997
1998 if (th->doff < sizeof(struct tcphdr) / 4)
1999 return 0;
2000
2001 sk = __inet_lookup_established(net, saddr: iph->saddr, sport: th->source,
2002 daddr: iph->daddr, ntohs(th->dest),
2003 dif: skb->skb_iif, sdif: inet_sdif(skb));
2004 if (sk) {
2005 skb->sk = sk;
2006 skb->destructor = sock_edemux;
2007 if (sk_fullsock(sk)) {
2008 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2009
2010 if (dst)
2011 dst = dst_check(dst, cookie: 0);
2012 if (dst &&
2013 sk->sk_rx_dst_ifindex == skb->skb_iif)
2014 skb_dst_set_noref(skb, dst);
2015 }
2016 }
2017 return 0;
2018}
2019
2020bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2021 enum skb_drop_reason *reason)
2022{
2023 u32 tail_gso_size, tail_gso_segs;
2024 struct skb_shared_info *shinfo;
2025 const struct tcphdr *th;
2026 struct tcphdr *thtail;
2027 struct sk_buff *tail;
2028 unsigned int hdrlen;
2029 bool fragstolen;
2030 u32 gso_segs;
2031 u32 gso_size;
2032 u64 limit;
2033 int delta;
2034 int err;
2035
2036 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2037 * we can fix skb->truesize to its real value to avoid future drops.
2038 * This is valid because skb is not yet charged to the socket.
2039	 * It has been noticed that pure SACK packets were sometimes dropped
2040	 * (if cooked by drivers without the copybreak feature).
2041 */
2042 skb_condense(skb);
2043
2044 tcp_cleanup_skb(skb);
2045
2046 if (unlikely(tcp_checksum_complete(skb))) {
2047 bh_unlock_sock(sk);
2048 trace_tcp_bad_csum(skb);
2049 *reason = SKB_DROP_REASON_TCP_CSUM;
2050 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2051 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2052 return true;
2053 }
2054
2055 /* Attempt coalescing to last skb in backlog, even if we are
2056 * above the limits.
2057 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2058 */
2059 th = (const struct tcphdr *)skb->data;
2060 hdrlen = th->doff * 4;
2061
2062 tail = sk->sk_backlog.tail;
2063 if (!tail)
2064 goto no_coalesce;
2065 thtail = (struct tcphdr *)tail->data;
2066
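	/* Coalesce only strictly contiguous segments that agree on the
	 * DSCP/ECN bits, carry no SYN/RST/URG, both have ACK set, match on
	 * the ECN-related flags and have identical TCP options; anything
	 * else is queued to the backlog as-is.
	 */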
2067 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2068 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2069 ((TCP_SKB_CB(tail)->tcp_flags |
2070 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2071 !((TCP_SKB_CB(tail)->tcp_flags &
2072 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2073 ((TCP_SKB_CB(tail)->tcp_flags ^
2074 TCP_SKB_CB(skb)->tcp_flags) &
2075 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2076 !tcp_skb_can_collapse_rx(to: tail, from: skb) ||
2077 thtail->doff != th->doff ||
2078 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
2079 /* prior to PSP Rx policy check, retain exact PSP metadata */
2080 psp_skb_coalesce_diff(one: tail, two: skb))
2081 goto no_coalesce;
2082
2083 __skb_pull(skb, len: hdrlen);
2084
2085 shinfo = skb_shinfo(skb);
2086 gso_size = shinfo->gso_size ?: skb->len;
2087 gso_segs = shinfo->gso_segs ?: 1;
2088
2089 shinfo = skb_shinfo(tail);
2090 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2091 tail_gso_segs = shinfo->gso_segs ?: 1;
2092
2093 if (skb_try_coalesce(to: tail, from: skb, fragstolen: &fragstolen, delta_truesize: &delta)) {
2094 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2095
2096 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2097 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2098 thtail->window = th->window;
2099 }
2100
2101 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2102 * thtail->fin, so that the fast path in tcp_rcv_established()
2103 * is not entered if we append a packet with a FIN.
2104 * SYN, RST, URG are not present.
2105 * ACK is set on both packets.
2106		 * PSH : the TCP stack does not really care about it,
2107		 * at least for 'GRO' packets.
2108 */
2109 thtail->fin |= th->fin;
2110 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2111
2112 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2113 TCP_SKB_CB(tail)->has_rxtstamp = true;
2114 tail->tstamp = skb->tstamp;
2115 skb_hwtstamps(skb: tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2116 }
2117
2118		/* Not as strict as GRO. We only need to carry the max mss value */
2119 shinfo->gso_size = max(gso_size, tail_gso_size);
2120 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2121
2122 sk->sk_backlog.len += delta;
2123 __NET_INC_STATS(sock_net(sk),
2124 LINUX_MIB_TCPBACKLOGCOALESCE);
2125 kfree_skb_partial(skb, head_stolen: fragstolen);
2126 return false;
2127 }
2128 __skb_push(skb, len: hdrlen);
2129
2130no_coalesce:
2131 /* sk->sk_backlog.len is reset only at the end of __release_sock().
2132 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2133 * sk_rcvbuf in normal conditions.
2134 */
2135 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2136
2137 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2138
2139	/* Only the socket owner can try to collapse/prune rx queues
2140	 * to reduce memory overhead, so add a little headroom here.
2141	 * Only a few socket backlogs are likely to be non-empty at the same time.
2142 */
2143 limit += 64 * 1024;
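	/* The overall budget is thus 2 * rcvbuf + sndbuf / 2 + 64KB; for
	 * example, a 128KB rcvbuf and a 16KB sndbuf yield roughly 328KB.
	 */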
2144
2145 limit = min_t(u64, limit, UINT_MAX);
2146
2147 err = sk_add_backlog(sk, skb, limit);
2148 if (unlikely(err)) {
2149 bh_unlock_sock(sk);
2150 if (err == -ENOMEM) {
2151 *reason = SKB_DROP_REASON_PFMEMALLOC;
2152 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2153 } else {
2154 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2155 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2156 }
2157 return true;
2158 }
2159 return false;
2160}
2161EXPORT_IPV6_MOD(tcp_add_backlog);
2162
2163int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
2164{
2165 struct tcphdr *th = (struct tcphdr *)skb->data;
2166
2167 return sk_filter_trim_cap(sk, skb, cap: th->doff * 4, reason);
2168}
2169EXPORT_IPV6_MOD(tcp_filter);
2170
2171static void tcp_v4_restore_cb(struct sk_buff *skb)
2172{
2173 memmove(IPCB(skb), src: &TCP_SKB_CB(skb)->header.h4,
2174 count: sizeof(struct inet_skb_parm));
2175}
2176
2177static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2178 const struct tcphdr *th)
2179{
2180	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2181	 * barrier() makes sure the compiler won't play aliasing games.
2182 */
2183 memmove(dest: &TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2184 count: sizeof(struct inet_skb_parm));
2185 barrier();
2186
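	/* SYN and FIN each consume one unit of sequence space, hence the
	 * th->syn + th->fin terms in the end_seq computation below.
	 */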
2187 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2188 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2189 skb->len - th->doff * 4);
2190 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2191 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2192 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2193 TCP_SKB_CB(skb)->sacked = 0;
2194 TCP_SKB_CB(skb)->has_rxtstamp =
2195 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2196}
2197
2198/*
2199 * From tcp_input.c
2200 */
2201
2202int tcp_v4_rcv(struct sk_buff *skb)
2203{
2204 struct net *net = dev_net_rcu(dev: skb->dev);
2205 enum skb_drop_reason drop_reason;
2206 enum tcp_tw_status tw_status;
2207 int sdif = inet_sdif(skb);
2208 int dif = inet_iif(skb);
2209 const struct iphdr *iph;
2210 const struct tcphdr *th;
2211 struct sock *sk = NULL;
2212 bool refcounted;
2213 int ret;
2214 u32 isn;
2215
2216 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2217 if (skb->pkt_type != PACKET_HOST)
2218 goto discard_it;
2219
2220 /* Count it even if it's bad */
2221 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2222
2223 if (!pskb_may_pull(skb, len: sizeof(struct tcphdr)))
2224 goto discard_it;
2225
2226 th = (const struct tcphdr *)skb->data;
2227
2228 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2229 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2230 goto bad_packet;
2231 }
2232 if (!pskb_may_pull(skb, len: th->doff * 4))
2233 goto discard_it;
2234
2235 /* An explanation is required here, I think.
2236 * Packet length and doff are validated by header prediction,
2237	 * provided the th->doff==0 case is eliminated.
2238 * So, we defer the checks. */
2239
2240 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2241 goto csum_error;
2242
2243 th = (const struct tcphdr *)skb->data;
2244 iph = ip_hdr(skb);
2245lookup:
2246 sk = __inet_lookup_skb(skb, doff: __tcp_hdrlen(th), sport: th->source,
2247 dport: th->dest, sdif, refcounted: &refcounted);
2248 if (!sk)
2249 goto no_tcp_socket;
2250
2251 if (sk->sk_state == TCP_TIME_WAIT)
2252 goto do_time_wait;
2253
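	/* A TCP_NEW_SYN_RECV entry is a request_sock: the segment is
	 * processed on behalf of its listener, and tcp_check_req() may
	 * complete the handshake and create the full child socket.
	 */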
2254 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2255 struct request_sock *req = inet_reqsk(sk);
2256 bool req_stolen = false;
2257 struct sock *nsk;
2258
2259 sk = req->rsk_listener;
2260 if (!xfrm4_policy_check(sk, dir: XFRM_POLICY_IN, skb))
2261 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2262 else
2263 drop_reason = tcp_inbound_hash(sk, req, skb,
2264 saddr: &iph->saddr, daddr: &iph->daddr,
2265 AF_INET, dif, sdif);
2266 if (unlikely(drop_reason)) {
2267 sk_drops_skbadd(sk, skb);
2268 reqsk_put(req);
2269 goto discard_it;
2270 }
2271 if (tcp_checksum_complete(skb)) {
2272 reqsk_put(req);
2273 goto csum_error;
2274 }
2275 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2276 nsk = reuseport_migrate_sock(sk, migrating_sk: req_to_sk(req), skb);
2277 if (!nsk) {
2278 inet_csk_reqsk_queue_drop_and_put(sk, req);
2279 goto lookup;
2280 }
2281 sk = nsk;
2282 /* reuseport_migrate_sock() has already held one sk_refcnt
2283 * before returning.
2284 */
2285 } else {
2286 /* We own a reference on the listener, increase it again
2287 * as we might lose it too soon.
2288 */
2289 sock_hold(sk);
2290 }
2291 refcounted = true;
2292 nsk = NULL;
2293 if (!tcp_filter(sk, skb, reason: &drop_reason)) {
2294 th = (const struct tcphdr *)skb->data;
2295 iph = ip_hdr(skb);
2296 tcp_v4_fill_cb(skb, iph, th);
2297 nsk = tcp_check_req(sk, skb, req, fastopen: false, lost_race: &req_stolen,
2298 drop_reason: &drop_reason);
2299 }
2300 if (!nsk) {
2301 reqsk_put(req);
2302 if (req_stolen) {
2303 /* Another cpu got exclusive access to req
2304 * and created a full blown socket.
2305 * Try to feed this packet to this socket
2306 * instead of discarding it.
2307 */
2308 tcp_v4_restore_cb(skb);
2309 sock_put(sk);
2310 goto lookup;
2311 }
2312 goto discard_and_relse;
2313 }
2314 nf_reset_ct(skb);
2315 if (nsk == sk) {
2316 reqsk_put(req);
2317 tcp_v4_restore_cb(skb);
2318 } else {
2319 drop_reason = tcp_child_process(parent: sk, child: nsk, skb);
2320 if (drop_reason) {
2321 enum sk_rst_reason rst_reason;
2322
2323 rst_reason = sk_rst_convert_drop_reason(reason: drop_reason);
2324 tcp_v4_send_reset(sk: nsk, skb, reason: rst_reason);
2325 goto discard_and_relse;
2326 }
2327 sock_put(sk);
2328 return 0;
2329 }
2330 }
2331
2332process:
2333 if (static_branch_unlikely(&ip4_min_ttl)) {
2334 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2335 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2336 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2337 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2338 goto discard_and_relse;
2339 }
2340 }
2341
2342 if (!xfrm4_policy_check(sk, dir: XFRM_POLICY_IN, skb)) {
2343 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2344 goto discard_and_relse;
2345 }
2346
2347 drop_reason = tcp_inbound_hash(sk, NULL, skb, saddr: &iph->saddr, daddr: &iph->daddr,
2348 AF_INET, dif, sdif);
2349 if (drop_reason)
2350 goto discard_and_relse;
2351
2352 nf_reset_ct(skb);
2353
2354 if (tcp_filter(sk, skb, reason: &drop_reason))
2355 goto discard_and_relse;
2356
2357 th = (const struct tcphdr *)skb->data;
2358 iph = ip_hdr(skb);
2359 tcp_v4_fill_cb(skb, iph, th);
2360
2361 skb->dev = NULL;
2362
2363 if (sk->sk_state == TCP_LISTEN) {
2364 ret = tcp_v4_do_rcv(sk, skb);
2365 goto put_and_return;
2366 }
2367
2368 sk_incoming_cpu_update(sk);
2369
2370 bh_lock_sock_nested(sk);
2371 tcp_segs_in(tcp_sk(sk), skb);
2372 ret = 0;
2373 if (!sock_owned_by_user(sk)) {
2374 ret = tcp_v4_do_rcv(sk, skb);
2375 } else {
2376 if (tcp_add_backlog(sk, skb, reason: &drop_reason))
2377 goto discard_and_relse;
2378 }
2379 bh_unlock_sock(sk);
2380
2381put_and_return:
2382 if (refcounted)
2383 sock_put(sk);
2384
2385 return ret;
2386
2387no_tcp_socket:
2388 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2389 if (!xfrm4_policy_check(NULL, dir: XFRM_POLICY_IN, skb))
2390 goto discard_it;
2391
2392 tcp_v4_fill_cb(skb, iph, th);
2393
2394 if (tcp_checksum_complete(skb)) {
2395csum_error:
2396 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2397 trace_tcp_bad_csum(skb);
2398 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2399bad_packet:
2400 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2401 } else {
2402 tcp_v4_send_reset(NULL, skb, reason: sk_rst_convert_drop_reason(reason: drop_reason));
2403 }
2404
2405discard_it:
2406 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2407 /* Discard frame. */
2408 sk_skb_reason_drop(sk, skb, reason: drop_reason);
2409 return 0;
2410
2411discard_and_relse:
2412 sk_drops_skbadd(sk, skb);
2413 if (refcounted)
2414 sock_put(sk);
2415 goto discard_it;
2416
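/* TIME_WAIT processing: depending on what tcp_timewait_state_process()
 * decides, the segment may revive the connection via a new SYN, trigger
 * an ACK or RST, or simply be dropped.
 */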
2417do_time_wait:
2418 if (!xfrm4_policy_check(NULL, dir: XFRM_POLICY_IN, skb)) {
2419 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2420 inet_twsk_put(tw: inet_twsk(sk));
2421 goto discard_it;
2422 }
2423
2424 tcp_v4_fill_cb(skb, iph, th);
2425
2426 if (tcp_checksum_complete(skb)) {
2427 inet_twsk_put(tw: inet_twsk(sk));
2428 goto csum_error;
2429 }
2430
2431 tw_status = tcp_timewait_state_process(tw: inet_twsk(sk), skb, th, tw_isn: &isn,
2432 drop_reason: &drop_reason);
2433 switch (tw_status) {
2434 case TCP_TW_SYN: {
2435 struct sock *sk2 = inet_lookup_listener(net, skb, doff: __tcp_hdrlen(th),
2436 saddr: iph->saddr, sport: th->source,
2437 daddr: iph->daddr, dport: th->dest,
2438 dif: inet_iif(skb),
2439 sdif);
2440 if (sk2) {
2441 inet_twsk_deschedule_put(tw: inet_twsk(sk));
2442 sk = sk2;
2443 tcp_v4_restore_cb(skb);
2444 refcounted = false;
2445 __this_cpu_write(tcp_tw_isn, isn);
2446 goto process;
2447 }
2448
2449 drop_reason = psp_twsk_rx_policy_check(tw: inet_twsk(sk), skb);
2450 if (drop_reason)
2451 break;
2452 }
2453 /* to ACK */
2454 fallthrough;
2455 case TCP_TW_ACK:
2456 case TCP_TW_ACK_OOW:
2457 tcp_v4_timewait_ack(sk, skb, tw_status);
2458 break;
2459 case TCP_TW_RST:
2460 tcp_v4_send_reset(sk, skb, reason: SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2461 inet_twsk_deschedule_put(tw: inet_twsk(sk));
2462 goto discard_it;
2463 case TCP_TW_SUCCESS:;
2464 }
2465 goto discard_it;
2466}
2467
2468static struct timewait_sock_ops tcp_timewait_sock_ops = {
2469 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2470};
2471
2472void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2473{
2474 struct dst_entry *dst = skb_dst(skb);
2475
2476 if (dst && dst_hold_safe(dst)) {
2477 rcu_assign_pointer(sk->sk_rx_dst, dst);
2478 sk->sk_rx_dst_ifindex = skb->skb_iif;
2479 }
2480}
2481EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2482
2483const struct inet_connection_sock_af_ops ipv4_specific = {
2484 .queue_xmit = ip_queue_xmit,
2485 .send_check = tcp_v4_send_check,
2486 .rebuild_header = inet_sk_rebuild_header,
2487 .sk_rx_dst_set = inet_sk_rx_dst_set,
2488 .conn_request = tcp_v4_conn_request,
2489 .syn_recv_sock = tcp_v4_syn_recv_sock,
2490 .net_header_len = sizeof(struct iphdr),
2491 .setsockopt = ip_setsockopt,
2492 .getsockopt = ip_getsockopt,
2493 .mtu_reduced = tcp_v4_mtu_reduced,
2494};
2495EXPORT_IPV6_MOD(ipv4_specific);
2496
2497#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2498static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2499#ifdef CONFIG_TCP_MD5SIG
2500 .md5_lookup = tcp_v4_md5_lookup,
2501 .calc_md5_hash = tcp_v4_md5_hash_skb,
2502 .md5_parse = tcp_v4_parse_md5_keys,
2503#endif
2504#ifdef CONFIG_TCP_AO
2505 .ao_lookup = tcp_v4_ao_lookup,
2506 .calc_ao_hash = tcp_v4_ao_hash_skb,
2507 .ao_parse = tcp_v4_parse_ao,
2508 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2509#endif
2510};
2511
2512static void tcp4_destruct_sock(struct sock *sk)
2513{
2514 tcp_md5_destruct_sock(sk);
2515 tcp_ao_destroy_sock(sk, twsk: false);
2516 inet_sock_destruct(sk);
2517}
2518#endif
2519
2520/* NOTE: A lot of things are set to zero explicitly by the call to
2521 * sk_alloc(), so they need not be done here.
2522 */
2523static int tcp_v4_init_sock(struct sock *sk)
2524{
2525 struct inet_connection_sock *icsk = inet_csk(sk);
2526
2527 tcp_init_sock(sk);
2528
2529 icsk->icsk_af_ops = &ipv4_specific;
2530
2531#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2532 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2533 sk->sk_destruct = tcp4_destruct_sock;
2534#endif
2535
2536 return 0;
2537}
2538
2539static void tcp_release_user_frags(struct sock *sk)
2540{
2541#ifdef CONFIG_PAGE_POOL
2542 unsigned long index;
2543 void *netmem;
2544
2545 xa_for_each(&sk->sk_user_frags, index, netmem)
2546 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2547#endif
2548}
2549
2550void tcp_v4_destroy_sock(struct sock *sk)
2551{
2552 struct tcp_sock *tp = tcp_sk(sk);
2553
2554 tcp_release_user_frags(sk);
2555
2556 xa_destroy(&sk->sk_user_frags);
2557
2558 trace_tcp_destroy_sock(sk);
2559
2560 tcp_clear_xmit_timers(sk);
2561
2562 tcp_cleanup_congestion_control(sk);
2563
2564 tcp_cleanup_ulp(sk);
2565
2566	/* Clean up the write buffer. */
2567 tcp_write_queue_purge(sk);
2568
2569 /* Check if we want to disable active TFO */
2570 tcp_fastopen_active_disable_ofo_check(sk);
2571
2572 /* Cleans up our, hopefully empty, out_of_order_queue. */
2573 skb_rbtree_purge(root: &tp->out_of_order_queue);
2574
2575 /* Clean up a referenced TCP bind bucket. */
2576 if (inet_csk(sk)->icsk_bind_hash)
2577 inet_put_port(sk);
2578
2579 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2580
2581 /* If socket is aborted during connect operation */
2582 tcp_free_fastopen_req(tp);
2583 tcp_fastopen_destroy_cipher(sk);
2584 tcp_saved_syn_free(tp);
2585
2586 sk_sockets_allocated_dec(sk);
2587}
2588EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2589
2590#ifdef CONFIG_PROC_FS
2591/* Proc filesystem TCP sock list dumping. */
2592
2593static unsigned short seq_file_family(const struct seq_file *seq);
2594
2595static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2596{
2597 unsigned short family = seq_file_family(seq);
2598
2599 /* AF_UNSPEC is used as a match all */
2600 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2601 net_eq(net1: sock_net(sk), net2: seq_file_net(seq)));
2602}
2603
2604/* Find a non-empty bucket (starting from st->bucket)
2605 * and return the first sk from it.
2606 */
2607static void *listening_get_first(struct seq_file *seq)
2608{
2609 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2610 struct tcp_iter_state *st = seq->private;
2611
2612 st->offset = 0;
2613 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2614 struct inet_listen_hashbucket *ilb2;
2615 struct hlist_nulls_node *node;
2616 struct sock *sk;
2617
2618 ilb2 = &hinfo->lhash2[st->bucket];
2619 if (hlist_nulls_empty(h: &ilb2->nulls_head))
2620 continue;
2621
2622 spin_lock(lock: &ilb2->lock);
2623 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2624 if (seq_sk_match(seq, sk))
2625 return sk;
2626 }
2627 spin_unlock(lock: &ilb2->lock);
2628 }
2629
2630 return NULL;
2631}
2632
2633/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2634 * If "cur" is the last one in the st->bucket,
2635 * call listening_get_first() to return the first sk of the next
2636 * non empty bucket.
2637 * non-empty bucket.
2638static void *listening_get_next(struct seq_file *seq, void *cur)
2639{
2640 struct tcp_iter_state *st = seq->private;
2641 struct inet_listen_hashbucket *ilb2;
2642 struct hlist_nulls_node *node;
2643 struct inet_hashinfo *hinfo;
2644 struct sock *sk = cur;
2645
2646 ++st->num;
2647 ++st->offset;
2648
2649 sk = sk_nulls_next(sk);
2650 sk_nulls_for_each_from(sk, node) {
2651 if (seq_sk_match(seq, sk))
2652 return sk;
2653 }
2654
2655 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2656 ilb2 = &hinfo->lhash2[st->bucket];
2657 spin_unlock(lock: &ilb2->lock);
2658 ++st->bucket;
2659 return listening_get_first(seq);
2660}
2661
2662static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2663{
2664 struct tcp_iter_state *st = seq->private;
2665 void *rc;
2666
2667 st->bucket = 0;
2668 st->offset = 0;
2669 rc = listening_get_first(seq);
2670
2671 while (rc && *pos) {
2672 rc = listening_get_next(seq, cur: rc);
2673 --*pos;
2674 }
2675 return rc;
2676}
2677
2678static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2679 const struct tcp_iter_state *st)
2680{
2681 return hlist_nulls_empty(h: &hinfo->ehash[st->bucket].chain);
2682}
2683
2684/*
2685 * Get first established socket starting from bucket given in st->bucket.
2686 * If st->bucket is zero, the very first socket in the hash is returned.
2687 */
2688static void *established_get_first(struct seq_file *seq)
2689{
2690 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2691 struct tcp_iter_state *st = seq->private;
2692
2693 st->offset = 0;
2694 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2695 struct sock *sk;
2696 struct hlist_nulls_node *node;
2697 spinlock_t *lock = inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket);
2698
2699 cond_resched();
2700
2701 /* Lockless fast path for the common case of empty buckets */
2702 if (empty_bucket(hinfo, st))
2703 continue;
2704
2705 spin_lock_bh(lock);
2706 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2707 if (seq_sk_match(seq, sk))
2708 return sk;
2709 }
2710 spin_unlock_bh(lock);
2711 }
2712
2713 return NULL;
2714}
2715
2716static void *established_get_next(struct seq_file *seq, void *cur)
2717{
2718 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2719 struct tcp_iter_state *st = seq->private;
2720 struct hlist_nulls_node *node;
2721 struct sock *sk = cur;
2722
2723 ++st->num;
2724 ++st->offset;
2725
2726 sk = sk_nulls_next(sk);
2727
2728 sk_nulls_for_each_from(sk, node) {
2729 if (seq_sk_match(seq, sk))
2730 return sk;
2731 }
2732
2733 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
2734 ++st->bucket;
2735 return established_get_first(seq);
2736}
2737
2738static void *established_get_idx(struct seq_file *seq, loff_t pos)
2739{
2740 struct tcp_iter_state *st = seq->private;
2741 void *rc;
2742
2743 st->bucket = 0;
2744 rc = established_get_first(seq);
2745
2746 while (rc && pos) {
2747 rc = established_get_next(seq, cur: rc);
2748 --pos;
2749 }
2750 return rc;
2751}
2752
2753static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2754{
2755 void *rc;
2756 struct tcp_iter_state *st = seq->private;
2757
2758 st->state = TCP_SEQ_STATE_LISTENING;
2759 rc = listening_get_idx(seq, pos: &pos);
2760
2761 if (!rc) {
2762 st->state = TCP_SEQ_STATE_ESTABLISHED;
2763 rc = established_get_idx(seq, pos);
2764 }
2765
2766 return rc;
2767}
2768
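/* Resume iteration at the bucket/offset recorded in st, so a seq_file
 * restart at the same *pos does not rescan the hash tables from the start.
 */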
2769static void *tcp_seek_last_pos(struct seq_file *seq)
2770{
2771 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2772 struct tcp_iter_state *st = seq->private;
2773 int bucket = st->bucket;
2774 int offset = st->offset;
2775 int orig_num = st->num;
2776 void *rc = NULL;
2777
2778 switch (st->state) {
2779 case TCP_SEQ_STATE_LISTENING:
2780 if (st->bucket > hinfo->lhash2_mask)
2781 break;
2782 rc = listening_get_first(seq);
2783 while (offset-- && rc && bucket == st->bucket)
2784 rc = listening_get_next(seq, cur: rc);
2785 if (rc)
2786 break;
2787 st->bucket = 0;
2788 st->state = TCP_SEQ_STATE_ESTABLISHED;
2789 fallthrough;
2790 case TCP_SEQ_STATE_ESTABLISHED:
2791 if (st->bucket > hinfo->ehash_mask)
2792 break;
2793 rc = established_get_first(seq);
2794 while (offset-- && rc && bucket == st->bucket)
2795 rc = established_get_next(seq, cur: rc);
2796 }
2797
2798 st->num = orig_num;
2799
2800 return rc;
2801}
2802
2803void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2804{
2805 struct tcp_iter_state *st = seq->private;
2806 void *rc;
2807
2808 if (*pos && *pos == st->last_pos) {
2809 rc = tcp_seek_last_pos(seq);
2810 if (rc)
2811 goto out;
2812 }
2813
2814 st->state = TCP_SEQ_STATE_LISTENING;
2815 st->num = 0;
2816 st->bucket = 0;
2817 st->offset = 0;
2818 rc = *pos ? tcp_get_idx(seq, pos: *pos - 1) : SEQ_START_TOKEN;
2819
2820out:
2821 st->last_pos = *pos;
2822 return rc;
2823}
2824EXPORT_IPV6_MOD(tcp_seq_start);
2825
2826void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2827{
2828 struct tcp_iter_state *st = seq->private;
2829 void *rc = NULL;
2830
2831 if (v == SEQ_START_TOKEN) {
2832 rc = tcp_get_idx(seq, pos: 0);
2833 goto out;
2834 }
2835
2836 switch (st->state) {
2837 case TCP_SEQ_STATE_LISTENING:
2838 rc = listening_get_next(seq, cur: v);
2839 if (!rc) {
2840 st->state = TCP_SEQ_STATE_ESTABLISHED;
2841 st->bucket = 0;
2842 st->offset = 0;
2843 rc = established_get_first(seq);
2844 }
2845 break;
2846 case TCP_SEQ_STATE_ESTABLISHED:
2847 rc = established_get_next(seq, cur: v);
2848 break;
2849 }
2850out:
2851 ++*pos;
2852 st->last_pos = *pos;
2853 return rc;
2854}
2855EXPORT_IPV6_MOD(tcp_seq_next);
2856
2857void tcp_seq_stop(struct seq_file *seq, void *v)
2858{
2859 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2860 struct tcp_iter_state *st = seq->private;
2861
2862 switch (st->state) {
2863 case TCP_SEQ_STATE_LISTENING:
2864 if (v != SEQ_START_TOKEN)
2865 spin_unlock(lock: &hinfo->lhash2[st->bucket].lock);
2866 break;
2867 case TCP_SEQ_STATE_ESTABLISHED:
2868 if (v)
2869 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
2870 break;
2871 }
2872}
2873EXPORT_IPV6_MOD(tcp_seq_stop);
2874
2875static void get_openreq4(const struct request_sock *req,
2876 struct seq_file *f, int i)
2877{
2878 const struct inet_request_sock *ireq = inet_rsk(sk: req);
2879 long delta = req->rsk_timer.expires - jiffies;
2880
2881 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X"
2882 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2883 i,
2884 ireq->ir_loc_addr,
2885 ireq->ir_num,
2886 ireq->ir_rmt_addr,
2887 ntohs(ireq->ir_rmt_port),
2888 TCP_SYN_RECV,
2889 0, 0, /* could print option size, but that is af dependent. */
2890 1, /* timers active (only the expire timer) */
2891 jiffies_delta_to_clock_t(delta),
2892 req->num_timeout,
2893 from_kuid_munged(to: seq_user_ns(seq: f),
2894 kuid: sk_uid(sk: req->rsk_listener)),
2895 0, /* non standard timer */
2896 0, /* open_requests have no inode */
2897 0,
2898 req);
2899}
2900
2901static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2902{
2903 int timer_active;
2904 unsigned long timer_expires;
2905 const struct tcp_sock *tp = tcp_sk(sk);
2906 const struct inet_connection_sock *icsk = inet_csk(sk);
2907 const struct inet_sock *inet = inet_sk(sk);
2908 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2909 __be32 dest = inet->inet_daddr;
2910 __be32 src = inet->inet_rcv_saddr;
2911 __u16 destp = ntohs(inet->inet_dport);
2912 __u16 srcp = ntohs(inet->inet_sport);
2913 u8 icsk_pending;
2914 int rx_queue;
2915 int state;
2916
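	/* timer_active encodes the pending timer for the "tr" column of
	 * /proc/net/tcp: 1 retransmit/loss probe, 2 keepalive,
	 * 4 zero window probe, 0 none.
	 */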
2917 icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2918 if (icsk_pending == ICSK_TIME_RETRANS ||
2919 icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2920 icsk_pending == ICSK_TIME_LOSS_PROBE) {
2921 timer_active = 1;
2922 timer_expires = icsk_timeout(icsk);
2923 } else if (icsk_pending == ICSK_TIME_PROBE0) {
2924 timer_active = 4;
2925 timer_expires = icsk_timeout(icsk);
2926 } else if (timer_pending(timer: &sk->sk_timer)) {
2927 timer_active = 2;
2928 timer_expires = sk->sk_timer.expires;
2929 } else {
2930 timer_active = 0;
2931 timer_expires = jiffies;
2932 }
2933
2934 state = inet_sk_state_load(sk);
2935 if (state == TCP_LISTEN)
2936 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2937 else
2938 /* Because we don't lock the socket,
2939 * we might find a transient negative value.
2940 */
2941 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2942 READ_ONCE(tp->copied_seq), 0);
2943
2944 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2945 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2946 i, src, srcp, dest, destp, state,
2947 READ_ONCE(tp->write_seq) - tp->snd_una,
2948 rx_queue,
2949 timer_active,
2950 jiffies_delta_to_clock_t(delta: timer_expires - jiffies),
2951 READ_ONCE(icsk->icsk_retransmits),
2952 from_kuid_munged(to: seq_user_ns(seq: f), kuid: sk_uid(sk)),
2953 READ_ONCE(icsk->icsk_probes_out),
2954 sock_i_ino(sk),
2955 refcount_read(r: &sk->sk_refcnt), sk,
2956 jiffies_to_clock_t(x: icsk->icsk_rto),
2957 jiffies_to_clock_t(x: icsk->icsk_ack.ato),
2958 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2959 tcp_snd_cwnd(tp),
2960 state == TCP_LISTEN ?
2961 fastopenq->max_qlen :
2962 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2963}
2964
2965static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2966 struct seq_file *f, int i)
2967{
2968 long delta = tw->tw_timer.expires - jiffies;
2969 __be32 dest, src;
2970 __u16 destp, srcp;
2971
2972 dest = tw->tw_daddr;
2973 src = tw->tw_rcv_saddr;
2974 destp = ntohs(tw->tw_dport);
2975 srcp = ntohs(tw->tw_sport);
2976
2977 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X"
2978 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2979 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2980 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2981 refcount_read(r: &tw->tw_refcnt), tw);
2982}
2983
2984#define TMPSZ 150
2985
2986static int tcp4_seq_show(struct seq_file *seq, void *v)
2987{
2988 struct tcp_iter_state *st;
2989 struct sock *sk = v;
2990
2991 seq_setwidth(m: seq, TMPSZ - 1);
2992 if (v == SEQ_START_TOKEN) {
2993 seq_puts(m: seq, s: " sl local_address rem_address st tx_queue "
2994 "rx_queue tr tm->when retrnsmt uid timeout "
2995 "inode");
2996 goto out;
2997 }
2998 st = seq->private;
2999
3000 if (sk->sk_state == TCP_TIME_WAIT)
3001 get_timewait4_sock(tw: v, f: seq, i: st->num);
3002 else if (sk->sk_state == TCP_NEW_SYN_RECV)
3003 get_openreq4(req: v, f: seq, i: st->num);
3004 else
3005 get_tcp4_sock(sk: v, f: seq, i: st->num);
3006out:
3007 seq_pad(m: seq, c: '\n');
3008 return 0;
3009}
3010
3011#ifdef CONFIG_BPF_SYSCALL
3012union bpf_tcp_iter_batch_item {
3013 struct sock *sk;
3014 __u64 cookie;
3015};
3016
3017struct bpf_tcp_iter_state {
3018 struct tcp_iter_state state;
3019 unsigned int cur_sk;
3020 unsigned int end_sk;
3021 unsigned int max_sk;
3022 union bpf_tcp_iter_batch_item *batch;
3023};
3024
3025struct bpf_iter__tcp {
3026 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3027 __bpf_md_ptr(struct sock_common *, sk_common);
3028 uid_t uid __aligned(8);
3029};
3030
3031static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3032 struct sock_common *sk_common, uid_t uid)
3033{
3034 struct bpf_iter__tcp ctx;
3035
3036 meta->seq_num--; /* skip SEQ_START_TOKEN */
3037 ctx.meta = meta;
3038 ctx.sk_common = sk_common;
3039 ctx.uid = uid;
3040 return bpf_iter_run_prog(prog, &ctx);
3041}
3042
3043static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3044{
3045 union bpf_tcp_iter_batch_item *item;
3046 unsigned int cur_sk = iter->cur_sk;
3047 __u64 cookie;
3048
3049 /* Remember the cookies of the sockets we haven't seen yet, so we can
3050 * pick up where we left off next time around.
3051 */
3052 while (cur_sk < iter->end_sk) {
3053 item = &iter->batch[cur_sk++];
3054 cookie = sock_gen_cookie(item->sk);
3055 sock_gen_put(item->sk);
3056 item->cookie = cookie;
3057 }
3058}
3059
3060static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3061 unsigned int new_batch_sz, gfp_t flags)
3062{
3063 union bpf_tcp_iter_batch_item *new_batch;
3064
3065 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3066 flags | __GFP_NOWARN);
3067 if (!new_batch)
3068 return -ENOMEM;
3069
3070 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3071 kvfree(iter->batch);
3072 iter->batch = new_batch;
3073 iter->max_sk = new_batch_sz;
3074
3075 return 0;
3076}
3077
3078static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3079 union bpf_tcp_iter_batch_item *cookies,
3080 int n_cookies)
3081{
3082 struct hlist_nulls_node *node;
3083 struct sock *sk;
3084 int i;
3085
3086 for (i = 0; i < n_cookies; i++) {
3087 sk = first_sk;
3088 sk_nulls_for_each_from(sk, node)
3089 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3090 return sk;
3091 }
3092
3093 return NULL;
3094}
3095
3096static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3097{
3098 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3099 struct bpf_tcp_iter_state *iter = seq->private;
3100 struct tcp_iter_state *st = &iter->state;
3101 unsigned int find_cookie = iter->cur_sk;
3102 unsigned int end_cookie = iter->end_sk;
3103 int resume_bucket = st->bucket;
3104 struct sock *sk;
3105
3106 if (end_cookie && find_cookie == end_cookie)
3107 ++st->bucket;
3108
3109 sk = listening_get_first(seq);
3110 iter->cur_sk = 0;
3111 iter->end_sk = 0;
3112
3113 if (sk && st->bucket == resume_bucket && end_cookie) {
3114 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3115 end_cookie - find_cookie);
3116 if (!sk) {
3117 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3118 ++st->bucket;
3119 sk = listening_get_first(seq);
3120 }
3121 }
3122
3123 return sk;
3124}
3125
3126static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3127{
3128 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3129 struct bpf_tcp_iter_state *iter = seq->private;
3130 struct tcp_iter_state *st = &iter->state;
3131 unsigned int find_cookie = iter->cur_sk;
3132 unsigned int end_cookie = iter->end_sk;
3133 int resume_bucket = st->bucket;
3134 struct sock *sk;
3135
3136 if (end_cookie && find_cookie == end_cookie)
3137 ++st->bucket;
3138
3139 sk = established_get_first(seq);
3140 iter->cur_sk = 0;
3141 iter->end_sk = 0;
3142
3143 if (sk && st->bucket == resume_bucket && end_cookie) {
3144 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3145 end_cookie - find_cookie);
3146 if (!sk) {
3147 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3148 ++st->bucket;
3149 sk = established_get_first(seq);
3150 }
3151 }
3152
3153 return sk;
3154}
3155
3156static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3157{
3158 struct bpf_tcp_iter_state *iter = seq->private;
3159 struct tcp_iter_state *st = &iter->state;
3160 struct sock *sk = NULL;
3161
3162 switch (st->state) {
3163 case TCP_SEQ_STATE_LISTENING:
3164 sk = bpf_iter_tcp_resume_listening(seq);
3165 if (sk)
3166 break;
3167 st->bucket = 0;
3168 st->state = TCP_SEQ_STATE_ESTABLISHED;
3169 fallthrough;
3170 case TCP_SEQ_STATE_ESTABLISHED:
3171 sk = bpf_iter_tcp_resume_established(seq);
3172 break;
3173 }
3174
3175 return sk;
3176}
3177
3178static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3179 struct sock **start_sk)
3180{
3181 struct bpf_tcp_iter_state *iter = seq->private;
3182 struct hlist_nulls_node *node;
3183 unsigned int expected = 1;
3184 struct sock *sk;
3185
3186 sock_hold(*start_sk);
3187 iter->batch[iter->end_sk++].sk = *start_sk;
3188
3189 sk = sk_nulls_next(*start_sk);
3190 *start_sk = NULL;
3191 sk_nulls_for_each_from(sk, node) {
3192 if (seq_sk_match(seq, sk)) {
3193 if (iter->end_sk < iter->max_sk) {
3194 sock_hold(sk);
3195 iter->batch[iter->end_sk++].sk = sk;
3196 } else if (!*start_sk) {
3197 /* Remember where we left off. */
3198 *start_sk = sk;
3199 }
3200 expected++;
3201 }
3202 }
3203
3204 return expected;
3205}
3206
3207static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3208 struct sock **start_sk)
3209{
3210 struct bpf_tcp_iter_state *iter = seq->private;
3211 struct hlist_nulls_node *node;
3212 unsigned int expected = 1;
3213 struct sock *sk;
3214
3215 sock_hold(*start_sk);
3216 iter->batch[iter->end_sk++].sk = *start_sk;
3217
3218 sk = sk_nulls_next(*start_sk);
3219 *start_sk = NULL;
3220 sk_nulls_for_each_from(sk, node) {
3221 if (seq_sk_match(seq, sk)) {
3222 if (iter->end_sk < iter->max_sk) {
3223 sock_hold(sk);
3224 iter->batch[iter->end_sk++].sk = sk;
3225 } else if (!*start_sk) {
3226 /* Remember where we left off. */
3227 *start_sk = sk;
3228 }
3229 expected++;
3230 }
3231 }
3232
3233 return expected;
3234}
3235
3236static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3237 struct sock **start_sk)
3238{
3239 struct bpf_tcp_iter_state *iter = seq->private;
3240 struct tcp_iter_state *st = &iter->state;
3241
3242 if (st->state == TCP_SEQ_STATE_LISTENING)
3243 return bpf_iter_tcp_listening_batch(seq, start_sk);
3244 else
3245 return bpf_iter_tcp_established_batch(seq, start_sk);
3246}
3247
3248static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3249{
3250 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3251 struct bpf_tcp_iter_state *iter = seq->private;
3252 struct tcp_iter_state *st = &iter->state;
3253
3254 if (st->state == TCP_SEQ_STATE_LISTENING)
3255 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3256 else
3257 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3258}
3259
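/* Pull the remaining sockets of the current bucket into iter->batch while
 * holding the bucket lock, growing the batch if needed, then drop the lock
 * so that seq_show() may sleep (e.g. in lock_sock()) while walking it.
 */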
3260static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3261{
3262 struct bpf_tcp_iter_state *iter = seq->private;
3263 unsigned int expected;
3264 struct sock *sk;
3265 int err;
3266
3267 sk = bpf_iter_tcp_resume(seq);
3268 if (!sk)
3269 return NULL; /* Done */
3270
3271 expected = bpf_iter_fill_batch(seq, &sk);
3272 if (likely(iter->end_sk == expected))
3273 goto done;
3274
3275 /* Batch size was too small. */
3276 bpf_iter_tcp_unlock_bucket(seq);
3277 bpf_iter_tcp_put_batch(iter);
3278 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3279 GFP_USER);
3280 if (err)
3281 return ERR_PTR(err);
3282
3283 sk = bpf_iter_tcp_resume(seq);
3284 if (!sk)
3285 return NULL; /* Done */
3286
3287 expected = bpf_iter_fill_batch(seq, &sk);
3288 if (likely(iter->end_sk == expected))
3289 goto done;
3290
3291 /* Batch size was still too small. Hold onto the lock while we try
3292 * again with a larger batch to make sure the current bucket's size
3293 * does not change in the meantime.
3294 */
3295 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3296 if (err) {
3297 bpf_iter_tcp_unlock_bucket(seq);
3298 return ERR_PTR(err);
3299 }
3300
3301 expected = bpf_iter_fill_batch(seq, &sk);
3302 WARN_ON_ONCE(iter->end_sk != expected);
3303done:
3304 bpf_iter_tcp_unlock_bucket(seq);
3305 return iter->batch[0].sk;
3306}
3307
3308static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3309{
3310 /* bpf iter does not support lseek, so it always
3311	 * continues from where it was stop()-ped.
3312 */
3313 if (*pos)
3314 return bpf_iter_tcp_batch(seq);
3315
3316 return SEQ_START_TOKEN;
3317}
3318
3319static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3320{
3321 struct bpf_tcp_iter_state *iter = seq->private;
3322 struct tcp_iter_state *st = &iter->state;
3323 struct sock *sk;
3324
3325	/* Whenever seq_next() is called, seq_show() has finished with the
3326	 * sk at iter->cur_sk, so advance to the next sk in
3327	 * the batch.
3328 */
3329 if (iter->cur_sk < iter->end_sk) {
3330 /* Keeping st->num consistent in tcp_iter_state.
3331 * bpf_iter_tcp does not use st->num.
3332 * meta.seq_num is used instead.
3333 */
3334 st->num++;
3335 sock_gen_put(iter->batch[iter->cur_sk++].sk);
3336 }
3337
3338 if (iter->cur_sk < iter->end_sk)
3339 sk = iter->batch[iter->cur_sk].sk;
3340 else
3341 sk = bpf_iter_tcp_batch(seq);
3342
3343 ++*pos;
3344 /* Keeping st->last_pos consistent in tcp_iter_state.
3345	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3346 */
3347 st->last_pos = *pos;
3348 return sk;
3349}
3350
3351static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3352{
3353 struct bpf_iter_meta meta;
3354 struct bpf_prog *prog;
3355 struct sock *sk = v;
3356 uid_t uid;
3357 int ret;
3358
3359 if (v == SEQ_START_TOKEN)
3360 return 0;
3361
3362 if (sk_fullsock(sk))
3363 lock_sock(sk);
3364
3365 if (unlikely(sk_unhashed(sk))) {
3366 ret = SEQ_SKIP;
3367 goto unlock;
3368 }
3369
3370 if (sk->sk_state == TCP_TIME_WAIT) {
3371 uid = 0;
3372 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3373 const struct request_sock *req = v;
3374
3375 uid = from_kuid_munged(seq_user_ns(seq),
3376 sk_uid(req->rsk_listener));
3377 } else {
3378 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3379 }
3380
3381 meta.seq = seq;
3382 prog = bpf_iter_get_info(&meta, false);
3383 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3384
3385unlock:
3386 if (sk_fullsock(sk))
3387 release_sock(sk);
3388 return ret;
3389
3390}
3391
3392static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3393{
3394 struct bpf_tcp_iter_state *iter = seq->private;
3395 struct bpf_iter_meta meta;
3396 struct bpf_prog *prog;
3397
3398 if (!v) {
3399 meta.seq = seq;
3400 prog = bpf_iter_get_info(&meta, true);
3401 if (prog)
3402 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3403 }
3404
3405 if (iter->cur_sk < iter->end_sk)
3406 bpf_iter_tcp_put_batch(iter);
3407}
3408
3409static const struct seq_operations bpf_iter_tcp_seq_ops = {
3410 .show = bpf_iter_tcp_seq_show,
3411 .start = bpf_iter_tcp_seq_start,
3412 .next = bpf_iter_tcp_seq_next,
3413 .stop = bpf_iter_tcp_seq_stop,
3414};
3415#endif
3416static unsigned short seq_file_family(const struct seq_file *seq)
3417{
3418 const struct tcp_seq_afinfo *afinfo;
3419
3420#ifdef CONFIG_BPF_SYSCALL
3421	/* Iterated from bpf_iter. Let the bpf prog filter instead. */
3422 if (seq->op == &bpf_iter_tcp_seq_ops)
3423 return AF_UNSPEC;
3424#endif
3425
3426 /* Iterated from proc fs */
3427 afinfo = pde_data(inode: file_inode(f: seq->file));
3428 return afinfo->family;
3429}
3430
3431static const struct seq_operations tcp4_seq_ops = {
3432 .show = tcp4_seq_show,
3433 .start = tcp_seq_start,
3434 .next = tcp_seq_next,
3435 .stop = tcp_seq_stop,
3436};
3437
3438static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3439 .family = AF_INET,
3440};
3441
3442static int __net_init tcp4_proc_init_net(struct net *net)
3443{
3444 if (!proc_create_net_data(name: "tcp", mode: 0444, parent: net->proc_net, ops: &tcp4_seq_ops,
3445 state_size: sizeof(struct tcp_iter_state), data: &tcp4_seq_afinfo))
3446 return -ENOMEM;
3447 return 0;
3448}
3449
3450static void __net_exit tcp4_proc_exit_net(struct net *net)
3451{
3452 remove_proc_entry("tcp", net->proc_net);
3453}
3454
3455static struct pernet_operations tcp4_net_ops = {
3456 .init = tcp4_proc_init_net,
3457 .exit = tcp4_proc_exit_net,
3458};
3459
3460int __init tcp4_proc_init(void)
3461{
3462 return register_pernet_subsys(&tcp4_net_ops);
3463}
3464
3465void tcp4_proc_exit(void)
3466{
3467 unregister_pernet_subsys(&tcp4_net_ops);
3468}
3469#endif /* CONFIG_PROC_FS */
3470
3471/* @wake is one when sk_stream_write_space() calls us.
3472 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3473 * This mimics the strategy used in sock_def_write_space().
3474 */
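/* With wake == 1 the test below reads (notsent_bytes * 2) < notsent_lowat,
 * i.e. EPOLLOUT is signalled only once the queued-but-unsent data drops
 * below half of tcp_notsent_lowat().
 */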
3475bool tcp_stream_memory_free(const struct sock *sk, int wake)
3476{
3477 const struct tcp_sock *tp = tcp_sk(sk);
3478 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3479 READ_ONCE(tp->snd_nxt);
3480
3481 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3482}
3483EXPORT_SYMBOL(tcp_stream_memory_free);
3484
3485struct proto tcp_prot = {
3486 .name = "TCP",
3487 .owner = THIS_MODULE,
3488 .close = tcp_close,
3489 .pre_connect = tcp_v4_pre_connect,
3490 .connect = tcp_v4_connect,
3491 .disconnect = tcp_disconnect,
3492 .accept = inet_csk_accept,
3493 .ioctl = tcp_ioctl,
3494 .init = tcp_v4_init_sock,
3495 .destroy = tcp_v4_destroy_sock,
3496 .shutdown = tcp_shutdown,
3497 .setsockopt = tcp_setsockopt,
3498 .getsockopt = tcp_getsockopt,
3499 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3500 .keepalive = tcp_set_keepalive,
3501 .recvmsg = tcp_recvmsg,
3502 .sendmsg = tcp_sendmsg,
3503 .splice_eof = tcp_splice_eof,
3504 .backlog_rcv = tcp_v4_do_rcv,
3505 .release_cb = tcp_release_cb,
3506 .hash = inet_hash,
3507 .unhash = inet_unhash,
3508 .get_port = inet_csk_get_port,
3509 .put_port = inet_put_port,
3510#ifdef CONFIG_BPF_SYSCALL
3511 .psock_update_sk_prot = tcp_bpf_update_proto,
3512#endif
3513 .enter_memory_pressure = tcp_enter_memory_pressure,
3514 .leave_memory_pressure = tcp_leave_memory_pressure,
3515 .stream_memory_free = tcp_stream_memory_free,
3516 .sockets_allocated = &tcp_sockets_allocated,
3517
3518 .memory_allocated = &net_aligned_data.tcp_memory_allocated,
3519 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3520
3521 .memory_pressure = &tcp_memory_pressure,
3522 .sysctl_mem = sysctl_tcp_mem,
3523 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3524 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3525 .max_header = MAX_TCP_HEADER,
3526 .obj_size = sizeof(struct tcp_sock),
3527 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3528 .twsk_prot = &tcp_timewait_sock_ops,
3529 .rsk_prot = &tcp_request_sock_ops,
3530 .h.hashinfo = NULL,
3531 .no_autobind = true,
3532 .diag_destroy = tcp_abort,
3533};
3534EXPORT_SYMBOL(tcp_prot);
3535
3536static void __net_exit tcp_sk_exit(struct net *net)
3537{
3538 if (net->ipv4.tcp_congestion_control)
3539 bpf_module_put(data: net->ipv4.tcp_congestion_control,
3540 owner: net->ipv4.tcp_congestion_control->owner);
3541}
3542
3543static void __net_init tcp_set_hashinfo(struct net *net)
3544{
3545 struct inet_hashinfo *hinfo;
3546 unsigned int ehash_entries;
3547 struct net *old_net;
3548
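	/* A child netns may get its own ehash, sized by the creating netns'
	 * tcp_child_ehash_entries sysctl; otherwise it falls back to the
	 * global tcp_hashinfo shared with init_net.
	 */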
3549 if (net_eq(net1: net, net2: &init_net))
3550 goto fallback;
3551
3552 old_net = current->nsproxy->net_ns;
3553 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3554 if (!ehash_entries)
3555 goto fallback;
3556
3557 ehash_entries = roundup_pow_of_two(ehash_entries);
3558 hinfo = inet_pernet_hashinfo_alloc(hashinfo: &tcp_hashinfo, ehash_entries);
3559 if (!hinfo) {
3560 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3561 "for a netns, fallback to the global one\n",
3562 ehash_entries);
3563fallback:
3564 hinfo = &tcp_hashinfo;
3565 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3566 }
3567
3568 net->ipv4.tcp_death_row.hashinfo = hinfo;
3569 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3570 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3571}
3572
3573static int __net_init tcp_sk_init(struct net *net)
3574{
3575 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3576 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3577 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3578 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3579
3580 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3581 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3582 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3583 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3584 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3585
3586 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3587 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3588 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3589
3590 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3591 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3592 net->ipv4.sysctl_tcp_syncookies = 1;
3593 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3594 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3595 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3596 net->ipv4.sysctl_tcp_orphan_retries = 0;
3597 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3598 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3599 net->ipv4.sysctl_tcp_tw_reuse = 2;
3600 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3601 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3602
3603 refcount_set(r: &net->ipv4.tcp_death_row.tw_refcount, n: 1);
3604 tcp_set_hashinfo(net);
3605
3606 net->ipv4.sysctl_tcp_sack = 1;
3607 net->ipv4.sysctl_tcp_window_scaling = 1;
3608 net->ipv4.sysctl_tcp_timestamps = 1;
3609 net->ipv4.sysctl_tcp_early_retrans = 3;
3610 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3611 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3612 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3613 net->ipv4.sysctl_tcp_max_reordering = 300;
3614 net->ipv4.sysctl_tcp_dsack = 1;
3615 net->ipv4.sysctl_tcp_app_win = 31;
3616 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3617 net->ipv4.sysctl_tcp_frto = 2;
3618 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3619 /* This limits the percentage of the congestion window which we
3620 * will allow a single TSO frame to consume. Building TSO frames
3621 * which are too large can cause TCP streams to be bursty.
3622 */
3623 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3624 /* Default TSQ limit of 4 MB */
3625 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3626
3627 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3628 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3629
3630 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3631 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3632 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3633 net->ipv4.sysctl_tcp_autocorking = 1;
3634 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3635 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3636 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3637 if (net != &init_net) {
3638 memcpy(to: net->ipv4.sysctl_tcp_rmem,
3639 from: init_net.ipv4.sysctl_tcp_rmem,
3640 len: sizeof(init_net.ipv4.sysctl_tcp_rmem));
3641 memcpy(to: net->ipv4.sysctl_tcp_wmem,
3642 from: init_net.ipv4.sysctl_tcp_wmem,
3643 len: sizeof(init_net.ipv4.sysctl_tcp_wmem));
3644 }
3645 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3646 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3647 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3648 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3649 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3650 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3651 atomic_set(v: &net->ipv4.tfo_active_disable_times, i: 0);
3652
3653 /* Set default values for PLB */
3654 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3655 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3656 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3657 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3658 /* Default congestion threshold for PLB to mark a round is 50% */
3659 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3660
3661 /* Reno is always built in */
3662 if (!net_eq(net1: net, net2: &init_net) &&
3663 bpf_try_module_get(data: init_net.ipv4.tcp_congestion_control,
3664 owner: init_net.ipv4.tcp_congestion_control->owner))
3665 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3666 else
3667 net->ipv4.tcp_congestion_control = &tcp_reno;
3668
3669 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3670 net->ipv4.sysctl_tcp_shrink_window = 0;
3671
3672 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3673 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3674 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3675
3676 return 0;
3677}
3678
3679static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3680{
3681 struct net *net;
3682
3683 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3684 * and failed setup_net error unwinding path are serialized.
3685 *
3686	 * Since tcp_twsk_purge() handles twsk in any dead netns, not just those in
3687	 * net_exit_list, the thread that dismantles a particular twsk must
3688	 * do so without another thread progressing to refcount_dec_and_test() of
3689	 * tcp_death_row.tw_refcount.
3690 */
3691 mutex_lock(lock: &tcp_exit_batch_mutex);
3692
3693 tcp_twsk_purge(net_exit_list);
3694
3695 list_for_each_entry(net, net_exit_list, exit_list) {
3696 inet_pernet_hashinfo_free(hashinfo: net->ipv4.tcp_death_row.hashinfo);
3697 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3698 tcp_fastopen_ctx_destroy(net);
3699 }
3700
3701 mutex_unlock(lock: &tcp_exit_batch_mutex);
3702}
3703
3704static struct pernet_operations __net_initdata tcp_sk_ops = {
3705 .init = tcp_sk_init,
3706 .exit = tcp_sk_exit,
3707 .exit_batch = tcp_sk_exit_batch,
3708};
3709
3710#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3711DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3712 struct sock_common *sk_common, uid_t uid)
3713
3714#define INIT_BATCH_SZ 16
3715
3716static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3717{
3718 struct bpf_tcp_iter_state *iter = priv_data;
3719 int err;
3720
3721 err = bpf_iter_init_seq_net(priv_data, aux);
3722 if (err)
3723 return err;
3724
3725 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3726 if (err) {
3727 bpf_iter_fini_seq_net(priv_data);
3728 return err;
3729 }
3730
3731 return 0;
3732}
3733
3734static void bpf_iter_fini_tcp(void *priv_data)
3735{
3736 struct bpf_tcp_iter_state *iter = priv_data;
3737
3738 bpf_iter_fini_seq_net(priv_data);
3739 kvfree(iter->batch);
3740}
3741
3742static const struct bpf_iter_seq_info tcp_seq_info = {
3743 .seq_ops = &bpf_iter_tcp_seq_ops,
3744 .init_seq_private = bpf_iter_init_tcp,
3745 .fini_seq_private = bpf_iter_fini_tcp,
3746 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3747};
3748
3749static const struct bpf_func_proto *
3750bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3751 const struct bpf_prog *prog)
3752{
3753 switch (func_id) {
3754 case BPF_FUNC_setsockopt:
3755 return &bpf_sk_setsockopt_proto;
3756 case BPF_FUNC_getsockopt:
3757 return &bpf_sk_getsockopt_proto;
3758 default:
3759 return NULL;
3760 }
3761}
3762
3763static struct bpf_iter_reg tcp_reg_info = {
3764 .target = "tcp",
3765 .ctx_arg_info_size = 1,
3766 .ctx_arg_info = {
3767 { offsetof(struct bpf_iter__tcp, sk_common),
3768 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3769 },
3770 .get_func_proto = bpf_iter_tcp_get_func_proto,
3771 .seq_info = &tcp_seq_info,
3772};
3773
3774static void __init bpf_iter_register(void)
3775{
3776 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3777 if (bpf_iter_reg_target(&tcp_reg_info))
3778 pr_warn("Warning: could not register bpf iterator tcp\n");
3779}
3780
3781#endif
3782
3783void __init tcp_v4_init(void)
3784{
3785 int cpu, res;
3786
3787 for_each_possible_cpu(cpu) {
3788 struct sock *sk;
3789
3790 res = inet_ctl_sock_create(sk: &sk, PF_INET, type: SOCK_RAW,
3791 IPPROTO_TCP, net: &init_net);
3792 if (res)
3793 panic(fmt: "Failed to create the TCP control socket.\n");
3794 sock_set_flag(sk, flag: SOCK_USE_WRITE_QUEUE);
3795
3796 /* Please enforce IP_DF and IPID==0 for RST and
3797 * ACK sent in SYN-RECV and TIME-WAIT state.
3798 */
3799 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3800
3801 sk->sk_clockid = CLOCK_MONOTONIC;
3802
3803 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3804 }
3805 if (register_pernet_subsys(&tcp_sk_ops))
3806 panic(fmt: "Failed to create the TCP control socket.\n");
3807
3808#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3809 bpf_iter_register();
3810#endif
3811}
3812