| 1 | // SPDX-License-Identifier: GPL-2.0-only | 
|---|
| 2 | /* | 
|---|
| 3 | * INET		An implementation of the TCP/IP protocol suite for the LINUX | 
|---|
| 4 | *		operating system.  INET is implemented using the  BSD Socket | 
|---|
| 5 | *		interface as the means of communication with the user level. | 
|---|
| 6 | * | 
|---|
| 7 | *		Generic TIME_WAIT sockets functions | 
|---|
| 8 | * | 
|---|
| 9 | *		From code originally in TCP | 
|---|
| 10 | */ | 
|---|
| 11 |  | 
|---|
| 12 | #include <linux/kernel.h> | 
|---|
| 13 | #include <linux/slab.h> | 
|---|
| 14 | #include <linux/module.h> | 
|---|
| 15 | #include <net/inet_hashtables.h> | 
|---|
| 16 | #include <net/inet_timewait_sock.h> | 
|---|
| 17 | #include <net/ip.h> | 
|---|
| 18 | #include <net/tcp.h> | 
|---|
| 19 | #include <net/psp.h> | 
|---|
| 20 |  | 
|---|
| 21 | /** | 
|---|
| 22 | *	inet_twsk_bind_unhash - unhash a timewait socket from bind hash | 
|---|
| 23 | *	@tw: timewait socket | 
|---|
| 24 | *	@hashinfo: hashinfo pointer | 
|---|
| 25 | * | 
|---|
| 26 | *	Unhash a timewait socket from the bind hash, if hashed. | 
|---|
| 27 | *	The bind hash lock must be held by the caller. | 
|---|
| 28 | *	Does not return a value: the reference on @tw is dropped here with __sock_put(). | 
|---|
| 29 | */ | 
|---|
| 30 | void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, | 
|---|
| 31 | struct inet_hashinfo *hashinfo) | 
|---|
| 32 | { | 
|---|
| 33 | struct inet_bind2_bucket *tb2 = tw->tw_tb2; | 
|---|
| 34 | struct inet_bind_bucket *tb = tw->tw_tb; | 
|---|
| 35 |  | 
|---|
| 36 | if (!tb) /* not in the bind hash: nothing to undo */ | 
|---|
| 37 | return; | 
|---|
| 38 |  | 
|---|
| 39 | __sk_del_bind_node(sk: (struct sock *)tw); | 
|---|
| 40 | tw->tw_tb = NULL; | 
|---|
| 41 | tw->tw_tb2 = NULL; | 
|---|
| 42 | inet_bind2_bucket_destroy(cachep: hashinfo->bind2_bucket_cachep, tb: tb2); /* uses the pointers saved above; tw's own were just cleared */ | 
|---|
| 43 | inet_bind_bucket_destroy(tb); | 
|---|
| 44 |  | 
|---|
| 45 | __sock_put(sk: (struct sock *)tw); /* drop the "bhash chain" reference counted in inet_twsk_hashdance_schedule() */ | 
|---|
| 46 | } | 
|---|
| 47 |  | 
|---|
| 48 | /* Unhash @tw from the ehash chain and the bind hash, then drop one reference. Must be called with locally disabled BHs. */ | 
|---|
| 49 | static void inet_twsk_kill(struct inet_timewait_sock *tw) | 
|---|
| 50 | { | 
|---|
| 51 | struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; | 
|---|
| 52 | spinlock_t *lock = inet_ehash_lockp(hashinfo, hash: tw->tw_hash); | 
|---|
| 53 | struct inet_bind_hashbucket *bhead, *bhead2; | 
|---|
| 54 |  | 
|---|
| 55 | spin_lock(lock); /* ehash bucket lock for tw->tw_hash */ | 
|---|
| 56 | sk_nulls_del_node_init_rcu(sk: (struct sock *)tw); | 
|---|
| 57 | spin_unlock(lock); | 
|---|
| 58 |  | 
|---|
| 59 | /* Disassociate with bind bucket. */ | 
|---|
| 60 | bhead = &hashinfo->bhash[inet_bhashfn(net: twsk_net(twsk: tw), lport: tw->tw_num, | 
|---|
| 61 | bhash_size: hashinfo->bhash_size)]; | 
|---|
| 62 | bhead2 = inet_bhashfn_portaddr(hinfo: hashinfo, sk: (struct sock *)tw, | 
|---|
| 63 | net: twsk_net(twsk: tw), port: tw->tw_num); | 
|---|
| 64 |  | 
|---|
| 65 | spin_lock(lock: &bhead->lock); /* same bhead -> bhead2 order as inet_twsk_hashdance_schedule() */ | 
|---|
| 66 | spin_lock(lock: &bhead2->lock); | 
|---|
| 67 | inet_twsk_bind_unhash(tw, hashinfo); | 
|---|
| 68 | spin_unlock(lock: &bhead2->lock); | 
|---|
| 69 | spin_unlock(lock: &bhead->lock); | 
|---|
| 70 |  | 
|---|
| 71 | refcount_dec(r: &tw->tw_dr->tw_refcount); /* pairs with refcount_inc() in __inet_twsk_schedule() */ | 
|---|
| 72 | inet_twsk_put(tw); /* may free tw if this was the last reference */ | 
|---|
| 73 | } | 
|---|
| 74 |  | 
|---|
| 75 | void inet_twsk_free(struct inet_timewait_sock *tw) /* run the destructor, free tw to its slab, release the module reference */ | 
|---|
| 76 | { | 
|---|
| 77 | struct module *owner = tw->tw_prot->owner; /* read now: tw is freed before module_put() runs */ | 
|---|
| 78 |  | 
|---|
| 79 | tcp_twsk_destructor(sk: (struct sock *)tw); | 
|---|
| 80 | kmem_cache_free(s: tw->tw_prot->twsk_prot->twsk_slab, objp: tw); | 
|---|
| 81 | module_put(module: owner); /* pairs with __module_get() in inet_twsk_alloc() */ | 
|---|
| 82 | } | 
|---|
| 83 |  | 
|---|
| 84 | void inet_twsk_put(struct inet_timewait_sock *tw) /* drop one tw_refcnt reference on @tw */ | 
|---|
| 85 | { | 
|---|
| 86 | if (refcount_dec_and_test(r: &tw->tw_refcnt)) /* count reached zero: free the socket */ | 
|---|
| 87 | inet_twsk_free(tw); | 
|---|
| 88 | } | 
|---|
| 89 | EXPORT_SYMBOL_GPL(inet_twsk_put); | 
|---|
| 90 |  | 
|---|
| 91 | static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, | 
|---|
| 92 | struct hlist_nulls_head *list) | 
|---|
| 93 | { | 
|---|
| 94 | hlist_nulls_add_head_rcu(n: &tw->tw_node, h: list); /* link tw->tw_node at the head of @list */ | 
|---|
| 95 | } | 
|---|
| 96 |  | 
|---|
| 97 | static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) | 
|---|
| 98 | { | 
|---|
| 99 | __inet_twsk_schedule(tw, timeo, rearm: false); /* rearm == false: takes the mod_timer() + tw_refcount increment path */ | 
|---|
| 100 | } | 
|---|
| 101 |  | 
|---|
| 102 | /* | 
|---|
| 103 | * Enter the time wait state: add @tw to the bind hash and the ehash chain, | 
|---|
| 104 | * remove @sk from the ehash chain, set tw_refcnt and arm the timewait timer | 
|---|
| 105 | * with @timeo. | 
|---|
| 106 | * | 
|---|
| 107 | * The caller must not access @tw anymore after this function returns. | 
|---|
| 108 | */ | 
|---|
| 109 | void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, | 
|---|
| 110 | struct sock *sk, | 
|---|
| 111 | struct inet_hashinfo *hashinfo, | 
|---|
| 112 | int timeo) | 
|---|
| 113 | { | 
|---|
| 114 | const struct inet_sock *inet = inet_sk(sk); | 
|---|
| 115 | const struct inet_connection_sock *icsk = inet_csk(sk); | 
|---|
| 116 | struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, hash: sk->sk_hash); | 
|---|
| 117 | spinlock_t *lock = inet_ehash_lockp(hashinfo, hash: sk->sk_hash); | 
|---|
| 118 | struct inet_bind_hashbucket *bhead, *bhead2; | 
|---|
| 119 |  | 
|---|
| 120 | /* Step 1: Put TW into bind hash. Original socket stays there too. | 
|---|
| 121 | Note that any socket with inet->num != 0 MUST be bound in | 
|---|
| 122 | binding cache, even if it is closed. | 
|---|
| 123 | */ | 
|---|
| 124 | bhead = &hashinfo->bhash[inet_bhashfn(net: twsk_net(twsk: tw), lport: inet->inet_num, | 
|---|
| 125 | bhash_size: hashinfo->bhash_size)]; | 
|---|
| 126 | bhead2 = inet_bhashfn_portaddr(hinfo: hashinfo, sk, net: twsk_net(twsk: tw), port: inet->inet_num); | 
|---|
| 127 |  | 
|---|
| 128 | local_bh_disable(); | 
|---|
| 129 | spin_lock(lock: &bhead->lock); | 
|---|
| 130 | spin_lock(lock: &bhead2->lock); | 
|---|
| 131 |  | 
|---|
| 132 | tw->tw_tb = icsk->icsk_bind_hash; /* tw shares the bind buckets of the original socket */ | 
|---|
| 133 | WARN_ON(!icsk->icsk_bind_hash); | 
|---|
| 134 |  | 
|---|
| 135 | tw->tw_tb2 = icsk->icsk_bind2_hash; | 
|---|
| 136 | WARN_ON(!icsk->icsk_bind2_hash); | 
|---|
| 137 | sk_add_bind_node(sk: (struct sock *)tw, list: &tw->tw_tb2->owners); | 
|---|
| 138 |  | 
|---|
| 139 | spin_unlock(lock: &bhead2->lock); | 
|---|
| 140 | spin_unlock(lock: &bhead->lock); | 
|---|
| 141 |  | 
|---|
| 142 | spin_lock(lock); /* ehash bucket lock; held until the timer has been armed below */ | 
|---|
| 143 |  | 
|---|
| 144 | /* Step 2: Hash TW into tcp ehash chain */ | 
|---|
| 145 | inet_twsk_add_node_rcu(tw, list: &ehead->chain); | 
|---|
| 146 |  | 
|---|
| 147 | /* Step 3: Remove SK from hash chain */ | 
|---|
| 148 | if (__sk_nulls_del_node_init_rcu(sk)) | 
|---|
| 149 | sock_prot_inuse_add(net: sock_net(sk), prot: sk->sk_prot, val: -1); | 
|---|
| 150 |  | 
|---|
| 151 |  | 
|---|
| 152 | /* Ensure above writes are committed into memory before updating the | 
|---|
| 153 | * refcount. | 
|---|
| 154 | * Provides ordering vs later refcount_inc(). | 
|---|
| 155 | */ | 
|---|
| 156 | smp_wmb(); | 
|---|
| 157 | /* tw_refcnt is set to 3 because we have : | 
|---|
| 158 | * - one reference for bhash chain. | 
|---|
| 159 | * - one reference for ehash chain. | 
|---|
| 160 | * - one reference for timer. | 
|---|
| 161 | * Also note that after this point, we lost our implicit reference | 
|---|
| 162 | * so we are not allowed to use tw anymore. | 
|---|
| 163 | */ | 
|---|
| 164 | refcount_set(r: &tw->tw_refcnt, n: 3); | 
|---|
| 165 |  | 
|---|
| 166 | inet_twsk_schedule(tw, timeo); /* timer is armed while the ehash lock is still held; see inet_twsk_deschedule_put() */ | 
|---|
| 167 |  | 
|---|
| 168 | spin_unlock(lock); | 
|---|
| 169 | local_bh_enable(); | 
|---|
| 170 | } | 
|---|
| 171 |  | 
|---|
| 172 | static void tw_timer_handler(struct timer_list *t) /* tw_timer callback, registered by timer_setup() in inet_twsk_alloc() */ | 
|---|
| 173 | { | 
|---|
| 174 | struct inet_timewait_sock *tw = timer_container_of(tw, t, tw_timer); /* the timewait socket that embeds @t as tw_timer */ | 
|---|
| 175 |  | 
|---|
| 176 | inet_twsk_kill(tw); | 
|---|
| 177 | } | 
|---|
| 178 |  | 
|---|
| 179 | struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, | 
|---|
| 180 | struct inet_timewait_death_row *dr, | 
|---|
| 181 | const int state) /* returns the new timewait socket, or NULL when over the bucket limit or out of memory */ | 
|---|
| 182 | { | 
|---|
| 183 | struct inet_timewait_sock *tw; | 
|---|
| 184 |  | 
|---|
| 185 | if (refcount_read(r: &dr->tw_refcount) - 1 >= | 
|---|
| 186 | READ_ONCE(dr->sysctl_max_tw_buckets)) /* limit reached: refuse to allocate another timewait socket */ | 
|---|
| 187 | return NULL; | 
|---|
| 188 |  | 
|---|
| 189 | tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, | 
|---|
| 190 | GFP_ATOMIC); | 
|---|
| 191 | if (tw) { /* on allocation failure tw stays NULL and is returned as such */ | 
|---|
| 192 | const struct inet_sock *inet = inet_sk(sk); | 
|---|
| 193 |  | 
|---|
| 194 | tw->tw_dr	    = dr; | 
|---|
| 195 | /* Give us an identity. */ | 
|---|
| 196 | tw->tw_daddr	    = inet->inet_daddr; | 
|---|
| 197 | tw->tw_rcv_saddr    = inet->inet_rcv_saddr; | 
|---|
| 198 | tw->tw_bound_dev_if = sk->sk_bound_dev_if; | 
|---|
| 199 | tw->tw_tos	    = inet->tos; | 
|---|
| 200 | tw->tw_num	    = inet->inet_num; | 
|---|
| 201 | tw->tw_state	    = TCP_TIME_WAIT; | 
|---|
| 202 | tw->tw_substate	    = state; | 
|---|
| 203 | tw->tw_sport	    = inet->inet_sport; | 
|---|
| 204 | tw->tw_dport	    = inet->inet_dport; | 
|---|
| 205 | tw->tw_family	    = sk->sk_family; | 
|---|
| 206 | tw->tw_reuse	    = sk->sk_reuse; | 
|---|
| 207 | tw->tw_reuseport    = sk->sk_reuseport; | 
|---|
| 208 | tw->tw_hash	    = sk->sk_hash; | 
|---|
| 209 | tw->tw_ipv6only	    = 0; | 
|---|
| 210 | tw->tw_transparent  = inet_test_bit(TRANSPARENT, sk); | 
|---|
| 211 | tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND); | 
|---|
| 212 | tw->tw_prot	    = sk->sk_prot_creator; | 
|---|
| 213 | atomic64_set(v: &tw->tw_cookie, i: atomic64_read(v: &sk->sk_cookie)); | 
|---|
| 214 | twsk_net_set(twsk: tw, net: sock_net(sk)); | 
|---|
| 215 | timer_setup(&tw->tw_timer, tw_timer_handler, 0); /* only initialised here; armed later by __inet_twsk_schedule() */ | 
|---|
| 216 | #ifdef CONFIG_SOCK_VALIDATE_XMIT | 
|---|
| 217 | tw->tw_validate_xmit_skb = NULL; | 
|---|
| 218 | #endif | 
|---|
| 219 | /* | 
|---|
| 220 | * Because we use RCU lookups, we should not set tw_refcnt | 
|---|
| 221 | * to a non-zero value before everything is set up for this | 
|---|
| 222 | * timewait socket. | 
|---|
| 223 | */ | 
|---|
| 224 | refcount_set(r: &tw->tw_refcnt, n: 0); | 
|---|
| 225 |  | 
|---|
| 226 | __module_get(module: tw->tw_prot->owner); /* released by module_put() in inet_twsk_free() */ | 
|---|
| 227 | psp_twsk_init(tw, sk); | 
|---|
| 228 | } | 
|---|
| 229 |  | 
|---|
| 230 | return tw; | 
|---|
| 231 | } | 
|---|
| 232 |  | 
|---|
| 233 | /* These are always called from BH context.  See callers in | 
|---|
| 234 | * tcp_input.c to verify this. | 
|---|
| 235 | */ | 
|---|
| 236 |  | 
|---|
| 237 | /* This is for handling early-kills of TIME_WAIT sockets. | 
|---|
| 238 | * Warning: consumes the caller's reference. | 
|---|
| 239 | * Caller should not access tw anymore. | 
|---|
| 240 | */ | 
|---|
| 241 | void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) | 
|---|
| 242 | { | 
|---|
| 243 | struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; | 
|---|
| 244 | spinlock_t *lock = inet_ehash_lockp(hashinfo, hash: tw->tw_hash); | 
|---|
| 245 |  | 
|---|
| 246 | /* inet_twsk_purge() walks over all sockets, including tw ones, | 
|---|
| 247 | * and removes them via inet_twsk_deschedule_put() after a | 
|---|
| 248 | * refcount_inc_not_zero(). | 
|---|
| 249 | * | 
|---|
| 250 | * inet_twsk_hashdance_schedule() must (re)init the refcount before | 
|---|
| 251 | * arming the timer, i.e. inet_twsk_purge can obtain a reference to | 
|---|
| 252 | * a twsk that did not yet schedule the timer. | 
|---|
| 253 | * | 
|---|
| 254 | * The ehash lock synchronizes these two: | 
|---|
| 255 | * After acquiring the lock, the timer is always scheduled (else | 
|---|
| 256 | * timer_shutdown_sync() returns false), because hashdance_schedule releases | 
|---|
| 257 | * the ehash lock only after completing the timer initialization. | 
|---|
| 258 | * | 
|---|
| 259 | * Without grabbing the ehash lock, we get: | 
|---|
| 260 | * 1) cpu x sets twsk refcount to 3 | 
|---|
| 261 | * 2) cpu y bumps refcount to 4 | 
|---|
| 262 | * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down | 
|---|
| 263 | * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown | 
|---|
| 264 | * -> timer refcount is never decremented. | 
|---|
| 265 | */ | 
|---|
| 266 | spin_lock(lock); | 
|---|
| 267 | /*  Makes sure hashdance_schedule() has completed */ | 
|---|
| 268 | spin_unlock(lock); | 
|---|
| 269 |  | 
|---|
| 270 | if (timer_shutdown_sync(timer: &tw->tw_timer)) /* timer was still pending and will no longer fire: do the handler's work here */ | 
|---|
| 271 | inet_twsk_kill(tw); | 
|---|
| 272 | inet_twsk_put(tw); /* drop the reference passed in by the caller */ | 
|---|
| 273 | } | 
|---|
| 274 | EXPORT_SYMBOL(inet_twsk_deschedule_put); | 
|---|
| 275 |  | 
|---|
| 276 | void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) | 
|---|
| 277 | { | 
|---|
| 278 | /* timeout := RTO * 3.5 | 
|---|
| 279 | * | 
|---|
| 280 | * 3.5 = 1+2+0.5 to wait for two retransmits. | 
|---|
| 281 | * | 
|---|
| 282 | * RATIONALE: if FIN arrived and we entered TIME-WAIT state, | 
|---|
| 283 | * our ACK acking that FIN can be lost. If N subsequent retransmitted | 
|---|
| 284 | * FINs (or previous segments) are lost (probability of such event | 
|---|
| 285 | * is p^(N+1), where p is probability to lose single packet and | 
|---|
| 286 | * time to detect the loss is about RTO*(2^N - 1) with exponential | 
|---|
| 287 | * backoff). Normal timewait length is calculated so, that we | 
|---|
| 288 | * waited at least for one retransmitted FIN (maximal RTO is 120sec). | 
|---|
| 289 | * [ BTW Linux, following BSD, violates this requirement waiting | 
|---|
| 290 | *   only for 60sec, we should wait at least for 240 secs. | 
|---|
| 291 | *   Well, 240 consumes too much of resources 8) | 
|---|
| 292 | * ] | 
|---|
| 293 | * This interval is not reduced to catch old duplicate and | 
|---|
| 294 | * responses to our wandering segments living for two MSLs. | 
|---|
| 295 | * However, if we use PAWS to detect | 
|---|
| 296 | * old duplicates, we can reduce the interval to bounds required | 
|---|
| 297 | * by RTO, rather than MSL. So, if peer understands PAWS, we | 
|---|
| 298 | * kill tw bucket after 3.5*RTO (it is important that this number | 
|---|
| 299 | * is greater than TS tick!) and detect old duplicates with help | 
|---|
| 300 | * of PAWS. | 
|---|
| 301 | */ | 
|---|
| 302 |  | 
|---|
| 303 | if (!rearm) { | 
|---|
| 304 | bool kill = timeo <= 4*HZ; /* timeouts of at most 4 seconds are counted as TIMEWAITKILLED */ | 
|---|
| 305 |  | 
|---|
| 306 | __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED : | 
|---|
| 307 | LINUX_MIB_TIMEWAITED); | 
|---|
| 308 | BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); /* a non-zero return means the timer was already pending */ | 
|---|
| 309 | refcount_inc(r: &tw->tw_dr->tw_refcount); /* pairs with refcount_dec() in inet_twsk_kill() */ | 
|---|
| 310 | } else { | 
|---|
| 311 | mod_timer_pending(timer: &tw->tw_timer, expires: jiffies + timeo); /* only moves the expiry of a timer that is still pending */ | 
|---|
| 312 | } | 
|---|
| 313 | } | 
|---|
| 314 |  | 
|---|
| 315 | /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns by walking every ehash slot of @hashinfo */ | 
|---|
| 316 | void inet_twsk_purge(struct inet_hashinfo *hashinfo) | 
|---|
| 317 | { | 
|---|
| 318 | struct inet_ehash_bucket *head = &hashinfo->ehash[0]; | 
|---|
| 319 | unsigned int ehash_mask = hashinfo->ehash_mask; | 
|---|
| 320 | struct hlist_nulls_node *node; | 
|---|
| 321 | unsigned int slot; | 
|---|
| 322 | struct sock *sk; | 
|---|
| 323 |  | 
|---|
| 324 | for (slot = 0; slot <= ehash_mask; slot++, head++) { | 
|---|
| 325 | if (hlist_nulls_empty(h: &head->chain)) /* nothing hashed in this slot */ | 
|---|
| 326 | continue; | 
|---|
| 327 |  | 
|---|
| 328 | restart_rcu: | 
|---|
| 329 | cond_resched(); | 
|---|
| 330 | rcu_read_lock(); | 
|---|
| 331 | restart: | 
|---|
| 332 | sk_nulls_for_each_rcu(sk, node, &head->chain) { | 
|---|
| 333 | int state = inet_sk_state_load(sk); | 
|---|
| 334 |  | 
|---|
| 335 | if ((1 << state) & ~(TCPF_TIME_WAIT | | 
|---|
| 336 | TCPF_NEW_SYN_RECV)) /* neither TIME_WAIT nor NEW_SYN_RECV: not ours to purge */ | 
|---|
| 337 | continue; | 
|---|
| 338 |  | 
|---|
| 339 | if (check_net(net: sock_net(sk))) /* NOTE(review): presumably true while the netns is still alive - confirm */ | 
|---|
| 340 | continue; | 
|---|
| 341 |  | 
|---|
| 342 | if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) /* refcount already zero: no reference can be taken */ | 
|---|
| 343 | continue; | 
|---|
| 344 |  | 
|---|
| 345 | if (check_net(net: sock_net(sk))) { /* same test again, now that a reference is held */ | 
|---|
| 346 | sock_gen_put(sk); | 
|---|
| 347 | goto restart; | 
|---|
| 348 | } | 
|---|
| 349 |  | 
|---|
| 350 | rcu_read_unlock(); /* RCU section is left here, so the chain walk starts over via restart_rcu */ | 
|---|
| 351 | local_bh_disable(); | 
|---|
| 352 | if (state == TCP_TIME_WAIT) { | 
|---|
| 353 | inet_twsk_deschedule_put(inet_twsk(sk)); /* consumes the reference taken above */ | 
|---|
| 354 | } else { | 
|---|
| 355 | struct request_sock *req = inet_reqsk(sk); | 
|---|
| 356 |  | 
|---|
| 357 | inet_csk_reqsk_queue_drop_and_put(sk: req->rsk_listener, | 
|---|
| 358 | req); | 
|---|
| 359 | } | 
|---|
| 360 | local_bh_enable(); | 
|---|
| 361 | goto restart_rcu; | 
|---|
| 362 | } | 
|---|
| 363 | /* If the nulls value we got at the end of this lookup is | 
|---|
| 364 | * not the expected one, we must restart lookup. | 
|---|
| 365 | * We probably met an item that was moved to another chain. | 
|---|
| 366 | */ | 
|---|
| 367 | if (get_nulls_value(ptr: node) != slot) | 
|---|
| 368 | goto restart; | 
|---|
| 369 | rcu_read_unlock(); | 
|---|
| 370 | } | 
|---|
| 371 | } | 
|---|
| 372 |  | 
|---|