| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | 
|---|
| 2 | /* | 
|---|
| 3 | * INET		An implementation of the TCP/IP protocol suite for the LINUX | 
|---|
| 4 | *		operating system.  INET is implemented using the BSD Socket | 
|---|
| 5 | *		interface as the means of communication with the user level. | 
|---|
| 6 | * | 
|---|
| 7 | *		Generic INET transport hashtables | 
|---|
| 8 | * | 
|---|
| 9 | * Authors:	Lotsa people, from code originally in tcp | 
|---|
| 10 | */ | 
|---|
| 11 |  | 
|---|
| 12 | #include <linux/module.h> | 
|---|
| 13 | #include <linux/random.h> | 
|---|
| 14 | #include <linux/sched.h> | 
|---|
| 15 | #include <linux/slab.h> | 
|---|
| 16 | #include <linux/wait.h> | 
|---|
| 17 | #include <linux/vmalloc.h> | 
|---|
| 18 | #include <linux/memblock.h> | 
|---|
| 19 |  | 
|---|
| 20 | #include <net/addrconf.h> | 
|---|
| 21 | #include <net/inet_connection_sock.h> | 
|---|
| 22 | #include <net/inet_hashtables.h> | 
|---|
| 23 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 24 | #include <net/inet6_hashtables.h> | 
|---|
| 25 | #endif | 
|---|
| 26 | #include <net/hotdata.h> | 
|---|
| 27 | #include <net/ip.h> | 
|---|
| 28 | #include <net/rps.h> | 
|---|
| 29 | #include <net/secure_seq.h> | 
|---|
| 30 | #include <net/sock_reuseport.h> | 
|---|
| 31 | #include <net/tcp.h> | 
|---|
| 32 |  | 
|---|
| 33 | u32 inet_ehashfn(const struct net *net, const __be32 laddr, | 
|---|
| 34 | const __u16 lport, const __be32 faddr, | 
|---|
| 35 | const __be16 fport) | 
|---|
| 36 | { | 
|---|
| 37 | net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); | 
|---|
| 38 |  | 
|---|
| 39 | return lport + __inet_ehashfn(laddr, lport: 0, faddr, fport, | 
|---|
| 40 | inet_ehash_secret + net_hash_mix(net)); | 
|---|
| 41 | } | 
|---|
| 42 | EXPORT_SYMBOL_GPL(inet_ehashfn); | 
|---|
| 43 |  | 
|---|
| 44 | /* This function handles inet_sock, but also timewait and request sockets | 
|---|
| 45 | * for IPv4/IPv6. | 
|---|
| 46 | */ | 
|---|
| 47 | static u32 sk_ehashfn(const struct sock *sk) | 
|---|
| 48 | { | 
|---|
| 49 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 50 | if (sk->sk_family == AF_INET6 && | 
|---|
| 51 | !ipv6_addr_v4mapped(a: &sk->sk_v6_daddr)) | 
|---|
| 52 | return inet6_ehashfn(sock_net(sk), | 
|---|
| 53 | &sk->sk_v6_rcv_saddr, sk->sk_num, | 
|---|
| 54 | &sk->sk_v6_daddr, sk->sk_dport); | 
|---|
| 55 | #endif | 
|---|
| 56 | return inet_ehashfn(sock_net(sk), | 
|---|
| 57 | sk->sk_rcv_saddr, sk->sk_num, | 
|---|
| 58 | sk->sk_daddr, sk->sk_dport); | 
|---|
| 59 | } | 
|---|
| 60 |  | 
|---|
| 61 | static bool sk_is_connect_bind(const struct sock *sk) | 
|---|
| 62 | { | 
|---|
| 63 | if (sk->sk_state == TCP_TIME_WAIT) | 
|---|
| 64 | return inet_twsk(sk)->tw_connect_bind; | 
|---|
| 65 | else | 
|---|
| 66 | return sk->sk_userlocks & SOCK_CONNECT_BIND; | 
|---|
| 67 | } | 
|---|
| 68 |  | 
|---|
| 69 | /* | 
|---|
| 70 | * Allocate and initialize a new local port bind bucket. | 
|---|
| 71 | * The bindhash mutex for snum's hash chain must be held here. | 
|---|
| 72 | */ | 
|---|
| 73 | struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, | 
|---|
| 74 | struct net *net, | 
|---|
| 75 | struct inet_bind_hashbucket *head, | 
|---|
| 76 | const unsigned short snum, | 
|---|
| 77 | int l3mdev) | 
|---|
| 78 | { | 
|---|
| 79 | struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); | 
|---|
| 80 |  | 
|---|
| 81 | if (tb) { | 
|---|
| 82 | write_pnet(pnet: &tb->ib_net, net); | 
|---|
| 83 | tb->l3mdev    = l3mdev; | 
|---|
| 84 | tb->port      = snum; | 
|---|
| 85 | tb->fastreuse = 0; | 
|---|
| 86 | tb->fastreuseport = 0; | 
|---|
| 87 | INIT_HLIST_HEAD(&tb->bhash2); | 
|---|
| 88 | hlist_add_head_rcu(n: &tb->node, h: &head->chain); | 
|---|
| 89 | } | 
|---|
| 90 | return tb; | 
|---|
| 91 | } | 
|---|
| 92 |  | 
|---|
| 93 | /* | 
|---|
| 94 | * Caller must hold hashbucket lock for this tb with local BH disabled | 
|---|
| 95 | */ | 
|---|
| 96 | void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) | 
|---|
| 97 | { | 
|---|
| 98 | const struct inet_bind2_bucket *tb2; | 
|---|
| 99 |  | 
|---|
| 100 | if (hlist_empty(h: &tb->bhash2)) { | 
|---|
| 101 | hlist_del_rcu(n: &tb->node); | 
|---|
| 102 | kfree_rcu(tb, rcu); | 
|---|
| 103 | return; | 
|---|
| 104 | } | 
|---|
| 105 |  | 
|---|
| 106 | if (tb->fastreuse == -1 && tb->fastreuseport == -1) | 
|---|
| 107 | return; | 
|---|
| 108 | hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { | 
|---|
| 109 | if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) | 
|---|
| 110 | return; | 
|---|
| 111 | } | 
|---|
| 112 | tb->fastreuse = -1; | 
|---|
| 113 | tb->fastreuseport = -1; | 
|---|
| 114 | } | 
|---|
| 115 |  | 
|---|
| 116 | bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, | 
|---|
| 117 | unsigned short port, int l3mdev) | 
|---|
| 118 | { | 
|---|
| 119 | return net_eq(net1: ib_net(ib: tb), net2: net) && tb->port == port && | 
|---|
| 120 | tb->l3mdev == l3mdev; | 
|---|
| 121 | } | 
|---|
| 122 |  | 
|---|
| 123 | static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, | 
|---|
| 124 | struct net *net, | 
|---|
| 125 | struct inet_bind_hashbucket *head, | 
|---|
| 126 | struct inet_bind_bucket *tb, | 
|---|
| 127 | const struct sock *sk) | 
|---|
| 128 | { | 
|---|
| 129 | write_pnet(pnet: &tb2->ib_net, net); | 
|---|
| 130 | tb2->l3mdev = tb->l3mdev; | 
|---|
| 131 | tb2->port = tb->port; | 
|---|
| 132 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 133 | BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); | 
|---|
| 134 | if (sk->sk_family == AF_INET6) { | 
|---|
| 135 | tb2->addr_type = ipv6_addr_type(addr: &sk->sk_v6_rcv_saddr); | 
|---|
| 136 | tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; | 
|---|
| 137 | } else { | 
|---|
| 138 | tb2->addr_type = IPV6_ADDR_MAPPED; | 
|---|
| 139 | ipv6_addr_set_v4mapped(addr: sk->sk_rcv_saddr, v4mapped: &tb2->v6_rcv_saddr); | 
|---|
| 140 | } | 
|---|
| 141 | #else | 
|---|
| 142 | tb2->rcv_saddr = sk->sk_rcv_saddr; | 
|---|
| 143 | #endif | 
|---|
| 144 | tb2->fastreuse = 0; | 
|---|
| 145 | tb2->fastreuseport = 0; | 
|---|
| 146 | INIT_HLIST_HEAD(&tb2->owners); | 
|---|
| 147 | hlist_add_head(n: &tb2->node, h: &head->chain); | 
|---|
| 148 | hlist_add_head(n: &tb2->bhash_node, h: &tb->bhash2); | 
|---|
| 149 | } | 
|---|
| 150 |  | 
|---|
| 151 | struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, | 
|---|
| 152 | struct net *net, | 
|---|
| 153 | struct inet_bind_hashbucket *head, | 
|---|
| 154 | struct inet_bind_bucket *tb, | 
|---|
| 155 | const struct sock *sk) | 
|---|
| 156 | { | 
|---|
| 157 | struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); | 
|---|
| 158 |  | 
|---|
| 159 | if (tb2) | 
|---|
| 160 | inet_bind2_bucket_init(tb2, net, head, tb, sk); | 
|---|
| 161 |  | 
|---|
| 162 | return tb2; | 
|---|
| 163 | } | 
|---|
| 164 |  | 
|---|
| 165 | /* Caller must hold hashbucket lock for this tb with local BH disabled */ | 
|---|
| 166 | void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) | 
|---|
| 167 | { | 
|---|
| 168 | const struct sock *sk; | 
|---|
| 169 |  | 
|---|
| 170 | if (hlist_empty(h: &tb->owners)) { | 
|---|
| 171 | __hlist_del(n: &tb->node); | 
|---|
| 172 | __hlist_del(n: &tb->bhash_node); | 
|---|
| 173 | kmem_cache_free(s: cachep, objp: tb); | 
|---|
| 174 | return; | 
|---|
| 175 | } | 
|---|
| 176 |  | 
|---|
| 177 | if (tb->fastreuse == -1 && tb->fastreuseport == -1) | 
|---|
| 178 | return; | 
|---|
| 179 | sk_for_each_bound(sk, &tb->owners) { | 
|---|
| 180 | if (!sk_is_connect_bind(sk)) | 
|---|
| 181 | return; | 
|---|
| 182 | } | 
|---|
| 183 | tb->fastreuse = -1; | 
|---|
| 184 | tb->fastreuseport = -1; | 
|---|
| 185 | } | 
|---|
| 186 |  | 
|---|
| 187 | static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, | 
|---|
| 188 | const struct sock *sk) | 
|---|
| 189 | { | 
|---|
| 190 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 191 | if (sk->sk_family == AF_INET6) | 
|---|
| 192 | return ipv6_addr_equal(a1: &tb2->v6_rcv_saddr, a2: &sk->sk_v6_rcv_saddr); | 
|---|
| 193 |  | 
|---|
| 194 | if (tb2->addr_type != IPV6_ADDR_MAPPED) | 
|---|
| 195 | return false; | 
|---|
| 196 | #endif | 
|---|
| 197 | return tb2->rcv_saddr == sk->sk_rcv_saddr; | 
|---|
| 198 | } | 
|---|
| 199 |  | 
|---|
| 200 | void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, | 
|---|
| 201 | struct inet_bind2_bucket *tb2, unsigned short port) | 
|---|
| 202 | { | 
|---|
| 203 | inet_sk(sk)->inet_num = port; | 
|---|
| 204 | inet_csk(sk)->icsk_bind_hash = tb; | 
|---|
| 205 | inet_csk(sk)->icsk_bind2_hash = tb2; | 
|---|
| 206 | sk_add_bind_node(sk, list: &tb2->owners); | 
|---|
| 207 | } | 
|---|
| 208 |  | 
|---|
| 209 | /* | 
|---|
| 210 | * Get rid of any references to a local port held by the given sock. | 
|---|
| 211 | */ | 
|---|
| 212 | static void __inet_put_port(struct sock *sk) | 
|---|
| 213 | { | 
|---|
| 214 | struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); | 
|---|
| 215 | struct inet_bind_hashbucket *head, *head2; | 
|---|
| 216 | struct net *net = sock_net(sk); | 
|---|
| 217 | struct inet_bind_bucket *tb; | 
|---|
| 218 | int bhash; | 
|---|
| 219 |  | 
|---|
| 220 | bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, bhash_size: hashinfo->bhash_size); | 
|---|
| 221 | head = &hashinfo->bhash[bhash]; | 
|---|
| 222 | head2 = inet_bhashfn_portaddr(hinfo: hashinfo, sk, net, inet_sk(sk)->inet_num); | 
|---|
| 223 |  | 
|---|
| 224 | spin_lock(lock: &head->lock); | 
|---|
| 225 | tb = inet_csk(sk)->icsk_bind_hash; | 
|---|
| 226 | inet_csk(sk)->icsk_bind_hash = NULL; | 
|---|
| 227 | inet_sk(sk)->inet_num = 0; | 
|---|
| 228 | sk->sk_userlocks &= ~SOCK_CONNECT_BIND; | 
|---|
| 229 |  | 
|---|
| 230 | spin_lock(lock: &head2->lock); | 
|---|
| 231 | if (inet_csk(sk)->icsk_bind2_hash) { | 
|---|
| 232 | struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; | 
|---|
| 233 |  | 
|---|
| 234 | __sk_del_bind_node(sk); | 
|---|
| 235 | inet_csk(sk)->icsk_bind2_hash = NULL; | 
|---|
| 236 | inet_bind2_bucket_destroy(cachep: hashinfo->bind2_bucket_cachep, tb: tb2); | 
|---|
| 237 | } | 
|---|
| 238 | spin_unlock(lock: &head2->lock); | 
|---|
| 239 |  | 
|---|
| 240 | inet_bind_bucket_destroy(tb); | 
|---|
| 241 | spin_unlock(lock: &head->lock); | 
|---|
| 242 | } | 
|---|
| 243 |  | 
|---|
/* Release the local port held by @sk.  Wraps __inet_put_port() in a
 * BH-disabled section.
 */
void inet_put_port(struct sock *sk)
{
	local_bh_disable();

	__inet_put_port(sk);

	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);
|---|
| 251 |  | 
|---|
| 252 | int __inet_inherit_port(const struct sock *sk, struct sock *child) | 
|---|
| 253 | { | 
|---|
| 254 | struct inet_hashinfo *table = tcp_get_hashinfo(sk); | 
|---|
| 255 | unsigned short port = inet_sk(child)->inet_num; | 
|---|
| 256 | struct inet_bind_hashbucket *head, *head2; | 
|---|
| 257 | bool created_inet_bind_bucket = false; | 
|---|
| 258 | struct net *net = sock_net(sk); | 
|---|
| 259 | bool update_fastreuse = false; | 
|---|
| 260 | struct inet_bind2_bucket *tb2; | 
|---|
| 261 | struct inet_bind_bucket *tb; | 
|---|
| 262 | int bhash, l3mdev; | 
|---|
| 263 |  | 
|---|
| 264 | bhash = inet_bhashfn(net, lport: port, bhash_size: table->bhash_size); | 
|---|
| 265 | head = &table->bhash[bhash]; | 
|---|
| 266 | head2 = inet_bhashfn_portaddr(hinfo: table, sk: child, net, port); | 
|---|
| 267 |  | 
|---|
| 268 | spin_lock(lock: &head->lock); | 
|---|
| 269 | spin_lock(lock: &head2->lock); | 
|---|
| 270 | tb = inet_csk(sk)->icsk_bind_hash; | 
|---|
| 271 | tb2 = inet_csk(sk)->icsk_bind2_hash; | 
|---|
| 272 | if (unlikely(!tb || !tb2)) { | 
|---|
| 273 | spin_unlock(lock: &head2->lock); | 
|---|
| 274 | spin_unlock(lock: &head->lock); | 
|---|
| 275 | return -ENOENT; | 
|---|
| 276 | } | 
|---|
| 277 | if (tb->port != port) { | 
|---|
| 278 | l3mdev = inet_sk_bound_l3mdev(sk); | 
|---|
| 279 |  | 
|---|
| 280 | /* NOTE: using tproxy and redirecting skbs to a proxy | 
|---|
| 281 | * on a different listener port breaks the assumption | 
|---|
| 282 | * that the listener socket's icsk_bind_hash is the same | 
|---|
| 283 | * as that of the child socket. We have to look up or | 
|---|
| 284 | * create a new bind bucket for the child here. */ | 
|---|
| 285 | inet_bind_bucket_for_each(tb, &head->chain) { | 
|---|
| 286 | if (inet_bind_bucket_match(tb, net, port, l3mdev)) | 
|---|
| 287 | break; | 
|---|
| 288 | } | 
|---|
| 289 | if (!tb) { | 
|---|
| 290 | tb = inet_bind_bucket_create(cachep: table->bind_bucket_cachep, | 
|---|
| 291 | net, head, snum: port, l3mdev); | 
|---|
| 292 | if (!tb) { | 
|---|
| 293 | spin_unlock(lock: &head2->lock); | 
|---|
| 294 | spin_unlock(lock: &head->lock); | 
|---|
| 295 | return -ENOMEM; | 
|---|
| 296 | } | 
|---|
| 297 | created_inet_bind_bucket = true; | 
|---|
| 298 | } | 
|---|
| 299 | update_fastreuse = true; | 
|---|
| 300 |  | 
|---|
| 301 | goto bhash2_find; | 
|---|
| 302 | } else if (!inet_bind2_bucket_addr_match(tb2, sk: child)) { | 
|---|
| 303 | l3mdev = inet_sk_bound_l3mdev(sk); | 
|---|
| 304 |  | 
|---|
| 305 | bhash2_find: | 
|---|
| 306 | tb2 = inet_bind2_bucket_find(head: head2, net, port, l3mdev, sk: child); | 
|---|
| 307 | if (!tb2) { | 
|---|
| 308 | tb2 = inet_bind2_bucket_create(cachep: table->bind2_bucket_cachep, | 
|---|
| 309 | net, head: head2, tb, sk: child); | 
|---|
| 310 | if (!tb2) | 
|---|
| 311 | goto error; | 
|---|
| 312 | } | 
|---|
| 313 | } | 
|---|
| 314 | if (update_fastreuse) | 
|---|
| 315 | inet_csk_update_fastreuse(sk: child, tb, tb2); | 
|---|
| 316 | inet_bind_hash(sk: child, tb, tb2, port); | 
|---|
| 317 | spin_unlock(lock: &head2->lock); | 
|---|
| 318 | spin_unlock(lock: &head->lock); | 
|---|
| 319 |  | 
|---|
| 320 | return 0; | 
|---|
| 321 |  | 
|---|
| 322 | error: | 
|---|
| 323 | if (created_inet_bind_bucket) | 
|---|
| 324 | inet_bind_bucket_destroy(tb); | 
|---|
| 325 | spin_unlock(lock: &head2->lock); | 
|---|
| 326 | spin_unlock(lock: &head->lock); | 
|---|
| 327 | return -ENOMEM; | 
|---|
| 328 | } | 
|---|
| 329 | EXPORT_SYMBOL_GPL(__inet_inherit_port); | 
|---|
| 330 |  | 
|---|
| 331 | static struct inet_listen_hashbucket * | 
|---|
| 332 | inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) | 
|---|
| 333 | { | 
|---|
| 334 | u32 hash; | 
|---|
| 335 |  | 
|---|
| 336 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 337 | if (sk->sk_family == AF_INET6) | 
|---|
| 338 | hash = ipv6_portaddr_hash(net: sock_net(sk), | 
|---|
| 339 | addr6: &sk->sk_v6_rcv_saddr, | 
|---|
| 340 | inet_sk(sk)->inet_num); | 
|---|
| 341 | else | 
|---|
| 342 | #endif | 
|---|
| 343 | hash = ipv4_portaddr_hash(net: sock_net(sk), | 
|---|
| 344 | inet_sk(sk)->inet_rcv_saddr, | 
|---|
| 345 | inet_sk(sk)->inet_num); | 
|---|
| 346 | return inet_lhash2_bucket(h, hash); | 
|---|
| 347 | } | 
|---|
| 348 |  | 
|---|
| 349 | static inline int compute_score(struct sock *sk, const struct net *net, | 
|---|
| 350 | const unsigned short hnum, const __be32 daddr, | 
|---|
| 351 | const int dif, const int sdif) | 
|---|
| 352 | { | 
|---|
| 353 | int score = -1; | 
|---|
| 354 |  | 
|---|
| 355 | if (net_eq(net1: sock_net(sk), net2: net) && sk->sk_num == hnum && | 
|---|
| 356 | !ipv6_only_sock(sk)) { | 
|---|
| 357 | if (sk->sk_rcv_saddr != daddr) | 
|---|
| 358 | return -1; | 
|---|
| 359 |  | 
|---|
| 360 | if (!inet_sk_bound_dev_eq(net, bound_dev_if: sk->sk_bound_dev_if, dif, sdif)) | 
|---|
| 361 | return -1; | 
|---|
| 362 | score =  sk->sk_bound_dev_if ? 2 : 1; | 
|---|
| 363 |  | 
|---|
| 364 | if (sk->sk_family == PF_INET) | 
|---|
| 365 | score++; | 
|---|
| 366 | if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) | 
|---|
| 367 | score++; | 
|---|
| 368 | } | 
|---|
| 369 | return score; | 
|---|
| 370 | } | 
|---|
| 371 |  | 
|---|
| 372 | /** | 
|---|
| 373 | * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. | 
|---|
| 374 | * @net: network namespace. | 
|---|
| 375 | * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. | 
|---|
| 376 | * @skb: context for a potential SK_REUSEPORT program. | 
|---|
| 377 | * @doff: header offset. | 
|---|
| 378 | * @saddr: source address. | 
|---|
| 379 | * @sport: source port. | 
|---|
| 380 | * @daddr: destination address. | 
|---|
| 381 | * @hnum: destination port in host byte order. | 
|---|
| 382 | * @ehashfn: hash function used to generate the fallback hash. | 
|---|
| 383 | * | 
|---|
| 384 | * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to | 
|---|
| 385 | *         the selected sock or an error. | 
|---|
| 386 | */ | 
|---|
| 387 | struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, | 
|---|
| 388 | struct sk_buff *skb, int doff, | 
|---|
| 389 | __be32 saddr, __be16 sport, | 
|---|
| 390 | __be32 daddr, unsigned short hnum, | 
|---|
| 391 | inet_ehashfn_t *ehashfn) | 
|---|
| 392 | { | 
|---|
| 393 | struct sock *reuse_sk = NULL; | 
|---|
| 394 | u32 phash; | 
|---|
| 395 |  | 
|---|
| 396 | if (sk->sk_reuseport) { | 
|---|
| 397 | phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, | 
|---|
| 398 | net, daddr, hnum, saddr, sport); | 
|---|
| 399 | reuse_sk = reuseport_select_sock(sk, hash: phash, skb, hdr_len: doff); | 
|---|
| 400 | } | 
|---|
| 401 | return reuse_sk; | 
|---|
| 402 | } | 
|---|
| 403 | EXPORT_SYMBOL_GPL(inet_lookup_reuseport); | 
|---|
| 404 |  | 
|---|
/*
 * There are some nice properties to exploit here. The BSD API
 * does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */
|---|
| 411 |  | 
|---|
| 412 | /* called with rcu_read_lock() : No refcount taken on the socket */ | 
|---|
| 413 | static struct sock *inet_lhash2_lookup(const struct net *net, | 
|---|
| 414 | struct inet_listen_hashbucket *ilb2, | 
|---|
| 415 | struct sk_buff *skb, int doff, | 
|---|
| 416 | const __be32 saddr, __be16 sport, | 
|---|
| 417 | const __be32 daddr, const unsigned short hnum, | 
|---|
| 418 | const int dif, const int sdif) | 
|---|
| 419 | { | 
|---|
| 420 | struct sock *sk, *result = NULL; | 
|---|
| 421 | struct hlist_nulls_node *node; | 
|---|
| 422 | int score, hiscore = 0; | 
|---|
| 423 |  | 
|---|
| 424 | sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { | 
|---|
| 425 | score = compute_score(sk, net, hnum, daddr, dif, sdif); | 
|---|
| 426 | if (score > hiscore) { | 
|---|
| 427 | result = inet_lookup_reuseport(net, sk, skb, doff, | 
|---|
| 428 | saddr, sport, daddr, hnum, inet_ehashfn); | 
|---|
| 429 | if (result) | 
|---|
| 430 | return result; | 
|---|
| 431 |  | 
|---|
| 432 | result = sk; | 
|---|
| 433 | hiscore = score; | 
|---|
| 434 | } | 
|---|
| 435 | } | 
|---|
| 436 |  | 
|---|
| 437 | return result; | 
|---|
| 438 | } | 
|---|
| 439 |  | 
|---|
| 440 | struct sock *inet_lookup_run_sk_lookup(const struct net *net, | 
|---|
| 441 | int protocol, | 
|---|
| 442 | struct sk_buff *skb, int doff, | 
|---|
| 443 | __be32 saddr, __be16 sport, | 
|---|
| 444 | __be32 daddr, u16 hnum, const int dif, | 
|---|
| 445 | inet_ehashfn_t *ehashfn) | 
|---|
| 446 | { | 
|---|
| 447 | struct sock *sk, *reuse_sk; | 
|---|
| 448 | bool no_reuseport; | 
|---|
| 449 |  | 
|---|
| 450 | no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, | 
|---|
| 451 | daddr, dport: hnum, ifindex: dif, psk: &sk); | 
|---|
| 452 | if (no_reuseport || IS_ERR_OR_NULL(ptr: sk)) | 
|---|
| 453 | return sk; | 
|---|
| 454 |  | 
|---|
| 455 | reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, | 
|---|
| 456 | ehashfn); | 
|---|
| 457 | if (reuse_sk) | 
|---|
| 458 | sk = reuse_sk; | 
|---|
| 459 | return sk; | 
|---|
| 460 | } | 
|---|
| 461 |  | 
|---|
| 462 | struct sock *__inet_lookup_listener(const struct net *net, | 
|---|
| 463 | struct sk_buff *skb, int doff, | 
|---|
| 464 | const __be32 saddr, __be16 sport, | 
|---|
| 465 | const __be32 daddr, const unsigned short hnum, | 
|---|
| 466 | const int dif, const int sdif) | 
|---|
| 467 | { | 
|---|
| 468 | struct inet_listen_hashbucket *ilb2; | 
|---|
| 469 | struct inet_hashinfo *hashinfo; | 
|---|
| 470 | struct sock *result = NULL; | 
|---|
| 471 | unsigned int hash2; | 
|---|
| 472 |  | 
|---|
| 473 | /* Lookup redirect from BPF */ | 
|---|
| 474 | if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { | 
|---|
| 475 | result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, | 
|---|
| 476 | saddr, sport, daddr, hnum, dif, | 
|---|
| 477 | ehashfn: inet_ehashfn); | 
|---|
| 478 | if (result) | 
|---|
| 479 | goto done; | 
|---|
| 480 | } | 
|---|
| 481 |  | 
|---|
| 482 | hashinfo = net->ipv4.tcp_death_row.hashinfo; | 
|---|
| 483 | hash2 = ipv4_portaddr_hash(net, saddr: daddr, port: hnum); | 
|---|
| 484 | ilb2 = inet_lhash2_bucket(h: hashinfo, hash: hash2); | 
|---|
| 485 |  | 
|---|
| 486 | result = inet_lhash2_lookup(net, ilb2, skb, doff, | 
|---|
| 487 | saddr, sport, daddr, hnum, | 
|---|
| 488 | dif, sdif); | 
|---|
| 489 | if (result) | 
|---|
| 490 | goto done; | 
|---|
| 491 |  | 
|---|
| 492 | /* Lookup lhash2 with INADDR_ANY */ | 
|---|
| 493 | hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), port: hnum); | 
|---|
| 494 | ilb2 = inet_lhash2_bucket(h: hashinfo, hash: hash2); | 
|---|
| 495 |  | 
|---|
| 496 | result = inet_lhash2_lookup(net, ilb2, skb, doff, | 
|---|
| 497 | saddr, sport, htonl(INADDR_ANY), hnum, | 
|---|
| 498 | dif, sdif); | 
|---|
| 499 | done: | 
|---|
| 500 | if (IS_ERR(ptr: result)) | 
|---|
| 501 | return NULL; | 
|---|
| 502 | return result; | 
|---|
| 503 | } | 
|---|
| 504 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); | 
|---|
| 505 |  | 
|---|
| 506 | /* All sockets share common refcount, but have different destructors */ | 
|---|
| 507 | void sock_gen_put(struct sock *sk) | 
|---|
| 508 | { | 
|---|
| 509 | if (!refcount_dec_and_test(r: &sk->sk_refcnt)) | 
|---|
| 510 | return; | 
|---|
| 511 |  | 
|---|
| 512 | if (sk->sk_state == TCP_TIME_WAIT) | 
|---|
| 513 | inet_twsk_free(tw: inet_twsk(sk)); | 
|---|
| 514 | else if (sk->sk_state == TCP_NEW_SYN_RECV) | 
|---|
| 515 | reqsk_free(req: inet_reqsk(sk)); | 
|---|
| 516 | else | 
|---|
| 517 | sk_free(sk); | 
|---|
| 518 | } | 
|---|
| 519 | EXPORT_SYMBOL_GPL(sock_gen_put); | 
|---|
| 520 |  | 
|---|
| 521 | void sock_edemux(struct sk_buff *skb) | 
|---|
| 522 | { | 
|---|
| 523 | sock_gen_put(skb->sk); | 
|---|
| 524 | } | 
|---|
| 525 | EXPORT_SYMBOL(sock_edemux); | 
|---|
| 526 |  | 
|---|
| 527 | struct sock *__inet_lookup_established(const struct net *net, | 
|---|
| 528 | const __be32 saddr, const __be16 sport, | 
|---|
| 529 | const __be32 daddr, const u16 hnum, | 
|---|
| 530 | const int dif, const int sdif) | 
|---|
| 531 | { | 
|---|
| 532 | const __portpair ports = INET_COMBINED_PORTS(sport, hnum); | 
|---|
| 533 | INET_ADDR_COOKIE(acookie, saddr, daddr); | 
|---|
| 534 | const struct hlist_nulls_node *node; | 
|---|
| 535 | struct inet_ehash_bucket *head; | 
|---|
| 536 | struct inet_hashinfo *hashinfo; | 
|---|
| 537 | unsigned int hash, slot; | 
|---|
| 538 | struct sock *sk; | 
|---|
| 539 |  | 
|---|
| 540 | hashinfo = net->ipv4.tcp_death_row.hashinfo; | 
|---|
| 541 | hash = inet_ehashfn(net, daddr, hnum, saddr, sport); | 
|---|
| 542 | slot = hash & hashinfo->ehash_mask; | 
|---|
| 543 | head = &hashinfo->ehash[slot]; | 
|---|
| 544 |  | 
|---|
| 545 | begin: | 
|---|
| 546 | sk_nulls_for_each_rcu(sk, node, &head->chain) { | 
|---|
| 547 | if (sk->sk_hash != hash) | 
|---|
| 548 | continue; | 
|---|
| 549 | if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { | 
|---|
| 550 | if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) | 
|---|
| 551 | goto out; | 
|---|
| 552 | if (unlikely(!inet_match(net, sk, acookie, | 
|---|
| 553 | ports, dif, sdif))) { | 
|---|
| 554 | sock_gen_put(sk); | 
|---|
| 555 | goto begin; | 
|---|
| 556 | } | 
|---|
| 557 | goto found; | 
|---|
| 558 | } | 
|---|
| 559 | } | 
|---|
| 560 | /* | 
|---|
| 561 | * if the nulls value we got at the end of this lookup is | 
|---|
| 562 | * not the expected one, we must restart lookup. | 
|---|
| 563 | * We probably met an item that was moved to another chain. | 
|---|
| 564 | */ | 
|---|
| 565 | if (get_nulls_value(ptr: node) != slot) | 
|---|
| 566 | goto begin; | 
|---|
| 567 | out: | 
|---|
| 568 | sk = NULL; | 
|---|
| 569 | found: | 
|---|
| 570 | return sk; | 
|---|
| 571 | } | 
|---|
| 572 | EXPORT_SYMBOL_GPL(__inet_lookup_established); | 
|---|
| 573 |  | 
|---|
| 574 | /* called with local bh disabled */ | 
|---|
| 575 | static int __inet_check_established(struct inet_timewait_death_row *death_row, | 
|---|
| 576 | struct sock *sk, __u16 lport, | 
|---|
| 577 | struct inet_timewait_sock **twp, | 
|---|
| 578 | bool rcu_lookup, | 
|---|
| 579 | u32 hash) | 
|---|
| 580 | { | 
|---|
| 581 | struct inet_hashinfo *hinfo = death_row->hashinfo; | 
|---|
| 582 | struct inet_sock *inet = inet_sk(sk); | 
|---|
| 583 | __be32 daddr = inet->inet_rcv_saddr; | 
|---|
| 584 | __be32 saddr = inet->inet_daddr; | 
|---|
| 585 | int dif = sk->sk_bound_dev_if; | 
|---|
| 586 | struct net *net = sock_net(sk); | 
|---|
| 587 | int sdif = l3mdev_master_ifindex_by_index(net, ifindex: dif); | 
|---|
| 588 | INET_ADDR_COOKIE(acookie, saddr, daddr); | 
|---|
| 589 | const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); | 
|---|
| 590 | struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo: hinfo, hash); | 
|---|
| 591 | struct inet_timewait_sock *tw = NULL; | 
|---|
| 592 | const struct hlist_nulls_node *node; | 
|---|
| 593 | struct sock *sk2; | 
|---|
| 594 | spinlock_t *lock; | 
|---|
| 595 |  | 
|---|
| 596 | if (rcu_lookup) { | 
|---|
| 597 | sk_nulls_for_each(sk2, node, &head->chain) { | 
|---|
| 598 | if (sk2->sk_hash != hash || | 
|---|
| 599 | !inet_match(net, sk: sk2, cookie: acookie, ports, dif, sdif)) | 
|---|
| 600 | continue; | 
|---|
| 601 | if (sk2->sk_state == TCP_TIME_WAIT) | 
|---|
| 602 | break; | 
|---|
| 603 | return -EADDRNOTAVAIL; | 
|---|
| 604 | } | 
|---|
| 605 | return 0; | 
|---|
| 606 | } | 
|---|
| 607 |  | 
|---|
| 608 | lock = inet_ehash_lockp(hashinfo: hinfo, hash); | 
|---|
| 609 | spin_lock(lock); | 
|---|
| 610 |  | 
|---|
| 611 | sk_nulls_for_each(sk2, node, &head->chain) { | 
|---|
| 612 | if (sk2->sk_hash != hash) | 
|---|
| 613 | continue; | 
|---|
| 614 |  | 
|---|
| 615 | if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { | 
|---|
| 616 | if (sk2->sk_state == TCP_TIME_WAIT) { | 
|---|
| 617 | tw = inet_twsk(sk: sk2); | 
|---|
| 618 | if (tcp_twsk_unique(sk, sktw: sk2, twp)) | 
|---|
| 619 | break; | 
|---|
| 620 | } | 
|---|
| 621 | goto not_unique; | 
|---|
| 622 | } | 
|---|
| 623 | } | 
|---|
| 624 |  | 
|---|
| 625 | /* Must record num and sport now. Otherwise we will see | 
|---|
| 626 | * in hash table socket with a funny identity. | 
|---|
| 627 | */ | 
|---|
| 628 | inet->inet_num = lport; | 
|---|
| 629 | inet->inet_sport = htons(lport); | 
|---|
| 630 | sk->sk_hash = hash; | 
|---|
| 631 | WARN_ON(!sk_unhashed(sk)); | 
|---|
| 632 | __sk_nulls_add_node_rcu(sk, list: &head->chain); | 
|---|
| 633 | if (tw) { | 
|---|
| 634 | sk_nulls_del_node_init_rcu(sk: (struct sock *)tw); | 
|---|
| 635 | __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); | 
|---|
| 636 | } | 
|---|
| 637 | spin_unlock(lock); | 
|---|
| 638 | sock_prot_inuse_add(net: sock_net(sk), prot: sk->sk_prot, val: 1); | 
|---|
| 639 |  | 
|---|
| 640 | if (twp) { | 
|---|
| 641 | *twp = tw; | 
|---|
| 642 | } else if (tw) { | 
|---|
| 643 | /* Silly. Should hash-dance instead... */ | 
|---|
| 644 | inet_twsk_deschedule_put(tw); | 
|---|
| 645 | } | 
|---|
| 646 | return 0; | 
|---|
| 647 |  | 
|---|
| 648 | not_unique: | 
|---|
| 649 | spin_unlock(lock); | 
|---|
| 650 | return -EADDRNOTAVAIL; | 
|---|
| 651 | } | 
|---|
| 652 |  | 
|---|
| 653 | static u64 inet_sk_port_offset(const struct sock *sk) | 
|---|
| 654 | { | 
|---|
| 655 | const struct inet_sock *inet = inet_sk(sk); | 
|---|
| 656 |  | 
|---|
| 657 | return secure_ipv4_port_ephemeral(saddr: inet->inet_rcv_saddr, | 
|---|
| 658 | daddr: inet->inet_daddr, | 
|---|
| 659 | dport: inet->inet_dport); | 
|---|
| 660 | } | 
|---|
| 661 |  | 
|---|
| 662 | /* Searches for an exsiting socket in the ehash bucket list. | 
|---|
| 663 | * Returns true if found, false otherwise. | 
|---|
| 664 | */ | 
|---|
| 665 | static bool inet_ehash_lookup_by_sk(struct sock *sk, | 
|---|
| 666 | struct hlist_nulls_head *list) | 
|---|
| 667 | { | 
|---|
| 668 | const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); | 
|---|
| 669 | const int sdif = sk->sk_bound_dev_if; | 
|---|
| 670 | const int dif = sk->sk_bound_dev_if; | 
|---|
| 671 | const struct hlist_nulls_node *node; | 
|---|
| 672 | struct net *net = sock_net(sk); | 
|---|
| 673 | struct sock *esk; | 
|---|
| 674 |  | 
|---|
| 675 | INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); | 
|---|
| 676 |  | 
|---|
| 677 | sk_nulls_for_each_rcu(esk, node, list) { | 
|---|
| 678 | if (esk->sk_hash != sk->sk_hash) | 
|---|
| 679 | continue; | 
|---|
| 680 | if (sk->sk_family == AF_INET) { | 
|---|
| 681 | if (unlikely(inet_match(net, esk, acookie, | 
|---|
| 682 | ports, dif, sdif))) { | 
|---|
| 683 | return true; | 
|---|
| 684 | } | 
|---|
| 685 | } | 
|---|
| 686 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 687 | else if (sk->sk_family == AF_INET6) { | 
|---|
| 688 | if (unlikely(inet6_match(net, esk, | 
|---|
| 689 | &sk->sk_v6_daddr, | 
|---|
| 690 | &sk->sk_v6_rcv_saddr, | 
|---|
| 691 | ports, dif, sdif))) { | 
|---|
| 692 | return true; | 
|---|
| 693 | } | 
|---|
| 694 | } | 
|---|
| 695 | #endif | 
|---|
| 696 | } | 
|---|
| 697 | return false; | 
|---|
| 698 | } | 
|---|
| 699 |  | 
|---|
| 700 | /* Insert a socket into ehash, and eventually remove another one | 
|---|
| 701 | * (The another one can be a SYN_RECV or TIMEWAIT) | 
|---|
| 702 | * If an existing socket already exists, socket sk is not inserted, | 
|---|
| 703 | * and sets found_dup_sk parameter to true. | 
|---|
| 704 | */ | 
|---|
| 705 | bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) | 
|---|
| 706 | { | 
|---|
| 707 | struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); | 
|---|
| 708 | struct inet_ehash_bucket *head; | 
|---|
| 709 | struct hlist_nulls_head *list; | 
|---|
| 710 | spinlock_t *lock; | 
|---|
| 711 | bool ret = true; | 
|---|
| 712 |  | 
|---|
| 713 | WARN_ON_ONCE(!sk_unhashed(sk)); | 
|---|
| 714 |  | 
|---|
| 715 | sk->sk_hash = sk_ehashfn(sk); | 
|---|
| 716 | head = inet_ehash_bucket(hashinfo, hash: sk->sk_hash); | 
|---|
| 717 | list = &head->chain; | 
|---|
| 718 | lock = inet_ehash_lockp(hashinfo, hash: sk->sk_hash); | 
|---|
| 719 |  | 
|---|
| 720 | spin_lock(lock); | 
|---|
| 721 | if (osk) { | 
|---|
| 722 | WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); | 
|---|
| 723 | ret = sk_nulls_del_node_init_rcu(sk: osk); | 
|---|
| 724 | } else if (found_dup_sk) { | 
|---|
| 725 | *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); | 
|---|
| 726 | if (*found_dup_sk) | 
|---|
| 727 | ret = false; | 
|---|
| 728 | } | 
|---|
| 729 |  | 
|---|
| 730 | if (ret) | 
|---|
| 731 | __sk_nulls_add_node_rcu(sk, list); | 
|---|
| 732 |  | 
|---|
| 733 | spin_unlock(lock); | 
|---|
| 734 |  | 
|---|
| 735 | return ret; | 
|---|
| 736 | } | 
|---|
| 737 |  | 
|---|
| 738 | bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) | 
|---|
| 739 | { | 
|---|
| 740 | bool ok = inet_ehash_insert(sk, osk, found_dup_sk); | 
|---|
| 741 |  | 
|---|
| 742 | if (ok) { | 
|---|
| 743 | sock_prot_inuse_add(net: sock_net(sk), prot: sk->sk_prot, val: 1); | 
|---|
| 744 | } else { | 
|---|
| 745 | tcp_orphan_count_inc(); | 
|---|
| 746 | inet_sk_set_state(sk, state: TCP_CLOSE); | 
|---|
| 747 | sock_set_flag(sk, flag: SOCK_DEAD); | 
|---|
| 748 | inet_csk_destroy_sock(sk); | 
|---|
| 749 | } | 
|---|
| 750 | return ok; | 
|---|
| 751 | } | 
|---|
| 752 | EXPORT_IPV6_MOD(inet_ehash_nolisten); | 
|---|
| 753 |  | 
|---|
| 754 | static int inet_reuseport_add_sock(struct sock *sk, | 
|---|
| 755 | struct inet_listen_hashbucket *ilb) | 
|---|
| 756 | { | 
|---|
| 757 | struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; | 
|---|
| 758 | const struct hlist_nulls_node *node; | 
|---|
| 759 | kuid_t uid = sk_uid(sk); | 
|---|
| 760 | struct sock *sk2; | 
|---|
| 761 |  | 
|---|
| 762 | sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { | 
|---|
| 763 | if (sk2 != sk && | 
|---|
| 764 | sk2->sk_family == sk->sk_family && | 
|---|
| 765 | ipv6_only_sock(sk2) == ipv6_only_sock(sk) && | 
|---|
| 766 | sk2->sk_bound_dev_if == sk->sk_bound_dev_if && | 
|---|
| 767 | inet_csk(sk2)->icsk_bind_hash == tb && | 
|---|
| 768 | sk2->sk_reuseport && uid_eq(left: uid, right: sk_uid(sk: sk2)) && | 
|---|
| 769 | inet_rcv_saddr_equal(sk, sk2, match_wildcard: false)) | 
|---|
| 770 | return reuseport_add_sock(sk, sk2, | 
|---|
| 771 | bind_inany: inet_rcv_saddr_any(sk)); | 
|---|
| 772 | } | 
|---|
| 773 |  | 
|---|
| 774 | return reuseport_alloc(sk, bind_inany: inet_rcv_saddr_any(sk)); | 
|---|
| 775 | } | 
|---|
| 776 |  | 
|---|
| 777 | int inet_hash(struct sock *sk) | 
|---|
| 778 | { | 
|---|
| 779 | struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); | 
|---|
| 780 | struct inet_listen_hashbucket *ilb2; | 
|---|
| 781 | int err = 0; | 
|---|
| 782 |  | 
|---|
| 783 | if (sk->sk_state == TCP_CLOSE) | 
|---|
| 784 | return 0; | 
|---|
| 785 |  | 
|---|
| 786 | if (sk->sk_state != TCP_LISTEN) { | 
|---|
| 787 | local_bh_disable(); | 
|---|
| 788 | inet_ehash_nolisten(sk, NULL, NULL); | 
|---|
| 789 | local_bh_enable(); | 
|---|
| 790 | return 0; | 
|---|
| 791 | } | 
|---|
| 792 | WARN_ON(!sk_unhashed(sk)); | 
|---|
| 793 | ilb2 = inet_lhash2_bucket_sk(h: hashinfo, sk); | 
|---|
| 794 |  | 
|---|
| 795 | spin_lock(lock: &ilb2->lock); | 
|---|
| 796 | if (sk->sk_reuseport) { | 
|---|
| 797 | err = inet_reuseport_add_sock(sk, ilb: ilb2); | 
|---|
| 798 | if (err) | 
|---|
| 799 | goto unlock; | 
|---|
| 800 | } | 
|---|
| 801 | sock_set_flag(sk, flag: SOCK_RCU_FREE); | 
|---|
| 802 | if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && | 
|---|
| 803 | sk->sk_family == AF_INET6) | 
|---|
| 804 | __sk_nulls_add_node_tail_rcu(sk, list: &ilb2->nulls_head); | 
|---|
| 805 | else | 
|---|
| 806 | __sk_nulls_add_node_rcu(sk, list: &ilb2->nulls_head); | 
|---|
| 807 | sock_prot_inuse_add(net: sock_net(sk), prot: sk->sk_prot, val: 1); | 
|---|
| 808 | unlock: | 
|---|
| 809 | spin_unlock(lock: &ilb2->lock); | 
|---|
| 810 |  | 
|---|
| 811 | return err; | 
|---|
| 812 | } | 
|---|
| 813 | EXPORT_IPV6_MOD(inet_hash); | 
|---|
| 814 |  | 
|---|
| 815 | void inet_unhash(struct sock *sk) | 
|---|
| 816 | { | 
|---|
| 817 | struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); | 
|---|
| 818 |  | 
|---|
| 819 | if (sk_unhashed(sk)) | 
|---|
| 820 | return; | 
|---|
| 821 |  | 
|---|
| 822 | sock_rps_delete_flow(sk); | 
|---|
| 823 | if (sk->sk_state == TCP_LISTEN) { | 
|---|
| 824 | struct inet_listen_hashbucket *ilb2; | 
|---|
| 825 |  | 
|---|
| 826 | ilb2 = inet_lhash2_bucket_sk(h: hashinfo, sk); | 
|---|
| 827 | /* Don't disable bottom halves while acquiring the lock to | 
|---|
| 828 | * avoid circular locking dependency on PREEMPT_RT. | 
|---|
| 829 | */ | 
|---|
| 830 | spin_lock(lock: &ilb2->lock); | 
|---|
| 831 | if (rcu_access_pointer(sk->sk_reuseport_cb)) | 
|---|
| 832 | reuseport_stop_listen_sock(sk); | 
|---|
| 833 |  | 
|---|
| 834 | __sk_nulls_del_node_init_rcu(sk); | 
|---|
| 835 | sock_prot_inuse_add(net: sock_net(sk), prot: sk->sk_prot, val: -1); | 
|---|
| 836 | spin_unlock(lock: &ilb2->lock); | 
|---|
| 837 | } else { | 
|---|
| 838 | spinlock_t *lock = inet_ehash_lockp(hashinfo, hash: sk->sk_hash); | 
|---|
| 839 |  | 
|---|
| 840 | spin_lock_bh(lock); | 
|---|
| 841 | __sk_nulls_del_node_init_rcu(sk); | 
|---|
| 842 | sock_prot_inuse_add(net: sock_net(sk), prot: sk->sk_prot, val: -1); | 
|---|
| 843 | spin_unlock_bh(lock); | 
|---|
| 844 | } | 
|---|
| 845 | } | 
|---|
| 846 | EXPORT_IPV6_MOD(inet_unhash); | 
|---|
| 847 |  | 
|---|
| 848 | static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, | 
|---|
| 849 | const struct net *net, unsigned short port, | 
|---|
| 850 | int l3mdev, const struct sock *sk) | 
|---|
| 851 | { | 
|---|
| 852 | if (!net_eq(net1: ib2_net(ib: tb), net2: net) || tb->port != port || | 
|---|
| 853 | tb->l3mdev != l3mdev) | 
|---|
| 854 | return false; | 
|---|
| 855 |  | 
|---|
| 856 | return inet_bind2_bucket_addr_match(tb2: tb, sk); | 
|---|
| 857 | } | 
|---|
| 858 |  | 
|---|
| 859 | bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, | 
|---|
| 860 | unsigned short port, int l3mdev, const struct sock *sk) | 
|---|
| 861 | { | 
|---|
| 862 | if (!net_eq(net1: ib2_net(ib: tb), net2: net) || tb->port != port || | 
|---|
| 863 | tb->l3mdev != l3mdev) | 
|---|
| 864 | return false; | 
|---|
| 865 |  | 
|---|
| 866 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 867 | if (tb->addr_type == IPV6_ADDR_ANY) | 
|---|
| 868 | return true; | 
|---|
| 869 |  | 
|---|
| 870 | if (tb->addr_type != IPV6_ADDR_MAPPED) | 
|---|
| 871 | return false; | 
|---|
| 872 |  | 
|---|
| 873 | if (sk->sk_family == AF_INET6 && | 
|---|
| 874 | !ipv6_addr_v4mapped(a: &sk->sk_v6_rcv_saddr)) | 
|---|
| 875 | return false; | 
|---|
| 876 | #endif | 
|---|
| 877 | return tb->rcv_saddr == 0; | 
|---|
| 878 | } | 
|---|
| 879 |  | 
|---|
| 880 | /* The socket's bhash2 hashbucket spinlock must be held when this is called */ | 
|---|
| 881 | struct inet_bind2_bucket * | 
|---|
| 882 | inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, | 
|---|
| 883 | unsigned short port, int l3mdev, const struct sock *sk) | 
|---|
| 884 | { | 
|---|
| 885 | struct inet_bind2_bucket *bhash2 = NULL; | 
|---|
| 886 |  | 
|---|
| 887 | inet_bind_bucket_for_each(bhash2, &head->chain) | 
|---|
| 888 | if (inet_bind2_bucket_match(tb: bhash2, net, port, l3mdev, sk)) | 
|---|
| 889 | break; | 
|---|
| 890 |  | 
|---|
| 891 | return bhash2; | 
|---|
| 892 | } | 
|---|
| 893 |  | 
|---|
| 894 | struct inet_bind_hashbucket * | 
|---|
| 895 | inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) | 
|---|
| 896 | { | 
|---|
| 897 | struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); | 
|---|
| 898 | u32 hash; | 
|---|
| 899 |  | 
|---|
| 900 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 901 | if (sk->sk_family == AF_INET6) | 
|---|
| 902 | hash = ipv6_portaddr_hash(net, addr6: &in6addr_any, port); | 
|---|
| 903 | else | 
|---|
| 904 | #endif | 
|---|
| 905 | hash = ipv4_portaddr_hash(net, saddr: 0, port); | 
|---|
| 906 |  | 
|---|
| 907 | return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; | 
|---|
| 908 | } | 
|---|
| 909 |  | 
|---|
| 910 | static void inet_update_saddr(struct sock *sk, void *saddr, int family) | 
|---|
| 911 | { | 
|---|
| 912 | if (family == AF_INET) { | 
|---|
| 913 | inet_sk(sk)->inet_saddr = *(__be32 *)saddr; | 
|---|
| 914 | sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); | 
|---|
| 915 | } | 
|---|
| 916 | #if IS_ENABLED(CONFIG_IPV6) | 
|---|
| 917 | else { | 
|---|
| 918 | sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; | 
|---|
| 919 | } | 
|---|
| 920 | #endif | 
|---|
| 921 | } | 
|---|
| 922 |  | 
|---|
| 923 | static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) | 
|---|
| 924 | { | 
|---|
| 925 | struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); | 
|---|
| 926 | struct inet_bind_hashbucket *head, *head2; | 
|---|
| 927 | struct inet_bind2_bucket *tb2, *new_tb2; | 
|---|
| 928 | int l3mdev = inet_sk_bound_l3mdev(sk); | 
|---|
| 929 | int port = inet_sk(sk)->inet_num; | 
|---|
| 930 | struct net *net = sock_net(sk); | 
|---|
| 931 | int bhash; | 
|---|
| 932 |  | 
|---|
| 933 | if (!inet_csk(sk)->icsk_bind2_hash) { | 
|---|
| 934 | /* Not bind()ed before. */ | 
|---|
| 935 | if (reset) | 
|---|
| 936 | inet_reset_saddr(sk); | 
|---|
| 937 | else | 
|---|
| 938 | inet_update_saddr(sk, saddr, family); | 
|---|
| 939 |  | 
|---|
| 940 | return 0; | 
|---|
| 941 | } | 
|---|
| 942 |  | 
|---|
| 943 | /* Allocate a bind2 bucket ahead of time to avoid permanently putting | 
|---|
| 944 | * the bhash2 table in an inconsistent state if a new tb2 bucket | 
|---|
| 945 | * allocation fails. | 
|---|
| 946 | */ | 
|---|
| 947 | new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); | 
|---|
| 948 | if (!new_tb2) { | 
|---|
| 949 | if (reset) { | 
|---|
| 950 | /* The (INADDR_ANY, port) bucket might have already | 
|---|
| 951 | * been freed, then we cannot fixup icsk_bind2_hash, | 
|---|
| 952 | * so we give up and unlink sk from bhash/bhash2 not | 
|---|
| 953 | * to leave inconsistency in bhash2. | 
|---|
| 954 | */ | 
|---|
| 955 | inet_put_port(sk); | 
|---|
| 956 | inet_reset_saddr(sk); | 
|---|
| 957 | } | 
|---|
| 958 |  | 
|---|
| 959 | return -ENOMEM; | 
|---|
| 960 | } | 
|---|
| 961 |  | 
|---|
| 962 | bhash = inet_bhashfn(net, lport: port, bhash_size: hinfo->bhash_size); | 
|---|
| 963 | head = &hinfo->bhash[bhash]; | 
|---|
| 964 | head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); | 
|---|
| 965 |  | 
|---|
| 966 | /* If we change saddr locklessly, another thread | 
|---|
| 967 | * iterating over bhash might see corrupted address. | 
|---|
| 968 | */ | 
|---|
| 969 | spin_lock_bh(lock: &head->lock); | 
|---|
| 970 |  | 
|---|
| 971 | spin_lock(lock: &head2->lock); | 
|---|
| 972 | __sk_del_bind_node(sk); | 
|---|
| 973 | inet_bind2_bucket_destroy(cachep: hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); | 
|---|
| 974 | spin_unlock(lock: &head2->lock); | 
|---|
| 975 |  | 
|---|
| 976 | if (reset) | 
|---|
| 977 | inet_reset_saddr(sk); | 
|---|
| 978 | else | 
|---|
| 979 | inet_update_saddr(sk, saddr, family); | 
|---|
| 980 |  | 
|---|
| 981 | head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); | 
|---|
| 982 |  | 
|---|
| 983 | spin_lock(lock: &head2->lock); | 
|---|
| 984 | tb2 = inet_bind2_bucket_find(head: head2, net, port, l3mdev, sk); | 
|---|
| 985 | if (!tb2) { | 
|---|
| 986 | tb2 = new_tb2; | 
|---|
| 987 | inet_bind2_bucket_init(tb2, net, head: head2, inet_csk(sk)->icsk_bind_hash, sk); | 
|---|
| 988 | if (sk_is_connect_bind(sk)) { | 
|---|
| 989 | tb2->fastreuse = -1; | 
|---|
| 990 | tb2->fastreuseport = -1; | 
|---|
| 991 | } | 
|---|
| 992 | } | 
|---|
| 993 | inet_csk(sk)->icsk_bind2_hash = tb2; | 
|---|
| 994 | sk_add_bind_node(sk, list: &tb2->owners); | 
|---|
| 995 | spin_unlock(lock: &head2->lock); | 
|---|
| 996 |  | 
|---|
| 997 | spin_unlock_bh(lock: &head->lock); | 
|---|
| 998 |  | 
|---|
| 999 | if (tb2 != new_tb2) | 
|---|
| 1000 | kmem_cache_free(s: hinfo->bind2_bucket_cachep, objp: new_tb2); | 
|---|
| 1001 |  | 
|---|
| 1002 | return 0; | 
|---|
| 1003 | } | 
|---|
| 1004 |  | 
|---|
| 1005 | int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) | 
|---|
| 1006 | { | 
|---|
| 1007 | return __inet_bhash2_update_saddr(sk, saddr, family, reset: false); | 
|---|
| 1008 | } | 
|---|
| 1009 | EXPORT_IPV6_MOD(inet_bhash2_update_saddr); | 
|---|
| 1010 |  | 
|---|
| 1011 | void inet_bhash2_reset_saddr(struct sock *sk) | 
|---|
| 1012 | { | 
|---|
| 1013 | if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) | 
|---|
| 1014 | __inet_bhash2_update_saddr(sk, NULL, family: 0, reset: true); | 
|---|
| 1015 | } | 
|---|
| 1016 | EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); | 
|---|
| 1017 |  | 
|---|
| 1018 | /* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm | 
|---|
| 1019 | * Note that we use 32bit integers (vs RFC 'short integers') | 
|---|
| 1020 | * because 2^16 is not a multiple of num_ephemeral and this | 
|---|
| 1021 | * property might be used by clever attacker. | 
|---|
| 1022 | * | 
|---|
| 1023 | * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though | 
|---|
| 1024 | * attacks were since demonstrated, thus we use 65536 by default instead | 
|---|
| 1025 | * to really give more isolation and privacy, at the expense of 256kB | 
|---|
| 1026 | * of kernel memory. | 
|---|
| 1027 | */ | 
|---|
| 1028 | #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) | 
|---|
| 1029 | static u32 *table_perturb; | 
|---|
| 1030 |  | 
|---|
| 1031 | int __inet_hash_connect(struct inet_timewait_death_row *death_row, | 
|---|
| 1032 | struct sock *sk, u64 port_offset, | 
|---|
| 1033 | u32 hash_port0, | 
|---|
| 1034 | int (*check_established)(struct inet_timewait_death_row *, | 
|---|
| 1035 | struct sock *, __u16, struct inet_timewait_sock **, | 
|---|
| 1036 | bool rcu_lookup, u32 hash)) | 
|---|
| 1037 | { | 
|---|
| 1038 | struct inet_hashinfo *hinfo = death_row->hashinfo; | 
|---|
| 1039 | struct inet_bind_hashbucket *head, *head2; | 
|---|
| 1040 | struct inet_timewait_sock *tw = NULL; | 
|---|
| 1041 | int port = inet_sk(sk)->inet_num; | 
|---|
| 1042 | struct net *net = sock_net(sk); | 
|---|
| 1043 | struct inet_bind2_bucket *tb2; | 
|---|
| 1044 | struct inet_bind_bucket *tb; | 
|---|
| 1045 | bool tb_created = false; | 
|---|
| 1046 | u32 remaining, offset; | 
|---|
| 1047 | int ret, i, low, high; | 
|---|
| 1048 | bool local_ports; | 
|---|
| 1049 | int step, l3mdev; | 
|---|
| 1050 | u32 index; | 
|---|
| 1051 |  | 
|---|
| 1052 | if (port) { | 
|---|
| 1053 | local_bh_disable(); | 
|---|
| 1054 | ret = check_established(death_row, sk, port, NULL, false, | 
|---|
| 1055 | hash_port0 + port); | 
|---|
| 1056 | local_bh_enable(); | 
|---|
| 1057 | return ret; | 
|---|
| 1058 | } | 
|---|
| 1059 |  | 
|---|
| 1060 | l3mdev = inet_sk_bound_l3mdev(sk); | 
|---|
| 1061 |  | 
|---|
| 1062 | local_ports = inet_sk_get_local_port_range(sk, low: &low, high: &high); | 
|---|
| 1063 | step = local_ports ? 1 : 2; | 
|---|
| 1064 |  | 
|---|
| 1065 | high++; /* [32768, 60999] -> [32768, 61000[ */ | 
|---|
| 1066 | remaining = high - low; | 
|---|
| 1067 | if (!local_ports && remaining > 1) | 
|---|
| 1068 | remaining &= ~1U; | 
|---|
| 1069 |  | 
|---|
| 1070 | get_random_sleepable_once(table_perturb, | 
|---|
| 1071 | INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); | 
|---|
| 1072 | index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); | 
|---|
| 1073 |  | 
|---|
| 1074 | offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); | 
|---|
| 1075 | offset %= remaining; | 
|---|
| 1076 |  | 
|---|
| 1077 | /* In first pass we try ports of @low parity. | 
|---|
| 1078 | * inet_csk_get_port() does the opposite choice. | 
|---|
| 1079 | */ | 
|---|
| 1080 | if (!local_ports) | 
|---|
| 1081 | offset &= ~1U; | 
|---|
| 1082 | other_parity_scan: | 
|---|
| 1083 | port = low + offset; | 
|---|
| 1084 | for (i = 0; i < remaining; i += step, port += step) { | 
|---|
| 1085 | if (unlikely(port >= high)) | 
|---|
| 1086 | port -= remaining; | 
|---|
| 1087 | if (inet_is_local_reserved_port(net, port)) | 
|---|
| 1088 | continue; | 
|---|
| 1089 | head = &hinfo->bhash[inet_bhashfn(net, lport: port, | 
|---|
| 1090 | bhash_size: hinfo->bhash_size)]; | 
|---|
| 1091 | rcu_read_lock(); | 
|---|
| 1092 | hlist_for_each_entry_rcu(tb, &head->chain, node) { | 
|---|
| 1093 | if (!inet_bind_bucket_match(tb, net, port, l3mdev)) | 
|---|
| 1094 | continue; | 
|---|
| 1095 | if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { | 
|---|
| 1096 | rcu_read_unlock(); | 
|---|
| 1097 | goto next_port; | 
|---|
| 1098 | } | 
|---|
| 1099 | if (!check_established(death_row, sk, port, &tw, true, | 
|---|
| 1100 | hash_port0 + port)) | 
|---|
| 1101 | break; | 
|---|
| 1102 | rcu_read_unlock(); | 
|---|
| 1103 | goto next_port; | 
|---|
| 1104 | } | 
|---|
| 1105 | rcu_read_unlock(); | 
|---|
| 1106 |  | 
|---|
| 1107 | spin_lock_bh(lock: &head->lock); | 
|---|
| 1108 |  | 
|---|
| 1109 | /* Does not bother with rcv_saddr checks, because | 
|---|
| 1110 | * the established check is already unique enough. | 
|---|
| 1111 | */ | 
|---|
| 1112 | inet_bind_bucket_for_each(tb, &head->chain) { | 
|---|
| 1113 | if (inet_bind_bucket_match(tb, net, port, l3mdev)) { | 
|---|
| 1114 | if (tb->fastreuse >= 0 || | 
|---|
| 1115 | tb->fastreuseport >= 0) | 
|---|
| 1116 | goto next_port_unlock; | 
|---|
| 1117 | WARN_ON(hlist_empty(&tb->bhash2)); | 
|---|
| 1118 | if (!check_established(death_row, sk, | 
|---|
| 1119 | port, &tw, false, | 
|---|
| 1120 | hash_port0 + port)) | 
|---|
| 1121 | goto ok; | 
|---|
| 1122 | goto next_port_unlock; | 
|---|
| 1123 | } | 
|---|
| 1124 | } | 
|---|
| 1125 |  | 
|---|
| 1126 | tb = inet_bind_bucket_create(cachep: hinfo->bind_bucket_cachep, | 
|---|
| 1127 | net, head, snum: port, l3mdev); | 
|---|
| 1128 | if (!tb) { | 
|---|
| 1129 | spin_unlock_bh(lock: &head->lock); | 
|---|
| 1130 | return -ENOMEM; | 
|---|
| 1131 | } | 
|---|
| 1132 | tb_created = true; | 
|---|
| 1133 | tb->fastreuse = -1; | 
|---|
| 1134 | tb->fastreuseport = -1; | 
|---|
| 1135 | goto ok; | 
|---|
| 1136 | next_port_unlock: | 
|---|
| 1137 | spin_unlock_bh(lock: &head->lock); | 
|---|
| 1138 | next_port: | 
|---|
| 1139 | cond_resched(); | 
|---|
| 1140 | } | 
|---|
| 1141 |  | 
|---|
| 1142 | if (!local_ports) { | 
|---|
| 1143 | offset++; | 
|---|
| 1144 | if ((offset & 1) && remaining > 1) | 
|---|
| 1145 | goto other_parity_scan; | 
|---|
| 1146 | } | 
|---|
| 1147 | return -EADDRNOTAVAIL; | 
|---|
| 1148 |  | 
|---|
| 1149 | ok: | 
|---|
| 1150 | /* Find the corresponding tb2 bucket since we need to | 
|---|
| 1151 | * add the socket to the bhash2 table as well | 
|---|
| 1152 | */ | 
|---|
| 1153 | head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); | 
|---|
| 1154 | spin_lock(lock: &head2->lock); | 
|---|
| 1155 |  | 
|---|
| 1156 | tb2 = inet_bind2_bucket_find(head: head2, net, port, l3mdev, sk); | 
|---|
| 1157 | if (!tb2) { | 
|---|
| 1158 | tb2 = inet_bind2_bucket_create(cachep: hinfo->bind2_bucket_cachep, net, | 
|---|
| 1159 | head: head2, tb, sk); | 
|---|
| 1160 | if (!tb2) | 
|---|
| 1161 | goto error; | 
|---|
| 1162 | tb2->fastreuse = -1; | 
|---|
| 1163 | tb2->fastreuseport = -1; | 
|---|
| 1164 | } | 
|---|
| 1165 |  | 
|---|
| 1166 | /* Here we want to add a little bit of randomness to the next source | 
|---|
| 1167 | * port that will be chosen. We use a max() with a random here so that | 
|---|
| 1168 | * on low contention the randomness is maximal and on high contention | 
|---|
| 1169 | * it may be inexistent. | 
|---|
| 1170 | */ | 
|---|
| 1171 | i = max_t(int, i, get_random_u32_below(8) * step); | 
|---|
| 1172 | WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); | 
|---|
| 1173 |  | 
|---|
| 1174 | /* Head lock still held and bh's disabled */ | 
|---|
| 1175 | inet_bind_hash(sk, tb, tb2, port); | 
|---|
| 1176 | sk->sk_userlocks |= SOCK_CONNECT_BIND; | 
|---|
| 1177 |  | 
|---|
| 1178 | if (sk_unhashed(sk)) { | 
|---|
| 1179 | inet_sk(sk)->inet_sport = htons(port); | 
|---|
| 1180 | inet_ehash_nolisten(sk, osk: (struct sock *)tw, NULL); | 
|---|
| 1181 | } | 
|---|
| 1182 | if (tw) | 
|---|
| 1183 | inet_twsk_bind_unhash(tw, hashinfo: hinfo); | 
|---|
| 1184 |  | 
|---|
| 1185 | spin_unlock(lock: &head2->lock); | 
|---|
| 1186 | spin_unlock(lock: &head->lock); | 
|---|
| 1187 |  | 
|---|
| 1188 | if (tw) | 
|---|
| 1189 | inet_twsk_deschedule_put(tw); | 
|---|
| 1190 | local_bh_enable(); | 
|---|
| 1191 | return 0; | 
|---|
| 1192 |  | 
|---|
| 1193 | error: | 
|---|
| 1194 | if (sk_hashed(sk)) { | 
|---|
| 1195 | spinlock_t *lock = inet_ehash_lockp(hashinfo: hinfo, hash: sk->sk_hash); | 
|---|
| 1196 |  | 
|---|
| 1197 | sock_prot_inuse_add(net, prot: sk->sk_prot, val: -1); | 
|---|
| 1198 |  | 
|---|
| 1199 | spin_lock(lock); | 
|---|
| 1200 | __sk_nulls_del_node_init_rcu(sk); | 
|---|
| 1201 | spin_unlock(lock); | 
|---|
| 1202 |  | 
|---|
| 1203 | sk->sk_hash = 0; | 
|---|
| 1204 | inet_sk(sk)->inet_sport = 0; | 
|---|
| 1205 | inet_sk(sk)->inet_num = 0; | 
|---|
| 1206 |  | 
|---|
| 1207 | if (tw) | 
|---|
| 1208 | inet_twsk_bind_unhash(tw, hashinfo: hinfo); | 
|---|
| 1209 | } | 
|---|
| 1210 |  | 
|---|
| 1211 | spin_unlock(lock: &head2->lock); | 
|---|
| 1212 | if (tb_created) | 
|---|
| 1213 | inet_bind_bucket_destroy(tb); | 
|---|
| 1214 | spin_unlock(lock: &head->lock); | 
|---|
| 1215 |  | 
|---|
| 1216 | if (tw) | 
|---|
| 1217 | inet_twsk_deschedule_put(tw); | 
|---|
| 1218 |  | 
|---|
| 1219 | local_bh_enable(); | 
|---|
| 1220 |  | 
|---|
| 1221 | return -ENOMEM; | 
|---|
| 1222 | } | 
|---|
| 1223 |  | 
|---|
| 1224 | /* | 
|---|
| 1225 | * Bind a port for a connect operation and hash it. | 
|---|
| 1226 | */ | 
|---|
| 1227 | int inet_hash_connect(struct inet_timewait_death_row *death_row, | 
|---|
| 1228 | struct sock *sk) | 
|---|
| 1229 | { | 
|---|
| 1230 | const struct inet_sock *inet = inet_sk(sk); | 
|---|
| 1231 | const struct net *net = sock_net(sk); | 
|---|
| 1232 | u64 port_offset = 0; | 
|---|
| 1233 | u32 hash_port0; | 
|---|
| 1234 |  | 
|---|
| 1235 | if (!inet_sk(sk)->inet_num) | 
|---|
| 1236 | port_offset = inet_sk_port_offset(sk); | 
|---|
| 1237 |  | 
|---|
| 1238 | hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, | 
|---|
| 1239 | inet->inet_daddr, inet->inet_dport); | 
|---|
| 1240 |  | 
|---|
| 1241 | return __inet_hash_connect(death_row, sk, port_offset, hash_port0, | 
|---|
| 1242 | check_established: __inet_check_established); | 
|---|
| 1243 | } | 
|---|
| 1244 |  | 
|---|
| 1245 | static void init_hashinfo_lhash2(struct inet_hashinfo *h) | 
|---|
| 1246 | { | 
|---|
| 1247 | int i; | 
|---|
| 1248 |  | 
|---|
| 1249 | for (i = 0; i <= h->lhash2_mask; i++) { | 
|---|
| 1250 | spin_lock_init(&h->lhash2[i].lock); | 
|---|
| 1251 | INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, | 
|---|
| 1252 | i + LISTENING_NULLS_BASE); | 
|---|
| 1253 | } | 
|---|
| 1254 | } | 
|---|
| 1255 |  | 
|---|
| 1256 | void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, | 
|---|
| 1257 | unsigned long numentries, int scale, | 
|---|
| 1258 | unsigned long low_limit, | 
|---|
| 1259 | unsigned long high_limit) | 
|---|
| 1260 | { | 
|---|
| 1261 | h->lhash2 = alloc_large_system_hash(tablename: name, | 
|---|
| 1262 | bucketsize: sizeof(*h->lhash2), | 
|---|
| 1263 | numentries, | 
|---|
| 1264 | scale, | 
|---|
| 1265 | flags: 0, | 
|---|
| 1266 | NULL, | 
|---|
| 1267 | hash_mask: &h->lhash2_mask, | 
|---|
| 1268 | low_limit, | 
|---|
| 1269 | high_limit); | 
|---|
| 1270 | init_hashinfo_lhash2(h); | 
|---|
| 1271 |  | 
|---|
| 1272 | /* this one is used for source ports of outgoing connections */ | 
|---|
| 1273 | table_perturb = alloc_large_system_hash(tablename: "Table-perturb", | 
|---|
| 1274 | bucketsize: sizeof(*table_perturb), | 
|---|
| 1275 | INET_TABLE_PERTURB_SIZE, | 
|---|
| 1276 | scale: 0, flags: 0, NULL, NULL, | 
|---|
| 1277 | INET_TABLE_PERTURB_SIZE, | 
|---|
| 1278 | INET_TABLE_PERTURB_SIZE); | 
|---|
| 1279 | } | 
|---|
| 1280 |  | 
|---|
| 1281 | int inet_hashinfo2_init_mod(struct inet_hashinfo *h) | 
|---|
| 1282 | { | 
|---|
| 1283 | h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); | 
|---|
| 1284 | if (!h->lhash2) | 
|---|
| 1285 | return -ENOMEM; | 
|---|
| 1286 |  | 
|---|
| 1287 | h->lhash2_mask = INET_LHTABLE_SIZE - 1; | 
|---|
| 1288 | /* INET_LHTABLE_SIZE must be a power of 2 */ | 
|---|
| 1289 | BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); | 
|---|
| 1290 |  | 
|---|
| 1291 | init_hashinfo_lhash2(h); | 
|---|
| 1292 | return 0; | 
|---|
| 1293 | } | 
|---|
| 1294 |  | 
|---|
| 1295 | int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) | 
|---|
| 1296 | { | 
|---|
| 1297 | unsigned int locksz = sizeof(spinlock_t); | 
|---|
| 1298 | unsigned int i, nblocks = 1; | 
|---|
| 1299 | spinlock_t *ptr = NULL; | 
|---|
| 1300 |  | 
|---|
| 1301 | if (locksz == 0) | 
|---|
| 1302 | goto set_mask; | 
|---|
| 1303 |  | 
|---|
| 1304 | /* Allocate 2 cache lines or at least one spinlock per cpu. */ | 
|---|
| 1305 | nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); | 
|---|
| 1306 |  | 
|---|
| 1307 | /* At least one page per NUMA node. */ | 
|---|
| 1308 | nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); | 
|---|
| 1309 |  | 
|---|
| 1310 | nblocks = roundup_pow_of_two(nblocks); | 
|---|
| 1311 |  | 
|---|
| 1312 | /* No more locks than number of hash buckets. */ | 
|---|
| 1313 | nblocks = min(nblocks, hashinfo->ehash_mask + 1); | 
|---|
| 1314 |  | 
|---|
| 1315 | if (num_online_nodes() > 1) { | 
|---|
| 1316 | /* Use vmalloc() to allow NUMA policy to spread pages | 
|---|
| 1317 | * on all available nodes if desired. | 
|---|
| 1318 | */ | 
|---|
| 1319 | ptr = vmalloc_array(nblocks, locksz); | 
|---|
| 1320 | } | 
|---|
| 1321 | if (!ptr) { | 
|---|
| 1322 | ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); | 
|---|
| 1323 | if (!ptr) | 
|---|
| 1324 | return -ENOMEM; | 
|---|
| 1325 | } | 
|---|
| 1326 | for (i = 0; i < nblocks; i++) | 
|---|
| 1327 | spin_lock_init(&ptr[i]); | 
|---|
| 1328 | hashinfo->ehash_locks = ptr; | 
|---|
| 1329 | set_mask: | 
|---|
| 1330 | hashinfo->ehash_locks_mask = nblocks - 1; | 
|---|
| 1331 | return 0; | 
|---|
| 1332 | } | 
|---|
| 1333 |  | 
|---|
| 1334 | struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, | 
|---|
| 1335 | unsigned int ehash_entries) | 
|---|
| 1336 | { | 
|---|
| 1337 | struct inet_hashinfo *new_hashinfo; | 
|---|
| 1338 | int i; | 
|---|
| 1339 |  | 
|---|
| 1340 | new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); | 
|---|
| 1341 | if (!new_hashinfo) | 
|---|
| 1342 | goto err; | 
|---|
| 1343 |  | 
|---|
| 1344 | new_hashinfo->ehash = vmalloc_huge(size: ehash_entries * sizeof(struct inet_ehash_bucket), | 
|---|
| 1345 | GFP_KERNEL_ACCOUNT); | 
|---|
| 1346 | if (!new_hashinfo->ehash) | 
|---|
| 1347 | goto free_hashinfo; | 
|---|
| 1348 |  | 
|---|
| 1349 | new_hashinfo->ehash_mask = ehash_entries - 1; | 
|---|
| 1350 |  | 
|---|
| 1351 | if (inet_ehash_locks_alloc(hashinfo: new_hashinfo)) | 
|---|
| 1352 | goto free_ehash; | 
|---|
| 1353 |  | 
|---|
| 1354 | for (i = 0; i < ehash_entries; i++) | 
|---|
| 1355 | INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); | 
|---|
| 1356 |  | 
|---|
| 1357 | new_hashinfo->pernet = true; | 
|---|
| 1358 |  | 
|---|
| 1359 | return new_hashinfo; | 
|---|
| 1360 |  | 
|---|
| 1361 | free_ehash: | 
|---|
| 1362 | vfree(addr: new_hashinfo->ehash); | 
|---|
| 1363 | free_hashinfo: | 
|---|
| 1364 | kfree(objp: new_hashinfo); | 
|---|
| 1365 | err: | 
|---|
| 1366 | return NULL; | 
|---|
| 1367 | } | 
|---|
| 1368 |  | 
|---|
| 1369 | void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) | 
|---|
| 1370 | { | 
|---|
| 1371 | if (!hashinfo->pernet) | 
|---|
| 1372 | return; | 
|---|
| 1373 |  | 
|---|
| 1374 | inet_ehash_locks_free(hashinfo); | 
|---|
| 1375 | vfree(addr: hashinfo->ehash); | 
|---|
| 1376 | kfree(objp: hashinfo); | 
|---|
| 1377 | } | 
|---|
| 1378 |  | 
|---|