// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <linux/numa_memblks.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/amd/nb.h>

#include "mm_internal.h"

int numa_off;

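/*
 * Handle the "numa=" early boot parameter. As parsed below, the recognized
 * options are:
 *
 *   numa=off      skip NUMA detection and fall back to the dummy single-node setup
 *   numa=fake=... hand the argument to numa_emu_cmdline() for NUMA emulation
 *   numa=noacpi   ignore the ACPI SRAT (disable_srat())
 *   numa=nohmat   ignore the ACPI HMAT (disable_hmat())
 */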
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

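/*
 * Look up the node for @cpu via its APIC ID in __apicid_to_node[].
 * Returns NUMA_NO_NODE if the CPU has no valid APIC ID recorded yet.
 */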
int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

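/*
 * Which logical CPUs are on which nodes.
 */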
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

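/*
 * Record the cpu -> node mapping. Before the per-CPU areas exist this writes
 * the early x86_cpu_to_node_map[] array; afterwards it updates the per-CPU
 * variable and the generic per-CPU NUMA node as well.
 */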
void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

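/*
 * Register the nodes discovered by the active init method: bail out if
 * memblock_validate_numa_coverage() finds too much memory without a node
 * assignment, otherwise allocate node_data for every possible node that owns
 * at least one page frame and mark that node online.
 */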
static int __init numa_register_nodes(void)
{
	int nid;

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note, get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this, fill in the mapping for all possible CPUs,
 * since the number of CPUs is not known yet. We round-robin the
 * existing nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

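/*
 * Common driver for all NUMA init methods: reset the apicid -> node table,
 * run @init_func via numa_memblks_init(), register the resulting nodes, drop
 * any CPU mapping that points at a node which did not come online, and
 * finally round-robin the remaining CPUs over the online nodes.
 */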
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is a dummy single-node config encompassing all of memory,
 * which never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing memory and/or CPUs will
 * already be online and there is no need to do anything extra, even if they
 * also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA emulation
 * and the fake-node case (running a kernel compiled for NUMA on a
 * non-NUMA box), which is OK because cpu_to_node[] has already been
 * initialized in a round-robin manner by numa_init_array() prior to
 * this call, and that initialization is good enough for the fake
 * NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
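/*
 * Non-debug variants: just add/remove the CPU in its node's cpumask, with no
 * validation of the cpu or node arguments.
 */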
void numa_add_cpu(unsigned int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(unsigned int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

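/*
 * Debug version of cpu_to_node(): warns (with a stack dump) if it is used
 * before the per_cpu areas are set up, and reads the early map in that case.
 */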
int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are set up.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

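/*
 * Set or clear @cpu in node @node's cpumask, complaining loudly (with a
 * stack dump) if the node's cpumask has not been allocated yet.
 */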
void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid. The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}
}

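/*
 * Used by the NUMA emulation code: report the end of the DMA32 range,
 * i.e. PFN_PHYS(MAX_DMA32_PFN).
 */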
u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
#endif /* CONFIG_NUMA_EMU */