// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <linux/numa_memblks.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/amd/nb.h>

#include "mm_internal.h"

int numa_off;

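/*
 * Handle the "numa=" early boot parameter. As parsed below, the recognized
 * options are:
 *
 *   numa=off      skip NUMA detection and fall back to the dummy single-node setup
 *   numa=fake=... hand the argument to numa_emu_cmdline() for NUMA emulation
 *   numa=noacpi   ignore the ACPI SRAT (disable_srat())
 *   numa=nohmat   ignore the ACPI HMAT (disable_hmat())
 */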
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

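/*
 * Look up the node for @cpu via its APIC ID in __apicid_to_node[].
 * Returns NUMA_NO_NODE if the CPU has no valid APIC ID recorded yet.
 */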
int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}

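/*
 * Which logical CPUs are on which nodes.
 */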
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

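/*
 * Record the cpu -> node mapping. Before the per-CPU areas exist this writes
 * the early x86_cpu_to_node_map[] array; afterwards it updates the per-CPU
 * variable and the generic per-CPU NUMA node as well.
 */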
void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

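/*
 * Register the nodes discovered by the active init method: bail out if
 * memblock_validate_numa_coverage() finds too much memory without a node
 * assignment, otherwise allocate node_data for every possible node that owns
 * at least one page frame and mark that node online.
 */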
static int __init numa_register_nodes(void)
{
	int nid;

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note, get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this, fill in the mapping for all possible CPUs,
 * since the number of CPUs is not known yet. We round-robin the
 * existing nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}

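/*
 * Common driver for all NUMA init methods: reset the apicid -> node table,
 * run @init_func via numa_memblks_init(), register the resulting nodes, drop
 * any CPU mapping that points at a node which did not come online, and
 * finally round-robin the remaining CPUs over the online nodes.
 */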
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory. This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is a dummy single-node config encompassing all of memory,
 * which never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing memory and/or CPUs will
 * already be online and there is no need to do anything extra, even if they
 * also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 *   bringup_nonboot_cpus
	 *    cpu_up
	 *     __try_online_node
	 *      register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA emulation
 * and the fake-node case (running a kernel compiled for NUMA on a
 * non-NUMA box), which is OK because cpu_to_node[] has already been
 * initialized in a round-robin manner by numa_init_array() prior to
 * this call, and that initialization is good enough for the fake
 * NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 *   bringup_nonboot_cpus
		 *    cpu_up
		 *     __try_online_node
		 *      register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
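/*
 * Non-debug variants: just add/remove the CPU in its node's cpumask, with no
 * validation of the cpu or node arguments.
 */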
void numa_add_cpu(unsigned int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(unsigned int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

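/*
 * Debug version of cpu_to_node(): warns (with a stack dump) if it is used
 * before the per_cpu areas are set up, and reads the early map in that case.
 */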
int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are set up.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

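/*
 * Set or clear @cpu in node @node's cpumask, complaining loudly (with a
 * stack dump) if the node's cpumask has not been allocated yet.
 */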
void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid. The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}
}

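/*
 * Used by the NUMA emulation code: report the end of the DMA32 range,
 * i.e. PFN_PHYS(MAX_DMA32_PFN).
 */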
u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
#endif /* CONFIG_NUMA_EMU */