// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
SYM_PIC_ALIAS(physical_mask);
#endif

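/*
 * Allocate one page-table page for a user address space. GFP_PGTABLE_USER
 * zeroes the page and charges it to the mm's memory cgroup.
 */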
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}

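/*
 * Free a PTE page during a TLB teardown: tell the paravirt layer the page no
 * longer backs a page table, then hand it to the mmu_gather so it is only
 * freed once the relevant TLB entries have been flushed.
 */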
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	paravirt_release_pte(page_to_pfn(pte));
	tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
	/*
	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
	 * entries need a full cr3 reload to flush.
	 */
#ifdef CONFIG_X86_PAE
	tlb->need_flush_all = 1;
#endif
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
#endif	/* CONFIG_PGTABLE_LEVELS > 2 */

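/*
 * Every pgd in the system is kept on pgd_list (threaded through its ptdesc)
 * so that updates to the kernel portion of the page tables can be propagated
 * to all of them. Callers must hold pgd_lock.
 */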
static inline void pgd_list_add(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_del(&ptdesc->pt_list);
}

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
	virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
	return page_ptdesc(page)->pt_mm;
}

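/*
 * Initialize a freshly allocated pgd: copy the kernel mappings from
 * swapper_pg_dir (PAE preallocates its kernel PMDs instead), record the
 * owning mm and add the pgd to pgd_list. Called with pgd_lock held.
 */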
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* PAE preallocates all its PMDs. No cloning needed. */
	if (!IS_ENABLED(CONFIG_X86_PAE))
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
				KERNEL_PGD_PTRS);

	/* List used to sync kernel mapping updates */
	pgd_set_mm(pgd, mm);
	pgd_list_add(pgd);
}

static void pgd_dtor(pgd_t *pgd)
{
	spin_lock(&pgd_lock);
	pgd_list_del(pgd);
	spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 */
#define PREALLOCATED_PMDS	PTRS_PER_PGD

/*
 * "USER_PMDS" are the PMDs for the user copy of the page tables when
 * PTI is enabled. They do not exist when PTI is disabled. Note that
 * this is distinct from the user _portion_ of the kernel page tables
 * which always exists.
 *
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS	(boot_cpu_has(X86_FEATURE_PTI) ? \
					KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS	KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

	/* Note: almost everything apart from _PAGE_PRESENT is
	   reserved at the pmd (PDPT) level. */
	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 */
	flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS	0
#define PREALLOCATED_USER_PMDS	0
#define MAX_PREALLOCATED_USER_PMDS	0
#endif	/* CONFIG_X86_PAE */

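/*
 * Release up to @count PMD pages obtained from preallocate_pmds(): run the
 * PMD destructor, free the page and drop the mm's PMD accounting.
 */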
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	struct ptdesc *ptdesc;

	for (i = 0; i < count; i++)
		if (pmds[i]) {
			ptdesc = virt_to_ptdesc(pmds[i]);

			pagetable_dtor(ptdesc);
			pagetable_free(ptdesc);
			mm_dec_nr_pmds(mm);
		}
}

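/*
 * Allocate @count PMD pages up front for a new pgd. Kernel page tables
 * (init_mm) are not charged to a memory cgroup, so __GFP_ACCOUNT is dropped
 * for them. On any allocation failure the whole set is freed again and
 * -ENOMEM is returned.
 */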
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	bool failed = false;
	gfp_t gfp = GFP_PGTABLE_USER;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	gfp &= ~__GFP_HIGHMEM;

	for (i = 0; i < count; i++) {
		pmd_t *pmd = NULL;
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		if (!ptdesc)
			failed = true;
		if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
			pagetable_free(ptdesc);
			ptdesc = NULL;
			failed = true;
		}
		if (ptdesc) {
			mm_inc_nr_pmds(mm);
			pmd = ptdesc_address(ptdesc);
		}

		pmds[i] = pmd;
	}

	if (failed) {
		free_pmds(mm, pmds, count);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
	pgd_t pgd = *pgdp;

	if (pgd_val(pgd) != 0) {
		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

		pgd_clear(pgdp);

		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
		pmd_free(mm, pmd);
		mm_dec_nr_pmds(mm);
	}
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
	int i;

	for (i = 0; i < PREALLOCATED_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pgdp = kernel_to_user_pgdp(pgdp);

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

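/*
 * Install the preallocated PMDs into a new (PAE) pgd. PMDs that cover the
 * kernel part of the address space are first filled from swapper_pg_dir so
 * the new mm starts out with the current kernel mappings.
 */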
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
	p4d_t *p4d;
	pud_t *pud;
	int i;

	p4d = p4d_offset(pgd, 0);
	pud = pud_offset(p4d, 0);

	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
		pmd_t *pmd = pmds[i];

		if (i >= KERNEL_PGD_BOUNDARY)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}
}

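/*
 * With PTI, the kernel-mapping half of the user page-table copy also gets
 * preallocated PMDs (see PREALLOCATED_USER_PMDS above); their contents are
 * cloned from the user copy of swapper_pg_dir.
 */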
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	p4d_t *u_p4d;
	pud_t *u_pud;
	int i;

	u_p4d = p4d_offset(u_pgd, 0);
	u_pud = pud_offset(u_p4d, 0);

	s_pgd += KERNEL_PGD_BOUNDARY;
	u_pud += KERNEL_PGD_BOUNDARY;

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
		pmd_t *pmd = pmds[i];

		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
		       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, u_pud, pmd);
	}
}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	/*
	 * PTI and Xen need a whole page for the PAE PGD
	 * even though the hardware only needs 32 bytes.
	 *
	 * For simplicity, allocate a page for all users.
	 */
	return __pgd_alloc(mm, pgd_allocation_order());
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	__pgd_free(mm, pgd);
}

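/*
 * Allocate and set up a pgd for @mm. PAE preallocates its PMDs and PTI its
 * user-table PMDs; they are installed under pgd_lock so that pgd_list
 * walkers never observe a partially populated pgd. Returns NULL on failure.
 */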
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;
	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
	pmd_t *pmds[PREALLOCATED_PMDS];

	pgd = _pgd_alloc(mm);

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;

	if (sizeof(pmds) != 0 &&
	    preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
		goto out_free_pgd;

	if (sizeof(u_pmds) != 0 &&
	    preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
		goto out_free_pmds;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_user_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);
	if (sizeof(pmds) != 0)
		pgd_prepopulate_pmd(mm, pgd, pmds);

	if (sizeof(u_pmds) != 0)
		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_user_pmds:
	if (sizeof(u_pmds) != 0)
		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
	if (sizeof(pmds) != 0)
		free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
	_pgd_free(mm, pgd);
out:
	return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	paravirt_pgd_free(mm, pgd);
	_pgd_free(mm, pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);

	if (changed && dirty)
		set_pte(ptep, entry);

	return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (changed && dirty) {
		set_pmd(pmdp, entry);
		/*
		 * We had a write-protection fault here and changed the pmd
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pud_t *pudp, pud_t entry, int dirty)
{
	int changed = !pud_same(*pudp, entry);

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);

	if (changed && dirty) {
		set_pud(pudp, entry);
		/*
		 * We had a write-protection fault here and changed the pud
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}
#endif

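/*
 * Atomically clear the Accessed bit of a PTE and return whether it was set.
 * test_and_clear_bit() is used because the CPU can set Accessed/Dirty
 * concurrently with this update.
 */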
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
	int ret = 0;

	if (pte_young(*ptep))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *) &ptep->pte);

	return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmdp)
{
	int ret = 0;

	if (pmd_young(*pmdp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pmdp);

	return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pud_t *pudp)
{
	int ret = 0;

	if (pud_young(*pudp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pudp);

	return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	/*
	 * On x86 CPUs, clearing the accessed bit without a TLB flush
	 * doesn't cause data corruption. [ It could cause incorrect
	 * page aging and the (mistaken) reclaim of hot pages, but the
	 * chance of that should be relatively low. ]
	 *
	 * So as a performance optimization don't flush the TLB when
	 * clearing the accessed bit, it will eventually be flushed by
	 * a context switch or a VM operation anyway. [ In the rare
	 * event of it not getting flushed for a long time the delay
	 * shouldn't really matter because there's no real memory
	 * pressure for swapout to react to. ]
	 */
	return ptep_test_and_clear_young(vma, address, ptep);
}

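/*
 * Huge-PMD variant: clear the Accessed bit and, unlike the PTE case above,
 * flush the covered range if the bit was actually set.
 */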
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

	return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));

	/*
	 * No flush is necessary. Once an invalid PTE is established, the PTE's
	 * access and dirty bits cannot be updated.
	 */
	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pud_t *pudp)
{
	VM_WARN_ON_ONCE(!pud_present(*pudp));
	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
	return old;
}
#endif

/**
 * reserve_top_address - Reserve a hole in the top of the kernel address space
 * @reserve: Size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of the kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
	BUG_ON(fixmaps_set > 0);
	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

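/*
 * Install @pte at the fixed virtual address of fixmap slot @idx in the
 * kernel page tables and account for it in fixmaps_set.
 */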
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
	unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
	/*
	 * Ensure that the static initial page tables are covering the
	 * fixmap completely.
	 */
	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_vaddr(address, pte);
	fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
		       phys_addr_t phys, pgprot_t flags)
{
	/* Sanitize 'flags' against any unsupported bits: */
	pgprot_val(flags) &= __default_kernel_pte_mask;

	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#if CONFIG_PGTABLE_LEVELS > 4
/**
 * p4d_set_huge - Set up kernel P4D mapping
 * @p4d: Pointer to the P4D entry
 * @addr: Virtual address associated with the P4D entry
 * @prot: Protection bits to use
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}

/**
 * p4d_clear_huge - Clear kernel P4D mapping when it is set
 * @p4d: Pointer to the P4D entry to clear
 *
 * No 512GB pages yet -- do nothing
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - Set up kernel PUD mapping
 * @pud: Pointer to the PUD entry
 * @addr: Virtual address associated with the PUD entry
 * @prot: Protection bits to use
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
	if (!uniform)
		return 0;

	/* Bail out if we are on a populated non-leaf entry: */
	if (pud_present(*pud) && !pud_leaf(*pud))
		return 0;

	set_pte((pte_t *)pud, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pmd_set_huge - Set up kernel PMD mapping
 * @pmd: Pointer to the PMD entry
 * @addr: Virtual address associated with the PMD entry
 * @prot: Protection bits to use
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
	if (!uniform) {
		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
			     __func__, addr, addr + PMD_SIZE);
		return 0;
	}

	/* Bail out if we are on a populated non-leaf entry: */
	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
		return 0;

	set_pte((pte_t *)pmd, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

685
686/**
687 * pud_clear_huge - Clear kernel PUD mapping when it is set
688 * @pud: Pointer to the PUD entry to clear.
689 *
690 * Returns 1 on success and 0 on failure (no PUD map is found).
691 */
692int pud_clear_huge(pud_t *pud)
693{
694 if (pud_leaf(pud: *pud)) {
695 pud_clear(pud);
696 return 1;
697 }
698
699 return 0;
700}
701
702/**
703 * pmd_clear_huge - Clear kernel PMD mapping when it is set
704 * @pmd: Pointer to the PMD entry to clear.
705 *
706 * Returns 1 on success and 0 on failure (no PMD map is found).
707 */
708int pmd_clear_huge(pmd_t *pmd)
709{
710 if (pmd_leaf(pte: *pmd)) {
711 pmd_clear(pmd);
712 return 1;
713 }
714
715 return 0;
716}
717
718#ifdef CONFIG_X86_64
719/**
720 * pud_free_pmd_page - Clear PUD entry and free PMD page
721 * @pud: Pointer to a PUD
722 * @addr: Virtual address associated with PUD
723 *
724 * Context: The PUD range has been unmapped and TLB purged.
725 * Return: 1 if clearing the entry succeeded. 0 otherwise.
726 *
727 * NOTE: Callers must allow a single page allocation.
728 */
729int pud_free_pmd_page(pud_t *pud, unsigned long addr)
730{
731 pmd_t *pmd, *pmd_sv;
732 pte_t *pte;
733 int i;
734
735 pmd = pud_pgtable(pud: *pud);
736 pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
737 if (!pmd_sv)
738 return 0;
739
740 for (i = 0; i < PTRS_PER_PMD; i++) {
741 pmd_sv[i] = pmd[i];
742 if (!pmd_none(pmd: pmd[i]))
743 pmd_clear(&pmd[i]);
744 }
745
746 pud_clear(pud);
747
748 /* INVLPG to clear all paging-structure caches */
749 flush_tlb_kernel_range(start: addr, end: addr + PAGE_SIZE-1);
750
751 for (i = 0; i < PTRS_PER_PMD; i++) {
752 if (!pmd_none(pmd: pmd_sv[i])) {
753 pte = (pte_t *)pmd_page_vaddr(pmd: pmd_sv[i]);
754 pte_free_kernel(mm: &init_mm, pte);
755 }
756 }
757
758 free_page((unsigned long)pmd_sv);
759
760 pmd_free(mm: &init_mm, pmd);
761
762 return 1;
763}
764
765/**
766 * pmd_free_pte_page - Clear PMD entry and free PTE page.
767 * @pmd: Pointer to the PMD
768 * @addr: Virtual address associated with PMD
769 *
770 * Context: The PMD range has been unmapped and TLB purged.
771 * Return: 1 if clearing the entry succeeded. 0 otherwise.
772 */
773int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
774{
775 pte_t *pte;
776
777 pte = (pte_t *)pmd_page_vaddr(pmd: *pmd);
778 pmd_clear(pmd);
779
780 /* INVLPG to clear all paging-structure caches */
781 flush_tlb_kernel_range(start: addr, end: addr + PAGE_SIZE-1);
782
783 pte_free_kernel(mm: &init_mm, pte);
784
785 return 1;
786}
787
788#else /* !CONFIG_X86_64 */
789
790/*
791 * Disable free page handling on x86-PAE. This assures that ioremap()
792 * does not update sync'd PMD entries. See vmalloc_sync_one().
793 */
794int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
795{
796 return pmd_none(*pmd);
797}
798
799#endif /* CONFIG_X86_64 */
800#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
801
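/*
 * Make a PTE/PMD writable while honoring shadow stacks: shadow-stack VMAs
 * get the shadow-stack encoding (Write=0, Dirty=1); everything else becomes
 * an ordinary writable entry and the software SavedDirty bookkeeping is
 * updated accordingly.
 */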
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	pte = pte_mkwrite_novma(pte);

	return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pmd_mkwrite_shstk(pmd);

	pmd = pmd_mkwrite_novma(pmd);

	return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
	/*
	 * Hardware before shadow stack can (rarely) set Dirty=1
	 * on a Write=0 PTE. So the below condition
	 * only indicates a software bug when shadow stack is
	 * supported by the HW. This checking is covered in
	 * pte_shstk().
	 */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}