// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
SYM_PIC_ALIAS(physical_mask);
#endif

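/*
 * Allocate one page-table page for a user address space. GFP_PGTABLE_USER
 * zeroes the page and charges it to the mm's memory cgroup.
 */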
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}

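/*
 * Free a PTE page during a TLB teardown: tell the paravirt layer the page no
 * longer backs a page table, then hand it to the mmu_gather so it is only
 * freed once the relevant TLB entries have been flushed.
 */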
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	paravirt_release_pte(page_to_pfn(pte));
	tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
	/*
	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
	 * entries need a full cr3 reload to flush.
	 */
#ifdef CONFIG_X86_PAE
	tlb->need_flush_all = 1;
#endif
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
#endif	/* CONFIG_PGTABLE_LEVELS > 2 */

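/*
 * Every pgd in the system is kept on pgd_list (threaded through its ptdesc)
 * so that updates to the kernel portion of the page tables can be propagated
 * to all of them. Callers must hold pgd_lock.
 */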
static inline void pgd_list_add(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_del(&ptdesc->pt_list);
}

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
	virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
	return page_ptdesc(page)->pt_mm;
}

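/*
 * Initialize a freshly allocated pgd: copy the kernel mappings from
 * swapper_pg_dir (PAE preallocates its kernel PMDs instead), record the
 * owning mm and add the pgd to pgd_list. Called with pgd_lock held.
 */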
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* PAE preallocates all its PMDs. No cloning needed. */
	if (!IS_ENABLED(CONFIG_X86_PAE))
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
				KERNEL_PGD_PTRS);

	/* List used to sync kernel mapping updates */
	pgd_set_mm(pgd, mm);
	pgd_list_add(pgd);
}

static void pgd_dtor(pgd_t *pgd)
{
	spin_lock(&pgd_lock);
	pgd_list_del(pgd);
	spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 */
#define PREALLOCATED_PMDS	PTRS_PER_PGD

/*
 * "USER_PMDS" are the PMDs for the user copy of the page tables when
 * PTI is enabled. They do not exist when PTI is disabled. Note that
 * this is distinct from the user _portion_ of the kernel page tables
 * which always exists.
 *
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS	(boot_cpu_has(X86_FEATURE_PTI) ? \
					KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS	KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

	/* Note: almost everything apart from _PAGE_PRESENT is
	   reserved at the pmd (PDPT) level. */
	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 */
	flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS	0
#define PREALLOCATED_USER_PMDS	0
#define MAX_PREALLOCATED_USER_PMDS	0
#endif	/* CONFIG_X86_PAE */

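/*
 * Release up to @count PMD pages obtained from preallocate_pmds(): run the
 * PMD destructor, free the page and drop the mm's PMD accounting.
 */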
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	struct ptdesc *ptdesc;

	for (i = 0; i < count; i++)
		if (pmds[i]) {
			ptdesc = virt_to_ptdesc(pmds[i]);

			pagetable_dtor(ptdesc);
			pagetable_free(ptdesc);
			mm_dec_nr_pmds(mm);
		}
}

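/*
 * Allocate @count PMD pages up front for a new pgd. Kernel page tables
 * (init_mm) are not charged to a memory cgroup, so __GFP_ACCOUNT is dropped
 * for them. On any allocation failure the whole set is freed again and
 * -ENOMEM is returned.
 */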
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	bool failed = false;
	gfp_t gfp = GFP_PGTABLE_USER;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	gfp &= ~__GFP_HIGHMEM;

	for (i = 0; i < count; i++) {
		pmd_t *pmd = NULL;
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		if (!ptdesc)
			failed = true;
		if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
			pagetable_free(ptdesc);
			ptdesc = NULL;
			failed = true;
		}
		if (ptdesc) {
			mm_inc_nr_pmds(mm);
			pmd = ptdesc_address(ptdesc);
		}

		pmds[i] = pmd;
	}

	if (failed) {
		free_pmds(mm, pmds, count);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
	pgd_t pgd = *pgdp;

	if (pgd_val(pgd) != 0) {
		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

		pgd_clear(pgdp);

		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
		pmd_free(mm, pmd);
		mm_dec_nr_pmds(mm);
	}
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
	int i;

	for (i = 0; i < PREALLOCATED_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pgdp = kernel_to_user_pgdp(pgdp);

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

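/*
 * Install the preallocated PMDs into a new (PAE) pgd. PMDs that cover the
 * kernel part of the address space are first filled from swapper_pg_dir so
 * the new mm starts out with the current kernel mappings.
 */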
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
	p4d_t *p4d;
	pud_t *pud;
	int i;

	p4d = p4d_offset(pgd, 0);
	pud = pud_offset(p4d, 0);

	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
		pmd_t *pmd = pmds[i];

		if (i >= KERNEL_PGD_BOUNDARY)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}
}

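/*
 * With PTI, the kernel-mapping half of the user page-table copy also gets
 * preallocated PMDs (see PREALLOCATED_USER_PMDS above); their contents are
 * cloned from the user copy of swapper_pg_dir.
 */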
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	p4d_t *u_p4d;
	pud_t *u_pud;
	int i;

	u_p4d = p4d_offset(u_pgd, 0);
	u_pud = pud_offset(u_p4d, 0);

	s_pgd += KERNEL_PGD_BOUNDARY;
	u_pud += KERNEL_PGD_BOUNDARY;

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
		pmd_t *pmd = pmds[i];

		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
		       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, u_pud, pmd);
	}
}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	/*
	 * PTI and Xen need a whole page for the PAE PGD
	 * even though the hardware only needs 32 bytes.
	 *
	 * For simplicity, allocate a page for all users.
	 */
	return __pgd_alloc(mm, pgd_allocation_order());
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	__pgd_free(mm, pgd);
}

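/*
 * Allocate and set up a pgd for @mm. PAE preallocates its PMDs and PTI its
 * user-table PMDs; they are installed under pgd_lock so that pgd_list
 * walkers never observe a partially populated pgd. Returns NULL on failure.
 */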
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;
	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
	pmd_t *pmds[PREALLOCATED_PMDS];

	pgd = _pgd_alloc(mm);

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;

	if (sizeof(pmds) != 0 &&
	    preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
		goto out_free_pgd;

	if (sizeof(u_pmds) != 0 &&
	    preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
		goto out_free_pmds;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_user_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);
	if (sizeof(pmds) != 0)
		pgd_prepopulate_pmd(mm, pgd, pmds);

	if (sizeof(u_pmds) != 0)
		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_user_pmds:
	if (sizeof(u_pmds) != 0)
		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
	if (sizeof(pmds) != 0)
		free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
	_pgd_free(mm, pgd);
out:
	return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	paravirt_pgd_free(mm, pgd);
	_pgd_free(mm, pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);

	if (changed && dirty)
		set_pte(ptep, entry);

	return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (changed && dirty) {
		set_pmd(pmdp, entry);
		/*
		 * We had a write-protection fault here and changed the pmd
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pud_t *pudp, pud_t entry, int dirty)
{
	int changed = !pud_same(*pudp, entry);

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);

	if (changed && dirty) {
		set_pud(pudp, entry);
		/*
		 * We had a write-protection fault here and changed the pud
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}
#endif

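/*
 * Atomically clear the Accessed bit of a PTE and return whether it was set.
 * test_and_clear_bit() is used because the CPU can set Accessed/Dirty
 * concurrently with this update.
 */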
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
	int ret = 0;

	if (pte_young(*ptep))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *) &ptep->pte);

	return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmdp)
{
	int ret = 0;

	if (pmd_young(*pmdp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pmdp);

	return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pud_t *pudp)
{
	int ret = 0;

	if (pud_young(*pudp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pudp);

	return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	/*
	 * On x86 CPUs, clearing the accessed bit without a TLB flush
	 * doesn't cause data corruption. [ It could cause incorrect
	 * page aging and the (mistaken) reclaim of hot pages, but the
	 * chance of that should be relatively low. ]
	 *
	 * So as a performance optimization don't flush the TLB when
	 * clearing the accessed bit, it will eventually be flushed by
	 * a context switch or a VM operation anyway. [ In the rare
	 * event of it not getting flushed for a long time the delay
	 * shouldn't really matter because there's no real memory
	 * pressure for swapout to react to. ]
	 */
	return ptep_test_and_clear_young(vma, address, ptep);
}

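/*
 * Huge-PMD variant: clear the Accessed bit and, unlike the PTE case above,
 * flush the covered range if the bit was actually set.
 */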
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

	return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));

	/*
	 * No flush is necessary. Once an invalid PTE is established, the PTE's
	 * access and dirty bits cannot be updated.
	 */
	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pud_t *pudp)
{
	VM_WARN_ON_ONCE(!pud_present(*pudp));
	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
	return old;
}
#endif

/**
 * reserve_top_address - Reserve a hole in the top of the kernel address space
 * @reserve: Size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of the kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
	BUG_ON(fixmaps_set > 0);
	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

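/*
 * Install @pte at the fixed virtual address of fixmap slot @idx in the
 * kernel page tables and account for it in fixmaps_set.
 */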
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
	unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
	/*
	 * Ensure that the static initial page tables are covering the
	 * fixmap completely.
	 */
	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_vaddr(address, pte);
	fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
		       phys_addr_t phys, pgprot_t flags)
{
	/* Sanitize 'flags' against any unsupported bits: */
	pgprot_val(flags) &= __default_kernel_pte_mask;

	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#if CONFIG_PGTABLE_LEVELS > 4
/**
 * p4d_set_huge - Set up kernel P4D mapping
 * @p4d: Pointer to the P4D entry
 * @addr: Virtual address associated with the P4D entry
 * @prot: Protection bits to use
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}

/**
 * p4d_clear_huge - Clear kernel P4D mapping when it is set
 * @p4d: Pointer to the P4D entry to clear
 *
 * No 512GB pages yet -- do nothing
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - Set up kernel PUD mapping
 * @pud: Pointer to the PUD entry
 * @addr: Virtual address associated with the PUD entry
 * @prot: Protection bits to use
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
	if (!uniform)
		return 0;

	/* Bail out if we are on a populated non-leaf entry: */
	if (pud_present(*pud) && !pud_leaf(*pud))
		return 0;

	set_pte((pte_t *)pud, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pmd_set_huge - Set up kernel PMD mapping
 * @pmd: Pointer to the PMD entry
 * @addr: Virtual address associated with the PMD entry
 * @prot: Protection bits to use
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
	if (!uniform) {
		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
			     __func__, addr, addr + PMD_SIZE);
		return 0;
	}

	/* Bail out if we are on a populated non-leaf entry: */
	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
		return 0;

	set_pte((pte_t *)pmd, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

685
686/**
687 * pud_clear_huge - Clear kernel PUD mapping when it is set
688 * @pud: Pointer to the PUD entry to clear.
689 *
690 * Returns 1 on success and 0 on failure (no PUD map is found).
691 */
692int pud_clear_huge(pud_t *pud)
693{
694 if (pud_leaf(pud: *pud)) {
695 pud_clear(pud);
696 return 1;
697 }
698
699 return 0;
700}
701
702/**
703 * pmd_clear_huge - Clear kernel PMD mapping when it is set
704 * @pmd: Pointer to the PMD entry to clear.
705 *
706 * Returns 1 on success and 0 on failure (no PMD map is found).
707 */
708int pmd_clear_huge(pmd_t *pmd)
709{
710 if (pmd_leaf(pte: *pmd)) {
711 pmd_clear(pmd);
712 return 1;
713 }
714
715 return 0;
716}
717
718#ifdef CONFIG_X86_64
719/**
720 * pud_free_pmd_page - Clear PUD entry and free PMD page
721 * @pud: Pointer to a PUD
722 * @addr: Virtual address associated with PUD
723 *
724 * Context: The PUD range has been unmapped and TLB purged.
725 * Return: 1 if clearing the entry succeeded. 0 otherwise.
726 *
727 * NOTE: Callers must allow a single page allocation.
728 */
729int pud_free_pmd_page(pud_t *pud, unsigned long addr)
730{
731 pmd_t *pmd, *pmd_sv;
732 pte_t *pte;
733 int i;
734
735 pmd = pud_pgtable(pud: *pud);
736 pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
737 if (!pmd_sv)
738 return 0;
739
740 for (i = 0; i < PTRS_PER_PMD; i++) {
741 pmd_sv[i] = pmd[i];
742 if (!pmd_none(pmd: pmd[i]))
743 pmd_clear(&pmd[i]);
744 }
745
746 pud_clear(pud);
747
748 /* INVLPG to clear all paging-structure caches */
749 flush_tlb_kernel_range(start: addr, end: addr + PAGE_SIZE-1);
750
751 for (i = 0; i < PTRS_PER_PMD; i++) {
752 if (!pmd_none(pmd: pmd_sv[i])) {
753 pte = (pte_t *)pmd_page_vaddr(pmd: pmd_sv[i]);
754 pte_free_kernel(mm: &init_mm, pte);
755 }
756 }
757
758 free_page((unsigned long)pmd_sv);
759
760 pmd_free(mm: &init_mm, pmd);
761
762 return 1;
763}
764
765/**
766 * pmd_free_pte_page - Clear PMD entry and free PTE page.
767 * @pmd: Pointer to the PMD
768 * @addr: Virtual address associated with PMD
769 *
770 * Context: The PMD range has been unmapped and TLB purged.
771 * Return: 1 if clearing the entry succeeded. 0 otherwise.
772 */
773int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
774{
775 pte_t *pte;
776
777 pte = (pte_t *)pmd_page_vaddr(pmd: *pmd);
778 pmd_clear(pmd);
779
780 /* INVLPG to clear all paging-structure caches */
781 flush_tlb_kernel_range(start: addr, end: addr + PAGE_SIZE-1);
782
783 pte_free_kernel(mm: &init_mm, pte);
784
785 return 1;
786}
787
788#else /* !CONFIG_X86_64 */
789
790/*
791 * Disable free page handling on x86-PAE. This assures that ioremap()
792 * does not update sync'd PMD entries. See vmalloc_sync_one().
793 */
794int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
795{
796 return pmd_none(*pmd);
797}
798
799#endif /* CONFIG_X86_64 */
800#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
801
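/*
 * Make a PTE/PMD writable while honoring shadow stacks: shadow-stack VMAs
 * get the shadow-stack encoding (Write=0, Dirty=1); everything else becomes
 * an ordinary writable entry and the software SavedDirty bookkeeping is
 * updated accordingly.
 */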
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	pte = pte_mkwrite_novma(pte);

	return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pmd_mkwrite_shstk(pmd);

	pmd = pmd_mkwrite_novma(pmd);

	return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
	/*
	 * Hardware before shadow stack can (rarely) set Dirty=1
	 * on a Write=0 PTE. So the below condition
	 * only indicates a software bug when shadow stack is
	 * supported by the HW. This checking is covered in
	 * pte_shstk().
	 */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}