// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include <linux/mmu_context.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * We want to know the real level where an entry is located, ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

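/*
 * Walk the PTEs covering [addr, end). The caller has already mapped (and,
 * when walking a VMA, locked) the PTE table. For each entry we either invoke
 * ops->pte_entry(), or install a fresh PTE via ops->install_pte() if the
 * entry is none and installation was requested.
 */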
static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		if (ops->install_pte && pte_none(ptep_get(pte))) {
			pte_t new_pte;

			err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte,
					       walk);
			if (err)
				break;

			set_pte_at(walk->mm, addr, pte, new_pte);
			/* Non-present before, so for arches that need it. */
			if (!WARN_ON_ONCE(walk->no_vma))
				update_mmu_cache(walk->vma, addr, pte);
		} else {
			err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
			if (err)
				break;
		}
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

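/*
 * Map (and, when walking a VMA, lock) the PTE table of @pmd and walk it with
 * walk_pte_range_inner(). If the PTE table cannot be mapped (the PMD entry
 * changed or no longer points to a page table), ask the caller to retry via
 * ACTION_AGAIN.
 */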
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		/*
		 * pte_offset_map() might apply user-specific validation.
		 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
		 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
		 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
		 */
		if (walk->mm == &init_mm || addr >= TASK_SIZE)
			pte = pte_offset_kernel(pmd, addr);
		else
			pte = pte_offset_map(pmd, addr);
		if (pte) {
			err = walk_pte_range_inner(pte, addr, end, walk);
			if (walk->mm != &init_mm && addr < TASK_SIZE)
				pte_unmap(pte);
		}
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		if (pte) {
			err = walk_pte_range_inner(pte, addr, end, walk);
			pte_unmap_unlock(pte, ptl);
		}
	}
	if (!pte)
		walk->action = ACTION_AGAIN;
	return err;
}

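/*
 * Walk the PMD entries covering [addr, end). None entries are either reported
 * to ops->pte_hole() or, if PTE installation was requested, populated with a
 * fresh PTE table. Populated entries are passed to ops->pmd_entry() and,
 * unless the handler asks otherwise, the walk descends to the PTE level
 * (splitting a huge PMD first when walking a VMA).
 */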
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	bool has_handler = ops->pte_entry;
	bool has_install = ops->install_pte;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd)) {
			if (has_install)
				err = __pte_alloc(walk->mm, pmd);
			else if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			if (!has_install)
				continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;
		if (walk->action == ACTION_CONTINUE)
			continue;

		if (!has_handler) { /* No handlers for lower page tables. */
			if (!has_install)
				continue; /* Nothing to do. */
			/*
			 * We are ONLY installing, so avoid unnecessarily
			 * splitting a present huge page.
			 */
			if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
				continue;
		}

		if (walk->vma)
			split_huge_pmd(walk->vma, pmd, addr);
		else if (pmd_leaf(*pmd) || !pmd_present(*pmd))
			continue; /* Nothing to do. */

		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

	} while (pmd++, addr = next, addr != end);

	return err;
}

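/*
 * Walk the PUD entries covering [addr, end): report holes or allocate missing
 * PMD tables as in walk_pmd_range(), invoke ops->pud_entry() and, unless the
 * handler asks otherwise, descend to the PMD level (splitting a huge PUD
 * first when walking a VMA).
 */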
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	bool has_handler = ops->pmd_entry || ops->pte_entry;
	bool has_install = ops->install_pte;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud)) {
			if (has_install)
				err = __pmd_alloc(walk->mm, pud, addr);
			else if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			if (!has_install)
				continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;
		if (walk->action == ACTION_CONTINUE)
			continue;

		if (!has_handler) { /* No handlers for lower page tables. */
			if (!has_install)
				continue; /* Nothing to do. */
			/*
			 * We are ONLY installing, so avoid unnecessarily
			 * splitting a present huge page.
			 */
			if (pud_present(*pud) && pud_trans_huge(*pud))
				continue;
		}

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		else if (pud_leaf(*pud) || !pud_present(*pud))
			continue; /* Nothing to do. */

		if (pud_none(*pud))
			goto again;

		err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

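/*
 * Walk the P4D entries covering [addr, end), reporting holes or allocating
 * missing PUD tables as needed, then hand each populated entry down to
 * walk_pud_range() when any lower-level handler (or PTE installation) was
 * supplied.
 */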
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry;
	bool has_install = ops->install_pte;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (has_install)
				err = __pud_alloc(walk->mm, p4d, addr);
			else if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			if (!has_install)
				continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (has_handler || has_install)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

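/*
 * Walk the top-level (PGD) entries covering [addr, end), starting from
 * walk->pgd if the caller supplied one and from walk->mm->pgd otherwise, and
 * recurse down the page-table levels for which handlers were provided.
 */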
static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
		ops->pte_entry;
	bool has_install = ops->install_pte;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (has_install)
				err = __p4d_alloc(walk->mm, pgd, addr);
			else if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			if (!has_install)
				continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (has_handler || has_install)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

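/*
 * Walk a hugetlb VMA one huge page at a time under the hugetlb VMA read lock,
 * invoking ops->hugetlb_entry() for populated entries and reporting
 * unpopulated ones to ops->pte_hole().
 */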
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	hugetlb_vma_lock_read(vma);
	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = hugetlb_walk(vma, addr & hmask, sz);
		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);
		if (err)
			break;
	} while (addr = next, addr != end);
	hugetlb_vma_unlock_read(vma);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean
 * error, where we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind the
	 * VM_PFNMAP range, so we don't walk over it as we do for normal vmas.
	 * However, some callers are interested in handling hole ranges and
	 * don't want to just ignore any single address range. Such users
	 * certainly define their ->pte_hole() callbacks, so let's delegate
	 * handling of vma(VM_PFNMAP) to them.
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;
		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}

static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;
	bool is_hugetlb = is_vm_hugetlb_page(vma);

	/* We do not support hugetlb PTE installation. */
	if (ops->install_pte && is_hugetlb)
		return -EINVAL;

	if (ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (is_hugetlb) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (ops->post_vma)
		ops->post_vma(walk);

	return err;
}

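/*
 * Assert that the mmap lock is held as required by @walk_lock: the read lock
 * for PGWALK_RDLOCK and the write lock for the write-lock modes.
 * PGWALK_VMA_RDLOCK_VERIFY is checked per VMA in process_vma_walk_lock()
 * instead.
 */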
static inline void process_mm_walk_lock(struct mm_struct *mm,
					enum page_walk_lock walk_lock)
{
	if (walk_lock == PGWALK_RDLOCK)
		mmap_assert_locked(mm);
	else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
		mmap_assert_write_locked(mm);
}

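/*
 * Take or assert the per-VMA lock as requested by @walk_lock. Without
 * CONFIG_PER_VMA_LOCK this is a no-op and the mmap lock assertions in
 * process_mm_walk_lock() are all that is needed.
 */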
static inline void process_vma_walk_lock(struct vm_area_struct *vma,
					 enum page_walk_lock walk_lock)
{
#ifdef CONFIG_PER_VMA_LOCK
	switch (walk_lock) {
	case PGWALK_WRLOCK:
		vma_start_write(vma);
		break;
	case PGWALK_WRLOCK_VERIFY:
		vma_assert_write_locked(vma);
		break;
	case PGWALK_VMA_RDLOCK_VERIFY:
		vma_assert_locked(vma);
		break;
	case PGWALK_RDLOCK:
		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
		break;
	}
#endif
}

/*
 * See the comment for walk_page_range(); this performs the heavy lifting of
 * the operation, except that it places no restrictions on how the walk
 * proceeds.
 *
 * We usually restrict the ability to install PTEs, but this functionality is
 * available to internal memory management code and provided in mm/internal.h.
 */
int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops = ops,
		.mm = mm,
		.private = private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else { /* inside vma */
			process_vma_walk_lock(vma, ops->walk_lock);
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = find_vma(mm, vma->vm_end);

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
			err = __walk_page_range(start, next, &walk);
		}
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

/*
 * Determine if the walk operations specified are permitted to be used for a
 * page table walk.
 *
 * This check is performed on all functions which are parameterised by walk
 * operations and exposed in include/linux/pagewalk.h.
 *
 * Internal memory management code can use the walk_page_range_mm() function to
 * be able to use all page walking operations.
 */
static bool check_ops_valid(const struct mm_walk_ops *ops)
{
	/*
	 * The installation of PTEs is solely under the control of memory
	 * management logic and subject to many subtle locking, security and
	 * cache considerations so we cannot permit other users to do so, and
	 * certainly not for exported symbols.
	 */
	if (ops->install_pte)
		return false;

	return true;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded in handling the current entry; if the end address has not
 *         been reached yet, continue the walk.
 *  - >0 : succeeded in handling the current entry, and return to the caller
 *         with a caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with the error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access vma data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		    unsigned long end, const struct mm_walk_ops *ops,
		    void *private)
{
	if (!check_ops_valid(ops))
		return -EINVAL;

	return walk_page_range_mm(mm, start, end, ops, private);
}

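/*
 * Illustrative usage sketch (not part of this file; the callback and ops
 * names below are made up): a caller that counts present PTEs in a range
 * could supply only a pte_entry() callback:
 *
 *	static int count_pte(pte_t *pte, unsigned long addr,
 *			     unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;
 *
 *		if (pte_present(ptep_get(pte)))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry	= count_pte,
 *		.walk_lock	= PGWALK_RDLOCK,
 *	};
 *
 *	mmap_read_lock(mm);
 *	err = walk_page_range(mm, start, end, &count_ops, &count);
 *	mmap_read_unlock(mm);
 */
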
/**
 * walk_kernel_page_table_range - walk a range of kernel pagetables.
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking kernel page tables or page tables for firmware.
 *
 * Note: when walking kernel page tables, the caller may need to take other
 * effective measures (the mmap lock may be insufficient) to prevent the
 * intermediate kernel page tables belonging to the specified address range
 * from being freed (e.g. by memory hot-remove).
 */
int walk_kernel_page_table_range(unsigned long start, unsigned long end,
		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
{
	/*
	 * Kernel intermediate page tables are usually not freed, so the mmap
	 * read lock is sufficient. But there are some exceptions, e.g. memory
	 * hot-remove, in which case the mmap lock is insufficient to prevent
	 * the intermediate kernel page tables belonging to the specified
	 * address range from being freed. The caller should take other actions
	 * to prevent this race.
	 */
	mmap_assert_locked(&init_mm);

	return walk_kernel_page_table_range_lockless(start, end, ops, pgd,
						     private);
}

/*
 * Use this function to walk the kernel page tables locklessly. The caller must
 * guarantee exclusive access to the range being operated on - that is, no
 * concurrent modification - for example when changing permissions of vmalloc
 * objects.
 */
int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = &init_mm,
		.pgd = pgd,
		.private = private,
		.no_vma = true
	};

	if (start >= end)
		return -EINVAL;
	if (!check_ops_valid(ops))
		return -EINVAL;

	return walk_pgd_range(start, end, &walk);
}

/**
 * walk_page_range_debug - walk a range of pagetables not backed by a vma
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback.
 *
 * This is for debugging purposes ONLY.
 */
int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd, void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = mm,
		.pgd = pgd,
		.private = private,
		.no_vma = true
	};

	/* For convenience, we allow traversal of kernel mappings. */
	if (mm == &init_mm)
		return walk_kernel_page_table_range(start, end, ops,
						    pgd, private);
	if (start >= end || !walk.mm)
		return -EINVAL;
	if (!check_ops_valid(ops))
		return -EINVAL;

	/*
	 * The mmap lock protects the page walker from changes to the page
	 * tables during the walk. However, a read lock is insufficient to
	 * protect those areas which don't have a VMA, as munmap() detaches
	 * the VMAs before downgrading to a read lock and actually tearing
	 * down PTEs/page tables, so the mmap write lock must be held.
	 */
	mmap_assert_write_locked(mm);

	return walk_pgd_range(start, end, &walk);
}

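/*
 * Walk the page tables of a single VMA within [@start, @end), which must lie
 * inside the VMA. See walk_page_range() for the callback semantics and
 * locking rules.
 */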
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, const struct mm_walk_ops *ops,
			void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = vma->vm_mm,
		.vma = vma,
		.private = private,
	};

	if (start >= end || !walk.mm)
		return -EINVAL;
	if (start < vma->vm_start || end > vma->vm_end)
		return -EINVAL;
	if (!check_ops_valid(ops))
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);
	process_vma_walk_lock(vma, ops->walk_lock);
	return __walk_page_range(start, end, &walk);
}

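/*
 * Walk the page tables covering the entire @vma. See walk_page_range() for
 * the callback semantics and locking rules.
 */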
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		  void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.mm = vma->vm_mm,
		.vma = vma,
		.private = private,
	};

	if (!walk.mm)
		return -EINVAL;
	if (!check_ops_valid(ops))
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);
	process_vma_walk_lock(vma, ops->walk_lock);
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops = ops,
		.private = private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	if (!check_ops_valid(ops))
		return -EINVAL;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}

/**
 * folio_walk_start - walk the page tables to a folio
 * @fw: filled with information on success.
 * @vma: the VMA.
 * @addr: the virtual address to use for the page table walk.
 * @flags: flags modifying which folios to walk to.
 *
 * Walk the page tables using @addr in a given @vma to a mapped folio and
 * return the folio, making sure that the page table entry referenced by
 * @addr cannot change until folio_walk_end() was called.
 *
 * By default, this function returns only folios that are not special (e.g.,
 * not the zeropage) and never returns folios that are supposed to be ignored
 * by the VM as documented by vm_normal_page(). If requested, zeropages will
 * be returned as well.
 *
 * By default, this function only considers present page table entries.
 * If requested, it will also consider migration entries.
 *
 * If this function returns NULL it might either indicate "there is nothing" or
 * "there is nothing suitable".
 *
 * On success, @fw is filled and the function returns the folio while the PTL
 * is still held and folio_walk_end() must be called to clean up,
 * releasing any held locks. The returned folio must *not* be used after the
 * call to folio_walk_end(), unless a short-term folio reference is taken before
 * that call.
 *
 * @fw->page will correspond to the page that is effectively referenced by
 * @addr. However, for migration entries and shared zeropages @fw->page is
 * set to NULL. Note that large folios might be mapped by multiple page table
 * entries, and this function will always only lookup a single entry as
 * specified by @addr, which might or might not cover more than a single page of
 * the returned folio.
 *
 * This function must *not* be used as a naive replacement for
 * get_user_pages() / pin_user_pages(), especially not to perform DMA or
 * to carelessly modify page content. This function may *only* be used to grab
 * short-term folio references, never to grab long-term folio references.
 *
 * Using the page table entry pointers in @fw for reading or modifying the
 * entry should be avoided where possible: however, there might be valid
 * use cases.
 *
 * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
 * For example, PMD page table sharing might require prior unsharing. Also,
 * logical hugetlb entries might span multiple physical page table entries,
 * which *must* be modified in a single operation (set_huge_pte_at(),
 * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
 * not correspond to the first physical entry of a logical hugetlb entry.
 *
 * The mmap lock must be held in read mode.
 *
 * Return: folio pointer on success, otherwise NULL.
 */
struct folio *folio_walk_start(struct folio_walk *fw,
		struct vm_area_struct *vma, unsigned long addr,
		folio_walk_flags_t flags)
{
	unsigned long entry_size;
	bool expose_page = true;
	struct page *page;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgd_t *pgdp;
	p4d_t *p4dp;

	mmap_assert_locked(vma->vm_mm);
	vma_pgtable_walk_begin(vma);

	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
		goto not_found;

	pgdp = pgd_offset(vma->vm_mm, addr);
	if (pgd_none_or_clear_bad(pgdp))
		goto not_found;

	p4dp = p4d_offset(pgdp, addr);
	if (p4d_none_or_clear_bad(p4dp))
		goto not_found;

	pudp = pud_offset(p4dp, addr);
	pud = pudp_get(pudp);
	if (pud_none(pud))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pud_present(pud) || pud_leaf(pud))) {
		ptl = pud_lock(vma->vm_mm, pudp);
		pud = pudp_get(pudp);

		entry_size = PUD_SIZE;
		fw->level = FW_LEVEL_PUD;
		fw->pudp = pudp;
		fw->pud = pud;

		if (pud_none(pud)) {
			spin_unlock(ptl);
			goto not_found;
		} else if (pud_present(pud) && !pud_leaf(pud)) {
			spin_unlock(ptl);
			goto pmd_table;
		} else if (pud_present(pud)) {
			page = vm_normal_page_pud(vma, addr, pud);
			if (page)
				goto found;
		}
		/*
		 * TODO: FW_MIGRATION support for PUD migration entries
		 * once there are relevant users.
		 */
		spin_unlock(ptl);
		goto not_found;
	}

pmd_table:
	VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
	pmdp = pmd_offset(pudp, addr);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pmd_present(pmd) || pmd_leaf(pmd))) {
		ptl = pmd_lock(vma->vm_mm, pmdp);
		pmd = pmdp_get(pmdp);

		entry_size = PMD_SIZE;
		fw->level = FW_LEVEL_PMD;
		fw->pmdp = pmdp;
		fw->pmd = pmd;

		if (pmd_none(pmd)) {
			spin_unlock(ptl);
			goto not_found;
		} else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
			spin_unlock(ptl);
			goto pte_table;
		} else if (pmd_present(pmd)) {
			page = vm_normal_page_pmd(vma, addr, pmd);
			if (page) {
				goto found;
			} else if ((flags & FW_ZEROPAGE) &&
				    is_huge_zero_pmd(pmd)) {
				page = pfn_to_page(pmd_pfn(pmd));
				expose_page = false;
				goto found;
			}
		} else if ((flags & FW_MIGRATION) &&
			   is_pmd_migration_entry(pmd)) {
			swp_entry_t entry = pmd_to_swp_entry(pmd);

			page = pfn_swap_entry_to_page(entry);
			expose_page = false;
			goto found;
		}
		spin_unlock(ptl);
		goto not_found;
	}

pte_table:
	VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
	if (!ptep)
		goto not_found;
	pte = ptep_get(ptep);

	entry_size = PAGE_SIZE;
	fw->level = FW_LEVEL_PTE;
	fw->ptep = ptep;
	fw->pte = pte;

	if (pte_present(pte)) {
		page = vm_normal_page(vma, addr, pte);
		if (page)
			goto found;
		if ((flags & FW_ZEROPAGE) &&
		    is_zero_pfn(pte_pfn(pte))) {
			page = pfn_to_page(pte_pfn(pte));
			expose_page = false;
			goto found;
		}
	} else if (!pte_none(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if ((flags & FW_MIGRATION) &&
		    is_migration_entry(entry)) {
			page = pfn_swap_entry_to_page(entry);
			expose_page = false;
			goto found;
		}
	}
	pte_unmap_unlock(ptep, ptl);
not_found:
	vma_pgtable_walk_end(vma);
	return NULL;
found:
	if (expose_page)
		/* Note: Offset from the mapped page, not the folio start. */
		fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT);
	else
		fw->page = NULL;
	fw->ptl = ptl;
	return page_folio(page);
}
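
/*
 * Illustrative usage sketch (not part of this file): look up the folio mapped
 * at @addr and take a short-term reference, with the mmap lock held in read
 * mode:
 *
 *	struct folio_walk fw;
 *	struct folio *folio;
 *
 *	folio = folio_walk_start(&fw, vma, addr, 0);
 *	if (folio) {
 *		folio_get(folio);
 *		folio_walk_end(&fw, vma);
 *	}
 */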