| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include <linux/mm.h> |
| 3 | #include <linux/rmap.h> |
| 4 | #include <linux/hugetlb.h> |
| 5 | #include <linux/swap.h> |
| 6 | #include <linux/swapops.h> |
| 7 | |
| 8 | #include "internal.h" |
| 9 | |
/*
 * Finish the walk through page_vma_mapped_walk_done() and report "no
 * mapping", so that exit paths can simply "return not_found(pvmw);".
 */
static inline bool not_found(struct page_vma_mapped_walk *pvmw)
{
	page_vma_mapped_walk_done(pvmw);

	return false;
}
| 15 | |
| 16 | static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, |
| 17 | spinlock_t **ptlp) |
| 18 | { |
| 19 | pte_t ptent; |
| 20 | |
| 21 | if (pvmw->flags & PVMW_SYNC) { |
| 22 | /* Use the stricter lookup */ |
| 23 | pvmw->pte = pte_offset_map_lock(mm: pvmw->vma->vm_mm, pmd: pvmw->pmd, |
| 24 | addr: pvmw->address, ptlp: &pvmw->ptl); |
| 25 | *ptlp = pvmw->ptl; |
| 26 | return !!pvmw->pte; |
| 27 | } |
| 28 | |
| 29 | again: |
| 30 | /* |
| 31 | * It is important to return the ptl corresponding to pte, |
| 32 | * in case *pvmw->pmd changes underneath us; so we need to |
| 33 | * return it even when choosing not to lock, in case caller |
| 34 | * proceeds to loop over next ptes, and finds a match later. |
| 35 | * Though, in most cases, page lock already protects this. |
| 36 | */ |
| 37 | pvmw->pte = pte_offset_map_rw_nolock(mm: pvmw->vma->vm_mm, pmd: pvmw->pmd, |
| 38 | addr: pvmw->address, pmdvalp, ptlp); |
| 39 | if (!pvmw->pte) |
| 40 | return false; |
| 41 | |
| 42 | ptent = ptep_get(ptep: pvmw->pte); |
| 43 | |
| 44 | if (pvmw->flags & PVMW_MIGRATION) { |
| 45 | if (!is_swap_pte(pte: ptent)) |
| 46 | return false; |
| 47 | } else if (is_swap_pte(pte: ptent)) { |
| 48 | swp_entry_t entry; |
| 49 | /* |
| 50 | * Handle un-addressable ZONE_DEVICE memory. |
| 51 | * |
| 52 | * We get here when we are trying to unmap a private |
| 53 | * device page from the process address space. Such |
| 54 | * page is not CPU accessible and thus is mapped as |
| 55 | * a special swap entry, nonetheless it still does |
| 56 | * count as a valid regular mapping for the page |
| 57 | * (and is accounted as such in page maps count). |
| 58 | * |
| 59 | * So handle this special case as if it was a normal |
| 60 | * page mapping ie lock CPU page table and return true. |
| 61 | * |
| 62 | * For more details on device private memory see HMM |
| 63 | * (include/linux/hmm.h or mm/hmm.c). |
| 64 | */ |
| 65 | entry = pte_to_swp_entry(pte: ptent); |
| 66 | if (!is_device_private_entry(entry) && |
| 67 | !is_device_exclusive_entry(entry)) |
| 68 | return false; |
| 69 | } else if (!pte_present(a: ptent)) { |
| 70 | return false; |
| 71 | } |
| 72 | spin_lock(lock: *ptlp); |
| 73 | if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) { |
| 74 | pte_unmap_unlock(pvmw->pte, *ptlp); |
| 75 | goto again; |
| 76 | } |
| 77 | pvmw->ptl = *ptlp; |
| 78 | |
| 79 | return true; |
| 80 | } |
| 81 | |
| 82 | /** |
| 83 | * check_pte - check if [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) is |
| 84 | * mapped at the @pvmw->pte |
| 85 | * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range |
| 86 | * for checking |
| 87 | * @pte_nr: the number of small pages described by @pvmw->pte. |
| 88 | * |
| 89 | * page_vma_mapped_walk() found a place where pfn range is *potentially* |
| 90 | * mapped. check_pte() has to validate this. |
| 91 | * |
| 92 | * pvmw->pte may point to empty PTE, swap PTE or PTE pointing to |
| 93 | * arbitrary page. |
| 94 | * |
| 95 | * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration |
| 96 | * entry that points to [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) |
| 97 | * |
| 98 | * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to |
| 99 | * [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) |
| 100 | * |
| 101 | * Otherwise, return false. |
| 102 | * |
| 103 | */ |
| 104 | static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr) |
| 105 | { |
| 106 | unsigned long pfn; |
| 107 | pte_t ptent = ptep_get(ptep: pvmw->pte); |
| 108 | |
| 109 | if (pvmw->flags & PVMW_MIGRATION) { |
| 110 | swp_entry_t entry; |
| 111 | if (!is_swap_pte(pte: ptent)) |
| 112 | return false; |
| 113 | entry = pte_to_swp_entry(pte: ptent); |
| 114 | |
| 115 | if (!is_migration_entry(entry)) |
| 116 | return false; |
| 117 | |
| 118 | pfn = swp_offset_pfn(entry); |
| 119 | } else if (is_swap_pte(pte: ptent)) { |
| 120 | swp_entry_t entry; |
| 121 | |
| 122 | /* Handle un-addressable ZONE_DEVICE memory */ |
| 123 | entry = pte_to_swp_entry(pte: ptent); |
| 124 | if (!is_device_private_entry(entry) && |
| 125 | !is_device_exclusive_entry(entry)) |
| 126 | return false; |
| 127 | |
| 128 | pfn = swp_offset_pfn(entry); |
| 129 | } else { |
| 130 | if (!pte_present(a: ptent)) |
| 131 | return false; |
| 132 | |
| 133 | pfn = pte_pfn(pte: ptent); |
| 134 | } |
| 135 | |
| 136 | if ((pfn + pte_nr - 1) < pvmw->pfn) |
| 137 | return false; |
| 138 | if (pfn > (pvmw->pfn + pvmw->nr_pages - 1)) |
| 139 | return false; |
| 140 | return true; |
| 141 | } |
| 142 | |
| 143 | /* Returns true if the two ranges overlap. Careful to not overflow. */ |
| 144 | static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw) |
| 145 | { |
| 146 | if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn) |
| 147 | return false; |
| 148 | if (pfn > pvmw->pfn + pvmw->nr_pages - 1) |
| 149 | return false; |
| 150 | return true; |
| 151 | } |
| 152 | |
| 153 | static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) |
| 154 | { |
| 155 | pvmw->address = (pvmw->address + size) & ~(size - 1); |
| 156 | if (!pvmw->address) |
| 157 | pvmw->address = ULONG_MAX; |
| 158 | } |
| 159 | |
| 160 | /** |
| 161 | * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at |
| 162 | * @pvmw->address |
| 163 | * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags |
| 164 | * must be set. pmd, pte and ptl must be NULL. |
| 165 | * |
| 166 | * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point |
| 167 | * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is |
| 168 | * adjusted if needed (for PTE-mapped THPs). |
| 169 | * |
| 170 | * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page |
| 171 | * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in |
| 172 | * a loop to find all PTEs that map the THP. |
| 173 | * |
| 174 | * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry |
| 175 | * regardless of which page table level the page is mapped at. @pvmw->pmd is |
| 176 | * NULL. |
| 177 | * |
| 178 | * Returns false if there are no more page table entries for the page in |
| 179 | * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. |
| 180 | * |
| 181 | * If you need to stop the walk before page_vma_mapped_walk() returned false, |
| 182 | * use page_vma_mapped_walk_done(). It will do the housekeeping. |
| 183 | */ |
| 184 | bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) |
| 185 | { |
| 186 | struct vm_area_struct *vma = pvmw->vma; |
| 187 | struct mm_struct *mm = vma->vm_mm; |
| 188 | unsigned long end; |
| 189 | spinlock_t *ptl; |
| 190 | pgd_t *pgd; |
| 191 | p4d_t *p4d; |
| 192 | pud_t *pud; |
| 193 | pmd_t pmde; |
| 194 | |
| 195 | /* The only possible pmd mapping has been handled on last iteration */ |
| 196 | if (pvmw->pmd && !pvmw->pte) |
| 197 | return not_found(pvmw); |
| 198 | |
| 199 | if (unlikely(is_vm_hugetlb_page(vma))) { |
| 200 | struct hstate *hstate = hstate_vma(vma); |
| 201 | unsigned long size = huge_page_size(h: hstate); |
| 202 | /* The only possible mapping was handled on last iteration */ |
| 203 | if (pvmw->pte) |
| 204 | return not_found(pvmw); |
| 205 | /* |
| 206 | * All callers that get here will already hold the |
| 207 | * i_mmap_rwsem. Therefore, no additional locks need to be |
| 208 | * taken before calling hugetlb_walk(). |
| 209 | */ |
| 210 | pvmw->pte = hugetlb_walk(vma, addr: pvmw->address, sz: size); |
| 211 | if (!pvmw->pte) |
| 212 | return false; |
| 213 | |
| 214 | pvmw->ptl = huge_pte_lock(h: hstate, mm, pte: pvmw->pte); |
| 215 | if (!check_pte(pvmw, pte_nr: pages_per_huge_page(h: hstate))) |
| 216 | return not_found(pvmw); |
| 217 | return true; |
| 218 | } |
| 219 | |
| 220 | end = vma_address_end(pvmw); |
| 221 | if (pvmw->pte) |
| 222 | goto next_pte; |
| 223 | restart: |
| 224 | do { |
| 225 | pgd = pgd_offset(mm, pvmw->address); |
| 226 | if (!pgd_present(pgd: *pgd)) { |
| 227 | step_forward(pvmw, PGDIR_SIZE); |
| 228 | continue; |
| 229 | } |
| 230 | p4d = p4d_offset(pgd, address: pvmw->address); |
| 231 | if (!p4d_present(p4d: *p4d)) { |
| 232 | step_forward(pvmw, P4D_SIZE); |
| 233 | continue; |
| 234 | } |
| 235 | pud = pud_offset(p4d, address: pvmw->address); |
| 236 | if (!pud_present(pud: *pud)) { |
| 237 | step_forward(pvmw, PUD_SIZE); |
| 238 | continue; |
| 239 | } |
| 240 | |
| 241 | pvmw->pmd = pmd_offset(pud, address: pvmw->address); |
| 242 | /* |
| 243 | * Make sure the pmd value isn't cached in a register by the |
| 244 | * compiler and used as a stale value after we've observed a |
| 245 | * subsequent update. |
| 246 | */ |
| 247 | pmde = pmdp_get_lockless(pmdp: pvmw->pmd); |
| 248 | |
| 249 | if (pmd_trans_huge(pmd: pmde) || is_pmd_migration_entry(pmd: pmde)) { |
| 250 | pvmw->ptl = pmd_lock(mm, pmd: pvmw->pmd); |
| 251 | pmde = *pvmw->pmd; |
| 252 | if (!pmd_present(pmd: pmde)) { |
| 253 | swp_entry_t entry; |
| 254 | |
| 255 | if (!thp_migration_supported() || |
| 256 | !(pvmw->flags & PVMW_MIGRATION)) |
| 257 | return not_found(pvmw); |
| 258 | entry = pmd_to_swp_entry(pmd: pmde); |
| 259 | if (!is_migration_entry(entry) || |
| 260 | !check_pmd(pfn: swp_offset_pfn(entry), pvmw)) |
| 261 | return not_found(pvmw); |
| 262 | return true; |
| 263 | } |
| 264 | if (likely(pmd_trans_huge(pmde))) { |
| 265 | if (pvmw->flags & PVMW_MIGRATION) |
| 266 | return not_found(pvmw); |
| 267 | if (!check_pmd(pfn: pmd_pfn(pmd: pmde), pvmw)) |
| 268 | return not_found(pvmw); |
| 269 | return true; |
| 270 | } |
| 271 | /* THP pmd was split under us: handle on pte level */ |
| 272 | spin_unlock(lock: pvmw->ptl); |
| 273 | pvmw->ptl = NULL; |
| 274 | } else if (!pmd_present(pmd: pmde)) { |
| 275 | /* |
| 276 | * If PVMW_SYNC, take and drop THP pmd lock so that we |
| 277 | * cannot return prematurely, while zap_huge_pmd() has |
| 278 | * cleared *pmd but not decremented compound_mapcount(). |
| 279 | */ |
| 280 | if ((pvmw->flags & PVMW_SYNC) && |
| 281 | thp_vma_suitable_order(vma, addr: pvmw->address, |
| 282 | PMD_ORDER) && |
| 283 | (pvmw->nr_pages >= HPAGE_PMD_NR)) { |
| 284 | spinlock_t *ptl = pmd_lock(mm, pmd: pvmw->pmd); |
| 285 | |
| 286 | spin_unlock(lock: ptl); |
| 287 | } |
| 288 | step_forward(pvmw, PMD_SIZE); |
| 289 | continue; |
| 290 | } |
| 291 | if (!map_pte(pvmw, pmdvalp: &pmde, ptlp: &ptl)) { |
| 292 | if (!pvmw->pte) |
| 293 | goto restart; |
| 294 | goto next_pte; |
| 295 | } |
| 296 | this_pte: |
| 297 | if (check_pte(pvmw, pte_nr: 1)) |
| 298 | return true; |
| 299 | next_pte: |
| 300 | do { |
| 301 | pvmw->address += PAGE_SIZE; |
| 302 | if (pvmw->address >= end) |
| 303 | return not_found(pvmw); |
| 304 | /* Did we cross page table boundary? */ |
| 305 | if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { |
| 306 | if (pvmw->ptl) { |
| 307 | spin_unlock(lock: pvmw->ptl); |
| 308 | pvmw->ptl = NULL; |
| 309 | } |
| 310 | pte_unmap(pte: pvmw->pte); |
| 311 | pvmw->pte = NULL; |
| 312 | pvmw->flags |= PVMW_PGTABLE_CROSSED; |
| 313 | goto restart; |
| 314 | } |
| 315 | pvmw->pte++; |
| 316 | } while (pte_none(pte: ptep_get(ptep: pvmw->pte))); |
| 317 | |
| 318 | if (!pvmw->ptl) { |
| 319 | spin_lock(lock: ptl); |
| 320 | if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) { |
| 321 | pte_unmap_unlock(pvmw->pte, ptl); |
| 322 | pvmw->pte = NULL; |
| 323 | goto restart; |
| 324 | } |
| 325 | pvmw->ptl = ptl; |
| 326 | } |
| 327 | goto this_pte; |
| 328 | } while (pvmw->address < end); |
| 329 | |
| 330 | return false; |
| 331 | } |
| 332 | |
#ifdef CONFIG_MEMORY_FAILURE
/**
 * page_mapped_in_vma - check whether a page is really mapped in a VMA
 * @page: the page to test
 * @vma: the VMA to test
 *
 * Runs a single-page PVMW_SYNC walk for @page starting at the address
 * vma_address() computes for it in @vma.
 *
 * Return: The address at which the walk found the page mapped. -EFAULT if
 * vma_address() reports the page as outside the VMA, or if the walk finds
 * no mapping.
 * Only valid for normal file or anonymous VMAs.
 */
unsigned long page_mapped_in_vma(const struct page *page,
		struct vm_area_struct *vma)
{
	const struct folio *folio = page_folio(page);
	struct page_vma_mapped_walk pvmw = {
		.vma = vma,
		.pfn = page_to_pfn(page),
		.nr_pages = 1,
		.flags = PVMW_SYNC,
	};

	pvmw.address = vma_address(vma, page_pgoff(folio, page), 1);
	if (pvmw.address != -EFAULT) {
		if (!page_vma_mapped_walk(&pvmw))
			return -EFAULT;
		/* Mapping found: release what the walk still holds */
		page_vma_mapped_walk_done(&pvmw);
	}

	return pvmw.address;
}
#endif
| 365 | |