// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt) "HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or are mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
        void (*remap_pte)(pte_t *pte, unsigned long addr,
                          struct vmemmap_remap_walk *walk);
        unsigned long nr_walked;
        struct page *reuse_page;
        unsigned long reuse_addr;
        struct list_head *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
        unsigned long flags;
};

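/*
 * Split a vmemmap PMD leaf mapping into a PTE table: allocate a new page
 * table page, fill it with PTEs covering the same range (backed by @head),
 * and install it under init_mm's page_table_lock. The TLB flush is skipped
 * when VMEMMAP_SPLIT_NO_TLB_FLUSH is set so that callers can batch it.
 */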
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
                             struct vmemmap_remap_walk *walk)
{
        pmd_t __pmd;
        int i;
        unsigned long addr = start;
        pte_t *pgtable;

        pgtable = pte_alloc_one_kernel(&init_mm);
        if (!pgtable)
                return -ENOMEM;

        pmd_populate_kernel(&init_mm, &__pmd, pgtable);

        for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                pgprot_t pgprot = PAGE_KERNEL;

                entry = mk_pte(head + i, pgprot);
                pte = pte_offset_kernel(&__pmd, addr);
                set_pte_at(&init_mm, addr, pte, entry);
        }

        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_leaf(*pmd))) {
                /*
                 * Higher order allocations from buddy allocator must be able to
                 * be treated as independent small pages (as they can be freed
                 * individually).
                 */
                if (!PageReserved(head))
                        split_page(head, get_order(PMD_SIZE));

                /* Make pte visible before pmd. See comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, pgtable);
                if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
                        flush_tlb_kernel_range(start, start + PMD_SIZE);
        } else {
                pte_free_kernel(&init_mm, pgtable);
        }
        spin_unlock(&init_mm.page_table_lock);

        return 0;
}

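/*
 * Per-PMD callback for the walk: bail out with -ENOTSUPP if the first
 * vmemmap page is self-hosted (memmap_on_memory), otherwise split a leaf
 * PMD so that the remap stage can operate on individual PTEs.
 */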
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        int ret = 0;
        struct page *head;
        struct vmemmap_remap_walk *vmemmap_walk = walk->private;

        /* Only splitting, not remapping the vmemmap pages. */
        if (!vmemmap_walk->remap_pte)
                walk->action = ACTION_CONTINUE;

        spin_lock(&init_mm.page_table_lock);
        head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
        /*
         * Due to HugeTLB alignment requirements and the vmemmap pages being
         * at the start of the hotplugged memory region in the
         * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
         * page associated with the first vmemmap page is self-hosted is
         * sufficient.
         *
         * [       hotplugged memory        ]
         * [ section ][...][    section     ]
         * [ vmemmap ][    usable memory    ]
         *   ^   |     ^                    |
         *   +---+     |                    |
         *             +--------------------+
         */
        if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
                struct page *page = head ? head + pte_index(addr) :
                                   pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

                if (PageVmemmapSelfHosted(page))
                        ret = -ENOTSUPP;
        }
        spin_unlock(&init_mm.page_table_lock);
        if (!head || ret)
                return ret;

        return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

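/*
 * Per-PTE callback for the walk: the first PTE visited provides the reuse
 * page; every subsequent PTE is handed to the walk's remap_pte() callback.
 */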
static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        struct vmemmap_remap_walk *vmemmap_walk = walk->private;

        /*
         * The reuse_page is found 'first' in page table walking before
         * starting remapping.
         */
        if (!vmemmap_walk->reuse_page)
                vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
        else
                vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
        vmemmap_walk->nr_walked++;

        return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
        .pmd_entry = vmemmap_pmd_entry,
        .pte_entry = vmemmap_pte_entry,
};

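/*
 * Walk the kernel page tables for [@start, @end) under init_mm's mmap lock
 * and, when PTEs were remapped and VMEMMAP_REMAP_NO_TLB_FLUSH is not set,
 * flush the TLB for the range afterwards.
 */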
static int vmemmap_remap_range(unsigned long start, unsigned long end,
                               struct vmemmap_remap_walk *walk)
{
        int ret;

        VM_BUG_ON(!PAGE_ALIGNED(start | end));

        mmap_read_lock(&init_mm);
        ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
                                           NULL, walk);
        mmap_read_unlock(&init_mm);
        if (ret)
                return ret;

        if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
                flush_tlb_kernel_range(start, end);

        return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
        if (PageReserved(page)) {
                memmap_boot_pages_add(-1);
                free_bootmem_page(page);
        } else {
                memmap_pages_add(-1);
                __free_page(page);
        }
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru)
                free_vmemmap_page(page);
}

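/*
 * remap_pte callback used when freeing vmemmap: point the PTE at the shared
 * reuse page (read-only for tail pages, read-write at the reuse address) and
 * move the page that previously backed the PTE onto walk->vmemmap_pages so
 * the caller can free it.
 */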
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                              struct vmemmap_remap_walk *walk)
{
        /*
         * Remap the tail pages as read-only to catch illegal write operations
         * to the tail pages.
         */
        pgprot_t pgprot = PAGE_KERNEL_RO;
        struct page *page = pte_page(ptep_get(pte));
        pte_t entry;

        /* Remapping the head page requires r/w */
        if (unlikely(addr == walk->reuse_addr)) {
                pgprot = PAGE_KERNEL;
                list_del(&walk->reuse_page->lru);

                /*
                 * Makes sure that preceding stores to the page contents from
                 * vmemmap_remap_free() become visible before the set_pte_at()
                 * write.
                 */
                smp_wmb();
        }

        entry = mk_pte(walk->reuse_page, pgprot);
        list_add(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by free_tail_page_prepare(). In order to avoid the message of
 * "corrupted mapping in tail page", we need to reset at least 4 struct page
 * structs (one head struct page and three tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	4

static inline void reset_struct_pages(struct page *start)
{
        struct page *from = start + NR_RESET_STRUCT_PAGE;

        BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
        memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

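/*
 * remap_pte callback used when restoring vmemmap: take a freshly allocated
 * page from walk->vmemmap_pages, copy the shared reuse page's contents into
 * it, reset the first struct pages, and point the PTE back at the new page.
 */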
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                                struct vmemmap_remap_walk *walk)
{
        pgprot_t pgprot = PAGE_KERNEL;
        struct page *page;
        void *to;

        BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

        page = list_first_entry(walk->vmemmap_pages, struct page, lru);
        list_del(&page->lru);
        to = page_to_virt(page);
        copy_page(to, (void *)walk->reuse_addr);
        reset_struct_pages(to);

        /*
         * Makes sure that preceding stores to the page contents become visible
         * before the set_pte_at() write.
         */
        smp_wmb();
        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
 *                       backing PMDs of the directmap into PTEs
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
                               unsigned long reuse)
{
        struct vmemmap_remap_walk walk = {
                .remap_pte = NULL,
                .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH,
        };

        /* See the comment in the vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed. It is the
 *			caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
                              unsigned long reuse,
                              struct list_head *vmemmap_pages,
                              unsigned long flags)
{
        int ret;
        struct vmemmap_remap_walk walk = {
                .remap_pte = vmemmap_remap_pte,
                .reuse_addr = reuse,
                .vmemmap_pages = vmemmap_pages,
                .flags = flags,
        };
        int nid = page_to_nid((struct page *)reuse);
        gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

        /*
         * Allocate a new head vmemmap page to avoid breaking a contiguous
         * block of struct page memory when freeing it back to the page
         * allocator in free_vmemmap_page_list(). This will allow the likely
         * contiguous struct page backing memory to be kept contiguous and
         * allowing for more allocations of hugepages. Fall back to the
         * currently mapped head page should the allocation fail.
         */
        walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
        if (walk.reuse_page) {
                copy_page(page_to_virt(walk.reuse_page),
                          (void *)walk.reuse_addr);
                list_add(&walk.reuse_page->lru, vmemmap_pages);
                memmap_pages_add(1);
        }

        /*
         * In order to make remapping routine most efficient for the huge pages,
         * the routine of vmemmap page table walking has the following rules
         * (see more details from the vmemmap_pte_range()):
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
         *   should be continuous.
         * - The @reuse address is part of the range [@reuse, @end) that we are
         *   walking which is passed to vmemmap_remap_range().
         * - The @reuse address is the first in the complete range.
         *
         * So we need to make sure that @start and @reuse meet the above rules.
         */
        BUG_ON(start - reuse != PAGE_SIZE);

        ret = vmemmap_remap_range(reuse, end, &walk);
        if (ret && walk.nr_walked) {
                end = reuse + walk.nr_walked * PAGE_SIZE;
                /*
                 * vmemmap_pages contains pages from the previous
                 * vmemmap_remap_range call which failed. These
                 * are pages which were removed from the vmemmap.
                 * They will be restored in the following call.
                 */
                walk = (struct vmemmap_remap_walk) {
                        .remap_pte = vmemmap_restore_pte,
                        .reuse_addr = reuse,
                        .vmemmap_pages = vmemmap_pages,
                        .flags = 0,
                };

                vmemmap_remap_range(reuse, end, &walk);
        }

        return ret;
}

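/*
 * Allocate one page per vmemmap page in [@start, @end) from the node that
 * backs @start and collect them on @list; on failure, free everything on
 * @list and return -ENOMEM.
 */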
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                   struct list_head *list)
{
        gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;
        int i;

        for (i = 0; i < nr_pages; i++) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add(&page->lru, list);
        }
        memmap_pages_add(nr_pages);

        return 0;
out:
        list_for_each_entry_safe(page, next, list, lru)
                __free_page(page);
        return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to the pages which are taken from the @vmemmap_pages
 *			 list, respectively.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                               unsigned long reuse, unsigned long flags)
{
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte = vmemmap_restore_pte,
                .reuse_addr = reuse,
                .vmemmap_pages = &vmemmap_pages,
                .flags = flags,
        };

        /* See the comment in the vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
                return -ENOMEM;

        return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
static int __init hugetlb_vmemmap_optimize_param(char *buf)
{
        return kstrtobool(buf, &vmemmap_optimize_enabled);
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);

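/*
 * Reallocate and remap the vmemmap pages of an HVO-optimized folio. On
 * success the folio's vmemmap-optimized flag is cleared and the static key
 * is decremented; returns 0 if the folio was not optimized to begin with.
 */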
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
                                           struct folio *folio, unsigned long flags)
{
        int ret;
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

        if (!folio_test_hugetlb_vmemmap_optimized(folio))
                return 0;

        if (flags & VMEMMAP_SYNCHRONIZE_RCU)
                synchronize_rcu();

        vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * The pages which the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end) are mapped to are freed to the buddy allocator, and
         * the range is mapped to the page which @vmemmap_reuse is mapped to.
         * When a HugeTLB page is freed to the buddy allocator, previously
         * discarded vmemmap pages must be allocated and remapped.
         */
        ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
        if (!ret) {
                folio_clear_hugetlb_vmemmap_optimized(folio);
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        }

        return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
        return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
                                    struct list_head *folio_list,
                                    struct list_head *non_hvo_folios)
{
        struct folio *folio, *t_folio;
        long restored = 0;
        long ret = 0;
        unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

        list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
                if (folio_test_hugetlb_vmemmap_optimized(folio)) {
                        ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
                        /* only need to synchronize_rcu() once for each batch */
                        flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

                        if (ret)
                                break;
                        restored++;
                }

                /* Add non-optimized folios to output list */
                list_move(&folio->lru, non_hvo_folios);
        }

        if (restored)
                flush_tlb_all();
        if (!ret)
                ret = restored;
        return ret;
}

/* Return true iff a HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
        if (folio_test_hugetlb_vmemmap_optimized(folio))
                return false;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(h))
                return false;

        return true;
}

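/*
 * Remap @folio's tail vmemmap pages to the shared head page and collect the
 * pages that previously backed them on @vmemmap_pages. The optimized flag is
 * set before remapping (see the "Very Subtle" comment below) and rolled back,
 * together with the static key, if the remap fails.
 */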
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
                                            struct folio *folio,
                                            struct list_head *vmemmap_pages,
                                            unsigned long flags)
{
        int ret = 0;
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

        if (!vmemmap_should_optimize_folio(h, folio))
                return ret;

        static_branch_inc(&hugetlb_optimize_vmemmap_key);

        if (flags & VMEMMAP_SYNCHRONIZE_RCU)
                synchronize_rcu();
        /*
         * Very Subtle
         * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
         * immediately after remapping. As a result, subsequent accesses
         * and modifications to struct pages associated with the hugetlb
         * page could be to the OLD struct pages. Set the vmemmap optimized
         * flag here so that it is copied to the new head page. This keeps
         * the old and new struct pages in sync.
         * If there is an error during optimization, we will immediately FLUSH
         * the TLB and clear the flag below.
         */
        folio_set_hugetlb_vmemmap_optimized(folio);

        vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
         * to the page which @vmemmap_reuse is mapped to. Add pages previously
         * mapping the range to vmemmap_pages list so that they can be freed by
         * the caller.
         */
        ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
                                 vmemmap_pages, flags);
        if (ret) {
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
                folio_clear_hugetlb_vmemmap_optimized(folio);
        }

        return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
        LIST_HEAD(vmemmap_pages);

        __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
        free_vmemmap_page_list(&vmemmap_pages);
}

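/*
 * Pre-split the vmemmap PMDs covering @folio so that the later remap stage
 * only has to touch PTEs; returns 0 when the folio should not be optimized.
 */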
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!vmemmap_should_optimize_folio(h, folio))
                return 0;

        vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end]
         */
        return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

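/*
 * Optimize every folio on @folio_list with batched TLB flushing: split the
 * vmemmap PMDs first, flush once, then remap the folios while accumulating
 * the freed vmemmap pages. @boot selects the bootmem (pre-HVO) path, where
 * already-optimized folios only need their mirrored tail page structs
 * write-protected and registered.
 */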
static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
                                              struct list_head *folio_list,
                                              bool boot)
{
        struct folio *folio;
        int nr_to_optimize;
        LIST_HEAD(vmemmap_pages);
        unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

        nr_to_optimize = 0;
        list_for_each_entry(folio, folio_list, lru) {
                int ret;
                unsigned long spfn, epfn;

                if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
                        /*
                         * Already optimized by pre-HVO, just map the
                         * mirrored tail page structs RO.
                         */
                        spfn = (unsigned long)&folio->page;
                        epfn = spfn + pages_per_huge_page(h);
                        vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
                                              HUGETLB_VMEMMAP_RESERVE_SIZE);
                        register_page_bootmem_memmap(pfn_to_section_nr(spfn),
                                                     &folio->page,
                                                     HUGETLB_VMEMMAP_RESERVE_SIZE);
                        static_branch_inc(&hugetlb_optimize_vmemmap_key);
                        continue;
                }

                nr_to_optimize++;

                ret = hugetlb_vmemmap_split_folio(h, folio);

                /*
                 * Splitting the PMD requires allocating a page, so let's fail
                 * early once we encounter the first OOM. There is no point in
                 * retrying, as it can be dynamically done on remap with the
                 * memory we get back from the vmemmap deduplication.
                 */
                if (ret == -ENOMEM)
                        break;
        }

        if (!nr_to_optimize)
                /*
                 * All pre-HVO folios, nothing left to do. It's ok if
                 * there is a mix of pre-HVO and not yet HVO-ed folios
                 * here, as __hugetlb_vmemmap_optimize_folio() will
                 * skip any folios that already have the optimized flag
                 * set, see vmemmap_should_optimize_folio().
                 */
                goto out;

        flush_tlb_all();

        list_for_each_entry(folio, folio_list, lru) {
                int ret;

                ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
                /* only need to synchronize_rcu() once for each batch */
                flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

                /*
                 * Pages to be freed may have been accumulated. If we
                 * encounter an ENOMEM, free what we have and try again.
                 * This can occur in the case that both splitting fails
                 * halfway and head page allocation also failed. In this
                 * case __hugetlb_vmemmap_optimize_folio() would free memory
                 * allowing more vmemmap remaps to occur.
                 */
                if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
                        flush_tlb_all();
                        free_vmemmap_page_list(&vmemmap_pages);
                        INIT_LIST_HEAD(&vmemmap_pages);
                        __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
                }
        }

out:
        flush_tlb_all();
        free_vmemmap_page_list(&vmemmap_pages);
}

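/* Optimize vmemmap for a list of runtime-allocated folios. */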
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
        __hugetlb_vmemmap_optimize_folios(h, folio_list, false);
}

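/* Optimize vmemmap for a list of bootmem-allocated (possibly pre-HVO) folios. */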
void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
{
        __hugetlb_vmemmap_optimize_folios(h, folio_list, true);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT

/* Return true if a bootmem allocated HugeTLB page should be pre-HVO-ed */
static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
{
        unsigned long section_size, psize, pmd_vmemmap_size;
        phys_addr_t paddr;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(m->hstate))
                return false;

        psize = huge_page_size(m->hstate);
        paddr = virt_to_phys(m);

        /*
         * Pre-HVO only works if the bootmem huge page
         * is aligned to the section size.
         */
        section_size = (1UL << PA_SECTION_SHIFT);
        if (!IS_ALIGNED(paddr, section_size) ||
            !IS_ALIGNED(psize, section_size))
                return false;

        /*
         * The pre-HVO code does not deal with splitting PMDs,
         * so the bootmem page must be aligned to the number
         * of base pages that can be mapped with one vmemmap PMD.
         */
        pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
        if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
            !IS_ALIGNED(psize, pmd_vmemmap_size))
                return false;

        return true;
}

/*
 * Initialize memmap section for a gigantic page, HVO-style.
 */
void __init hugetlb_vmemmap_init_early(int nid)
{
        unsigned long psize, paddr, section_size;
        unsigned long ns, i, pnum, pfn, nr_pages;
        unsigned long start, end;
        struct huge_bootmem_page *m = NULL;
        void *map;

        /*
         * Nothing to do if bootmem pages were not allocated
         * early in boot, or if HVO wasn't enabled in the
         * first place.
         */
        if (!hugetlb_bootmem_allocated())
                return;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return;

        section_size = (1UL << PA_SECTION_SHIFT);

        list_for_each_entry(m, &huge_boot_pages[nid], list) {
                if (!vmemmap_should_optimize_bootmem_page(m))
                        continue;

                nr_pages = pages_per_huge_page(m->hstate);
                psize = nr_pages << PAGE_SHIFT;
                paddr = virt_to_phys(m);
                pfn = PHYS_PFN(paddr);
                map = pfn_to_page(pfn);
                start = (unsigned long)map;
                end = start + nr_pages * sizeof(struct page);

                if (vmemmap_populate_hvo(start, end, nid,
                                         HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
                        continue;

                memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);

                pnum = pfn_to_section_nr(pfn);
                ns = psize / section_size;

                for (i = 0; i < ns; i++) {
                        sparse_init_early_section(nid, map, pnum,
                                                  SECTION_IS_VMEMMAP_PREINIT);
                        map += section_map_size();
                        pnum++;
                }

                m->flags |= HUGE_BOOTMEM_HVO;
        }
}

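/*
 * Late check of pre-HVO-ed bootmem pages: any page whose zones turn out to
 * be invalid is removed from the list, has its HVO undone, and is returned
 * to memblock; valid pages are marked HUGE_BOOTMEM_ZONES_VALID.
 */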
void __init hugetlb_vmemmap_init_late(int nid)
{
        struct huge_bootmem_page *m, *tm;
        unsigned long phys, nr_pages, start, end;
        unsigned long pfn, nr_mmap;
        struct hstate *h;
        void *map;

        if (!hugetlb_bootmem_allocated())
                return;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return;

        list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
                if (!(m->flags & HUGE_BOOTMEM_HVO))
                        continue;

                phys = virt_to_phys(m);
                h = m->hstate;
                pfn = PHYS_PFN(phys);
                nr_pages = pages_per_huge_page(h);

                if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
                        /*
                         * Oops, the hugetlb page spans multiple zones.
                         * Remove it from the list, and undo HVO.
                         */
                        list_del(&m->list);

                        map = pfn_to_page(pfn);

                        start = (unsigned long)map;
                        end = start + nr_pages * sizeof(struct page);

                        vmemmap_undo_hvo(start, end, nid,
                                         HUGETLB_VMEMMAP_RESERVE_SIZE);
                        nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
                        memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));

                        memblock_phys_free(phys, huge_page_size(h));
                        continue;
                } else
                        m->flags |= HUGE_BOOTMEM_ZONES_VALID;
        }
}
#endif

static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
        {
                .procname	= "hugetlb_optimize_vmemmap",
                .data		= &vmemmap_optimize_enabled,
                .maxlen		= sizeof(vmemmap_optimize_enabled),
                .mode		= 0644,
                .proc_handler	= proc_dobool,
        },
};

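/*
 * Register the "vm.hugetlb_optimize_vmemmap" sysctl if at least one hstate
 * can be optimized.
 */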
static int __init hugetlb_vmemmap_init(void)
{
        const struct hstate *h;

        /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
        BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

        for_each_hstate(h) {
                if (hugetlb_vmemmap_optimizable(h)) {
                        register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
                        break;
                }
        }
        return 0;
}
late_initcall(hugetlb_vmemmap_init);
| 915 | |