// SPDX-License-Identifier: GPL-2.0
/*
 * Implement mseal() syscall.
 *
 * Copyright (c) 2023,2024 Google, Inc.
 *
 * Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * mseal() disallows an input range that contains unmapped ranges (VMA
 * holes).
 *
 * It disallows unmapped regions from start to end whether they exist at the
 * start, in the middle, or at the end of the range, or any combination
 * thereof.
 *
 * This is because after sealing a range, there's nothing to stop memory
 * mapping of ranges in the remaining gaps later, meaning that the user might
 * then wrongly consider the entirety of the mseal()'d range to be sealed
 * when it in fact isn't.
 */
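
/*
 * For example (illustrative addresses only): given mappings
 * [0x1000, 0x2000) and [0x3000, 0x4000) with the pages in between
 * unmapped, sealing [0x1000, 0x4000) fails with -ENOMEM because
 * [0x2000, 0x3000) is a VMA hole, even though both endpoints lie
 * inside valid VMAs.
 */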

/*
 * Does the [start, end) range contain any unmapped memory?
 *
 * We ensure that:
 * - start is part of a valid VMA.
 * - end is part of a valid VMA.
 * - no gap (unallocated memory) exists between start and end.
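 *
 * Worked example (hypothetical addresses): with VMAs [0x1000, 0x2000)
 * and [0x3000, 0x4000) and a query of [0x1000, 0x4000), the first VMA
 * sets prev_end = 0x2000; the next VMA starts at 0x3000 > prev_end, so
 * we return true (a hole exists).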
 */
static bool range_contains_unmapped(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long prev_end = start;
	VMA_ITERATOR(vmi, mm, start);

	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > prev_end)
			return true;

		prev_end = vma->vm_end;
	}

	return prev_end < end;
}

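/*
 * Seal every VMA overlapping [start, end). vma_modify_flags() clamps
 * the flag change to the range, splitting (or merging) VMAs as needed:
 * for example, sealing [0x2000, 0x3000) out of a single VMA spanning
 * [0x1000, 0x5000) splits it so that only the middle part gains
 * VM_SEALED.
 *
 * Called with the mmap write lock held, after range_contains_unmapped()
 * has confirmed the range has no holes.
 */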
static int mseal_apply(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma, *prev;
	unsigned long curr_start = start;
	VMA_ITERATOR(vmi, mm, start);

	/* We know there are no gaps so this will be non-NULL. */
	vma = vma_iter_load(&vmi);
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	for_each_vma_range(vmi, vma, end) {
		unsigned long curr_end = MIN(vma->vm_end, end);

		if (!(vma->vm_flags & VM_SEALED)) {
			vma = vma_modify_flags(&vmi, prev, vma,
					curr_start, curr_end,
					vma->vm_flags | VM_SEALED);
			if (IS_ERR(vma))
				return PTR_ERR(vma);
			vm_flags_set(vma, VM_SEALED);
		}

		prev = vma;
		curr_start = curr_end;
	}

	return 0;
}

/*
 * mseal(2) seals a VMA's metadata against modification by selected
 * syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range given by addr/len must meet:
 * - start (addr) must be in a valid VMA.
 * - end (addr + len) must be in a valid VMA.
 * - no gap (unallocated memory) between start and end.
 * - start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * Below VMA operations are blocked after sealing:
 * 1> Unmapping, moving to another location, and shrinking the size,
 *    via munmap() and mremap(). These can leave an empty space in the
 *    address range, which can later be replaced by a VMA with a new
 *    set of attributes.
 * 2> Moving or expanding a different VMA into the current location,
 *    via mremap().
 * 3> Modifying a VMA via mmap(MAP_FIXED).
 * 4> Size expansion via mremap(). This does not appear to pose any
 *    specific risks to sealed VMAs, but is included anyway because
 *    the use case is unclear. In any case, users can rely on merging
 *    to expand a sealed VMA.
 * 5> mprotect() and pkey_mprotect().
 * 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for
 *    anonymous memory, when users don't have write permission to the
 *    memory. Those behaviors can alter region contents by discarding
 *    pages, effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   the address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *   sealing is not supported on 32-bit architectures.
 * Note:
 *  users can call mseal(2) multiple times; sealing already-sealed
 *  memory is a no-op (no error is returned).
 *
 *  unseal() is not supported.
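 *
 * Illustrative userspace sketch (assumes a kernel and headers that
 * define __NR_mseal; error handling omitted):
 *
 *	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	syscall(__NR_mseal, p, size, 0);  (returns 0)
 *	munmap(p, size);                  (now fails with EPERM)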
 */
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	/* Verify flags not set. */
	if (flags)
		return -EINVAL;

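	/*
	 * Strip any architecture-specific tag bits (e.g. arm64 MTE/TBI
	 * tags) from the address before checking alignment.
	 */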
	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/*
	 * Check whether a non-zero len wrapped around to zero when
	 * rounded up to a page boundary (i.e. len_in was within a page
	 * of the top of the address space).
	 */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	if (range_contains_unmapped(mm, start, end)) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Second pass: this should succeed unless vma_modify_flags()
	 * fails, e.g. due to a merge/split error or the process
	 * reaching the maximum supported VMA count; such cases should
	 * be rare.
	 */
	ret = mseal_apply(mm, start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}