| 1 | // SPDX-License-Identifier: GPL-2.0 | 
|---|
| 2 | /* | 
|---|
| 3 | *  linux/fs/ext4/block_validity.c | 
|---|
| 4 | * | 
|---|
| 5 | * Copyright (C) 2009 | 
|---|
| 6 | * Theodore Ts'o (tytso@mit.edu) | 
|---|
| 7 | * | 
|---|
| 8 | * Track which blocks in the filesystem are metadata blocks that | 
|---|
| 9 | * should never be used as data blocks by files or directories. | 
|---|
| 10 | */ | 
|---|
| 11 |  | 
|---|
| 12 | #include <linux/time.h> | 
|---|
| 13 | #include <linux/fs.h> | 
|---|
| 14 | #include <linux/namei.h> | 
|---|
| 15 | #include <linux/quotaops.h> | 
|---|
| 16 | #include <linux/buffer_head.h> | 
|---|
| 17 | #include <linux/swap.h> | 
|---|
| 18 | #include <linux/pagemap.h> | 
|---|
| 19 | #include <linux/blkdev.h> | 
|---|
| 20 | #include <linux/slab.h> | 
|---|
| 21 | #include "ext4.h" | 
|---|
| 22 |  | 
|---|
| 23 | struct ext4_system_zone { | 
|---|
| 24 | struct rb_node	node; | 
|---|
| 25 | ext4_fsblk_t	start_blk; | 
|---|
| 26 | unsigned int	count; | 
|---|
| 27 | u32		ino; | 
|---|
| 28 | }; | 
|---|
| 29 |  | 
|---|
| 30 | static struct kmem_cache *ext4_system_zone_cachep; | 
|---|
| 31 |  | 
|---|
| 32 | int __init ext4_init_system_zone(void) | 
|---|
| 33 | { | 
|---|
| 34 | ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); | 
|---|
| 35 | if (ext4_system_zone_cachep == NULL) | 
|---|
| 36 | return -ENOMEM; | 
|---|
| 37 | return 0; | 
|---|
| 38 | } | 
|---|
| 39 |  | 
|---|
| 40 | void ext4_exit_system_zone(void) | 
|---|
| 41 | { | 
|---|
| 42 | rcu_barrier(); | 
|---|
| 43 | kmem_cache_destroy(s: ext4_system_zone_cachep); | 
|---|
| 44 | } | 
|---|
| 45 |  | 
|---|
| 46 | static inline int can_merge(struct ext4_system_zone *entry1, | 
|---|
| 47 | struct ext4_system_zone *entry2) | 
|---|
| 48 | { | 
|---|
| 49 | if ((entry1->start_blk + entry1->count) == entry2->start_blk && | 
|---|
| 50 | entry1->ino == entry2->ino) | 
|---|
| 51 | return 1; | 
|---|
| 52 | return 0; | 
|---|
| 53 | } | 
|---|
| 54 |  | 
|---|
| 55 | static void release_system_zone(struct ext4_system_blocks *system_blks) | 
|---|
| 56 | { | 
|---|
| 57 | struct ext4_system_zone	*entry, *n; | 
|---|
| 58 |  | 
|---|
| 59 | rbtree_postorder_for_each_entry_safe(entry, n, | 
|---|
| 60 | &system_blks->root, node) | 
|---|
| 61 | kmem_cache_free(s: ext4_system_zone_cachep, objp: entry); | 
|---|
| 62 | } | 
|---|
| 63 |  | 
|---|
| 64 | /* | 
|---|
| 65 | * Mark a range of blocks as belonging to the "system zone" --- that | 
|---|
| 66 | * is, filesystem metadata blocks which should never be used by | 
|---|
| 67 | * inodes. | 
|---|
| 68 | */ | 
|---|
| 69 | static int add_system_zone(struct ext4_system_blocks *system_blks, | 
|---|
| 70 | ext4_fsblk_t start_blk, | 
|---|
| 71 | unsigned int count, u32 ino) | 
|---|
| 72 | { | 
|---|
| 73 | struct ext4_system_zone *new_entry, *entry; | 
|---|
| 74 | struct rb_node **n = &system_blks->root.rb_node, *node; | 
|---|
| 75 | struct rb_node *parent = NULL, *new_node; | 
|---|
| 76 |  | 
|---|
| 77 | while (*n) { | 
|---|
| 78 | parent = *n; | 
|---|
| 79 | entry = rb_entry(parent, struct ext4_system_zone, node); | 
|---|
| 80 | if (start_blk < entry->start_blk) | 
|---|
| 81 | n = &(*n)->rb_left; | 
|---|
| 82 | else if (start_blk >= (entry->start_blk + entry->count)) | 
|---|
| 83 | n = &(*n)->rb_right; | 
|---|
| 84 | else	/* Unexpected overlap of system zones. */ | 
|---|
| 85 | return -EFSCORRUPTED; | 
|---|
| 86 | } | 
|---|
| 87 |  | 
|---|
| 88 | new_entry = kmem_cache_alloc(ext4_system_zone_cachep, | 
|---|
| 89 | GFP_KERNEL); | 
|---|
| 90 | if (!new_entry) | 
|---|
| 91 | return -ENOMEM; | 
|---|
| 92 | new_entry->start_blk = start_blk; | 
|---|
| 93 | new_entry->count = count; | 
|---|
| 94 | new_entry->ino = ino; | 
|---|
| 95 | new_node = &new_entry->node; | 
|---|
| 96 |  | 
|---|
| 97 | rb_link_node(node: new_node, parent, rb_link: n); | 
|---|
| 98 | rb_insert_color(new_node, &system_blks->root); | 
|---|
| 99 |  | 
|---|
| 100 | /* Can we merge to the left? */ | 
|---|
| 101 | node = rb_prev(new_node); | 
|---|
| 102 | if (node) { | 
|---|
| 103 | entry = rb_entry(node, struct ext4_system_zone, node); | 
|---|
| 104 | if (can_merge(entry1: entry, entry2: new_entry)) { | 
|---|
| 105 | new_entry->start_blk = entry->start_blk; | 
|---|
| 106 | new_entry->count += entry->count; | 
|---|
| 107 | rb_erase(node, &system_blks->root); | 
|---|
| 108 | kmem_cache_free(s: ext4_system_zone_cachep, objp: entry); | 
|---|
| 109 | } | 
|---|
| 110 | } | 
|---|
| 111 |  | 
|---|
| 112 | /* Can we merge to the right? */ | 
|---|
| 113 | node = rb_next(new_node); | 
|---|
| 114 | if (node) { | 
|---|
| 115 | entry = rb_entry(node, struct ext4_system_zone, node); | 
|---|
| 116 | if (can_merge(entry1: new_entry, entry2: entry)) { | 
|---|
| 117 | new_entry->count += entry->count; | 
|---|
| 118 | rb_erase(node, &system_blks->root); | 
|---|
| 119 | kmem_cache_free(s: ext4_system_zone_cachep, objp: entry); | 
|---|
| 120 | } | 
|---|
| 121 | } | 
|---|
| 122 | return 0; | 
|---|
| 123 | } | 
|---|
| 124 |  | 
|---|
| 125 | static void debug_print_tree(struct ext4_sb_info *sbi) | 
|---|
| 126 | { | 
|---|
| 127 | struct rb_node *node; | 
|---|
| 128 | struct ext4_system_zone *entry; | 
|---|
| 129 | struct ext4_system_blocks *system_blks; | 
|---|
| 130 | int first = 1; | 
|---|
| 131 |  | 
|---|
| 132 | printk(KERN_INFO "System zones: "); | 
|---|
| 133 | rcu_read_lock(); | 
|---|
| 134 | system_blks = rcu_dereference(sbi->s_system_blks); | 
|---|
| 135 | node = rb_first(&system_blks->root); | 
|---|
| 136 | while (node) { | 
|---|
| 137 | entry = rb_entry(node, struct ext4_system_zone, node); | 
|---|
| 138 | printk(KERN_CONT "%s%llu-%llu", first ? "": ", ", | 
|---|
| 139 | entry->start_blk, entry->start_blk + entry->count - 1); | 
|---|
| 140 | first = 0; | 
|---|
| 141 | node = rb_next(node); | 
|---|
| 142 | } | 
|---|
| 143 | rcu_read_unlock(); | 
|---|
| 144 | printk(KERN_CONT "\n"); | 
|---|
| 145 | } | 
|---|
| 146 |  | 
|---|
| 147 | static int ext4_protect_reserved_inode(struct super_block *sb, | 
|---|
| 148 | struct ext4_system_blocks *system_blks, | 
|---|
| 149 | u32 ino) | 
|---|
| 150 | { | 
|---|
| 151 | struct inode *inode; | 
|---|
| 152 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 
|---|
| 153 | struct ext4_map_blocks map; | 
|---|
| 154 | u32 i = 0, num; | 
|---|
| 155 | int err = 0, n; | 
|---|
| 156 |  | 
|---|
| 157 | if ((ino < EXT4_ROOT_INO) || | 
|---|
| 158 | (ino > le32_to_cpu(sbi->s_es->s_inodes_count))) | 
|---|
| 159 | return -EINVAL; | 
|---|
| 160 | inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL); | 
|---|
| 161 | if (IS_ERR(ptr: inode)) | 
|---|
| 162 | return PTR_ERR(ptr: inode); | 
|---|
| 163 | num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; | 
|---|
| 164 | while (i < num) { | 
|---|
| 165 | cond_resched(); | 
|---|
| 166 | map.m_lblk = i; | 
|---|
| 167 | map.m_len = num - i; | 
|---|
| 168 | n = ext4_map_blocks(NULL, inode, map: &map, flags: 0); | 
|---|
| 169 | if (n < 0) { | 
|---|
| 170 | err = n; | 
|---|
| 171 | break; | 
|---|
| 172 | } | 
|---|
| 173 | if (n == 0) { | 
|---|
| 174 | i++; | 
|---|
| 175 | } else { | 
|---|
| 176 | err = add_system_zone(system_blks, start_blk: map.m_pblk, count: n, ino); | 
|---|
| 177 | if (err < 0) { | 
|---|
| 178 | if (err == -EFSCORRUPTED) { | 
|---|
| 179 | EXT4_ERROR_INODE_ERR(inode, -err, | 
|---|
| 180 | "blocks %llu-%llu from inode overlap system zone", | 
|---|
| 181 | map.m_pblk, | 
|---|
| 182 | map.m_pblk + map.m_len - 1); | 
|---|
| 183 | } | 
|---|
| 184 | break; | 
|---|
| 185 | } | 
|---|
| 186 | i += n; | 
|---|
| 187 | } | 
|---|
| 188 | } | 
|---|
| 189 | iput(inode); | 
|---|
| 190 | return err; | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | static void ext4_destroy_system_zone(struct rcu_head *rcu) | 
|---|
| 194 | { | 
|---|
| 195 | struct ext4_system_blocks *system_blks; | 
|---|
| 196 |  | 
|---|
| 197 | system_blks = container_of(rcu, struct ext4_system_blocks, rcu); | 
|---|
| 198 | release_system_zone(system_blks); | 
|---|
| 199 | kfree(objp: system_blks); | 
|---|
| 200 | } | 
|---|
| 201 |  | 
|---|
| 202 | /* | 
|---|
| 203 | * Build system zone rbtree which is used for block validity checking. | 
|---|
| 204 | * | 
|---|
| 205 | * The update of system_blks pointer in this function is protected by | 
|---|
| 206 | * sb->s_umount semaphore. However we have to be careful as we can be | 
|---|
| 207 | * racing with ext4_inode_block_valid() calls reading system_blks rbtree | 
|---|
| 208 | * protected only by RCU. That's why we first build the rbtree and then | 
|---|
| 209 | * swap it in place. | 
|---|
| 210 | */ | 
|---|
| 211 | int ext4_setup_system_zone(struct super_block *sb) | 
|---|
| 212 | { | 
|---|
| 213 | ext4_group_t ngroups = ext4_get_groups_count(sb); | 
|---|
| 214 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 
|---|
| 215 | struct ext4_system_blocks *system_blks; | 
|---|
| 216 | struct ext4_group_desc *gdp; | 
|---|
| 217 | ext4_group_t i; | 
|---|
| 218 | int ret; | 
|---|
| 219 |  | 
|---|
| 220 | system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); | 
|---|
| 221 | if (!system_blks) | 
|---|
| 222 | return -ENOMEM; | 
|---|
| 223 |  | 
|---|
| 224 | for (i=0; i < ngroups; i++) { | 
|---|
| 225 | unsigned int meta_blks = ext4_num_base_meta_blocks(sb, block_group: i); | 
|---|
| 226 |  | 
|---|
| 227 | cond_resched(); | 
|---|
| 228 | if (meta_blks != 0) { | 
|---|
| 229 | ret = add_system_zone(system_blks, | 
|---|
| 230 | start_blk: ext4_group_first_block_no(sb, group_no: i), | 
|---|
| 231 | count: meta_blks, ino: 0); | 
|---|
| 232 | if (ret) | 
|---|
| 233 | goto err; | 
|---|
| 234 | } | 
|---|
| 235 | gdp = ext4_get_group_desc(sb, block_group: i, NULL); | 
|---|
| 236 | ret = add_system_zone(system_blks, | 
|---|
| 237 | start_blk: ext4_block_bitmap(sb, bg: gdp), count: 1, ino: 0); | 
|---|
| 238 | if (ret) | 
|---|
| 239 | goto err; | 
|---|
| 240 | ret = add_system_zone(system_blks, | 
|---|
| 241 | start_blk: ext4_inode_bitmap(sb, bg: gdp), count: 1, ino: 0); | 
|---|
| 242 | if (ret) | 
|---|
| 243 | goto err; | 
|---|
| 244 | ret = add_system_zone(system_blks, | 
|---|
| 245 | start_blk: ext4_inode_table(sb, bg: gdp), | 
|---|
| 246 | count: sbi->s_itb_per_group, ino: 0); | 
|---|
| 247 | if (ret) | 
|---|
| 248 | goto err; | 
|---|
| 249 | } | 
|---|
| 250 | if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { | 
|---|
| 251 | ret = ext4_protect_reserved_inode(sb, system_blks, | 
|---|
| 252 | le32_to_cpu(sbi->s_es->s_journal_inum)); | 
|---|
| 253 | if (ret) | 
|---|
| 254 | goto err; | 
|---|
| 255 | } | 
|---|
| 256 |  | 
|---|
| 257 | /* | 
|---|
| 258 | * System blks rbtree complete, announce it once to prevent racing | 
|---|
| 259 | * with ext4_inode_block_valid() accessing the rbtree at the same | 
|---|
| 260 | * time. | 
|---|
| 261 | */ | 
|---|
| 262 | rcu_assign_pointer(sbi->s_system_blks, system_blks); | 
|---|
| 263 |  | 
|---|
| 264 | if (test_opt(sb, DEBUG)) | 
|---|
| 265 | debug_print_tree(sbi); | 
|---|
| 266 | return 0; | 
|---|
| 267 | err: | 
|---|
| 268 | release_system_zone(system_blks); | 
|---|
| 269 | kfree(objp: system_blks); | 
|---|
| 270 | return ret; | 
|---|
| 271 | } | 
|---|
| 272 |  | 
|---|
| 273 | /* | 
|---|
| 274 | * Called when the filesystem is unmounted or when remounting it with | 
|---|
| 275 | * noblock_validity specified. | 
|---|
| 276 | * | 
|---|
| 277 | * The update of system_blks pointer in this function is protected by | 
|---|
| 278 | * sb->s_umount semaphore. However we have to be careful as we can be | 
|---|
| 279 | * racing with ext4_inode_block_valid() calls reading system_blks rbtree | 
|---|
| 280 | * protected only by RCU. So we first clear the system_blks pointer and | 
|---|
| 281 | * then free the rbtree only after RCU grace period expires. | 
|---|
| 282 | */ | 
|---|
| 283 | void ext4_release_system_zone(struct super_block *sb) | 
|---|
| 284 | { | 
|---|
| 285 | struct ext4_system_blocks *system_blks; | 
|---|
| 286 |  | 
|---|
| 287 | system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks, | 
|---|
| 288 | lockdep_is_held(&sb->s_umount)); | 
|---|
| 289 | rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL); | 
|---|
| 290 |  | 
|---|
| 291 | if (system_blks) | 
|---|
| 292 | call_rcu(head: &system_blks->rcu, func: ext4_destroy_system_zone); | 
|---|
| 293 | } | 
|---|
| 294 |  | 
|---|
| 295 | int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, | 
|---|
| 296 | ext4_fsblk_t start_blk, unsigned int count) | 
|---|
| 297 | { | 
|---|
| 298 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 
|---|
| 299 | struct ext4_system_blocks *system_blks; | 
|---|
| 300 | struct ext4_system_zone *entry; | 
|---|
| 301 | struct rb_node *n; | 
|---|
| 302 | int ret = 1; | 
|---|
| 303 |  | 
|---|
| 304 | if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || | 
|---|
| 305 | (start_blk + count < start_blk) || | 
|---|
| 306 | (start_blk + count > ext4_blocks_count(es: sbi->s_es))) | 
|---|
| 307 | return 0; | 
|---|
| 308 |  | 
|---|
| 309 | /* | 
|---|
| 310 | * Lock the system zone to prevent it being released concurrently | 
|---|
| 311 | * when doing a remount which inverse current "[no]block_validity" | 
|---|
| 312 | * mount option. | 
|---|
| 313 | */ | 
|---|
| 314 | rcu_read_lock(); | 
|---|
| 315 | system_blks = rcu_dereference(sbi->s_system_blks); | 
|---|
| 316 | if (system_blks == NULL) | 
|---|
| 317 | goto out_rcu; | 
|---|
| 318 |  | 
|---|
| 319 | n = system_blks->root.rb_node; | 
|---|
| 320 | while (n) { | 
|---|
| 321 | entry = rb_entry(n, struct ext4_system_zone, node); | 
|---|
| 322 | if (start_blk + count - 1 < entry->start_blk) | 
|---|
| 323 | n = n->rb_left; | 
|---|
| 324 | else if (start_blk >= (entry->start_blk + entry->count)) | 
|---|
| 325 | n = n->rb_right; | 
|---|
| 326 | else { | 
|---|
| 327 | ret = 0; | 
|---|
| 328 | if (inode) | 
|---|
| 329 | ret = (entry->ino == inode->i_ino); | 
|---|
| 330 | break; | 
|---|
| 331 | } | 
|---|
| 332 | } | 
|---|
| 333 | out_rcu: | 
|---|
| 334 | rcu_read_unlock(); | 
|---|
| 335 | return ret; | 
|---|
| 336 | } | 
|---|
| 337 |  | 
|---|
| 338 | /* | 
|---|
| 339 | * Returns 1 if the passed-in block region (start_blk, | 
|---|
| 340 | * start_blk+count) is valid; 0 if some part of the block region | 
|---|
| 341 | * overlaps with some other filesystem metadata blocks. | 
|---|
| 342 | */ | 
|---|
| 343 | int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, | 
|---|
| 344 | unsigned int count) | 
|---|
| 345 | { | 
|---|
| 346 | return ext4_sb_block_valid(sb: inode->i_sb, inode, start_blk, count); | 
|---|
| 347 | } | 
|---|
| 348 |  | 
|---|
| 349 | int ext4_check_blockref(const char *function, unsigned int line, | 
|---|
| 350 | struct inode *inode, __le32 *p, unsigned int max) | 
|---|
| 351 | { | 
|---|
| 352 | __le32 *bref = p; | 
|---|
| 353 | unsigned int blk; | 
|---|
| 354 | journal_t *journal = EXT4_SB(sb: inode->i_sb)->s_journal; | 
|---|
| 355 |  | 
|---|
| 356 | if (journal && inode == journal->j_inode) | 
|---|
| 357 | return 0; | 
|---|
| 358 |  | 
|---|
| 359 | while (bref < p+max) { | 
|---|
| 360 | blk = le32_to_cpu(*bref++); | 
|---|
| 361 | if (blk && | 
|---|
| 362 | unlikely(!ext4_inode_block_valid(inode, blk, 1))) { | 
|---|
| 363 | ext4_error_inode(inode, function, line, blk, | 
|---|
| 364 | "invalid block"); | 
|---|
| 365 | return -EFSCORRUPTED; | 
|---|
| 366 | } | 
|---|
| 367 | } | 
|---|
| 368 | return 0; | 
|---|
| 369 | } | 
|---|
| 370 |  | 
|---|
| 371 |  | 
|---|