1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 md.h : kernel internal structure of the Linux MD driver
4 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
5
6*/
7
8#ifndef _MD_MD_H
9#define _MD_MD_H
10
11#include <linux/blkdev.h>
12#include <linux/backing-dev.h>
13#include <linux/badblocks.h>
14#include <linux/kobject.h>
15#include <linux/list.h>
16#include <linux/mm.h>
17#include <linux/mutex.h>
18#include <linux/timer.h>
19#include <linux/wait.h>
20#include <linux/workqueue.h>
21#include <linux/raid/md_u.h>
22#include <trace/events/block.h>
23
24#define MaxSector (~(sector_t)0)
25
26enum md_submodule_type {
27 MD_PERSONALITY = 0,
28 MD_CLUSTER,
29 MD_BITMAP,
30};
31
32enum md_submodule_id {
33 ID_LINEAR = LEVEL_LINEAR,
34 ID_RAID0 = 0,
35 ID_RAID1 = 1,
36 ID_RAID4 = 4,
37 ID_RAID5 = 5,
38 ID_RAID6 = 6,
39 ID_RAID10 = 10,
40 ID_CLUSTER,
41 ID_BITMAP,
42 ID_LLBITMAP,
43 ID_BITMAP_NONE,
44};
45
46struct md_submodule_head {
47 enum md_submodule_type type;
48 enum md_submodule_id id;
49 const char *name;
50 struct module *owner;
51};
52
53/*
54 * These flags should really be called "NO_RETRY" rather than
55 * "FAILFAST" because they don't make any promise about time lapse,
56 * only about the number of retries, which will be zero.
57 * REQ_FAILFAST_DRIVER is not included because
58 * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
59 * seems to suggest that the errors it avoids retrying should usually
60 * be retried.
61 */
62#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
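
/*
 * Illustrative sketch (not an md helper): a personality that honours the
 * FailFast flag would typically OR these flags into a bio just before
 * submitting it, e.g.:
 *
 *	if (test_bit(FailFast, &rdev->flags))
 *		bio->bi_opf |= MD_FAILFAST;
 */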
63
64/* Status of sync thread. */
65enum sync_action {
66 /*
 * Represented by MD_RECOVERY_SYNC, started when:
 * 1) after assembly, sync data from the first rdev to the other copies;
 * this must be done before any other sync action and will only execute
 * once;
 * 2) the array is resized (note that this is not a reshape); sync data
 * for the new range.
73 */
74 ACTION_RESYNC,
75 /*
 * Represented by MD_RECOVERY_RECOVER, started when:
 * 1) for a new replacement, sync data based on the rdev being replaced
 * or on available copies from other rdevs;
 * 2) for a new member disk while the array is degraded, sync data from
 * the other rdevs;
 * 3) on reassembly after a power failure, or when re-adding a
 * hot-removed rdev, sync data from the first rdev to the other copies
 * based on the bitmap.
83 */
84 ACTION_RECOVER,
85 /*
 * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED |
 * MD_RECOVERY_CHECK, started when the user echoes "check" to the sysfs
 * api sync_action; used to check whether data copies from different
 * rdevs are the same. The number of mismatched sectors is exported to
 * the user via the sysfs api mismatch_cnt.
91 */
92 ACTION_CHECK,
93 /*
 * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, started when
 * the user echoes "repair" to the sysfs api sync_action; usually paired
 * with ACTION_CHECK, and used to force a data sync once the user has
 * found inconsistent data.
98 */
99 ACTION_REPAIR,
100 /*
 * Represented by MD_RECOVERY_RESHAPE, started when a new member disk is
 * added to the conf; note that this is different from spares or
 * replacements.
104 */
105 ACTION_RESHAPE,
106 /*
 * Represented by MD_RECOVERY_FROZEN; can be set via the sysfs api
 * sync_action or internally (e.g. when setting the array read-only),
 * and forbids the above actions.
110 */
111 ACTION_FROZEN,
112 /*
 * None of the above actions is active.
114 */
115 ACTION_IDLE,
116 NR_SYNC_ACTIONS,
117};
118
119/*
120 * The struct embedded in rdev is used to serialize IO.
121 */
122struct serial_in_rdev {
123 struct rb_root_cached serial_rb;
124 spinlock_t serial_lock;
125 wait_queue_head_t serial_io_wait;
126};
127
128/*
129 * MD's 'extended' device
130 */
131struct md_rdev {
132 struct list_head same_set; /* RAID devices within the same set */
133
	sector_t sectors;		/* Device size (in 512-byte sectors) */
135 struct mddev *mddev; /* RAID array if running */
136 unsigned long last_events; /* IO event timestamp */
137
138 /*
139 * If meta_bdev is non-NULL, it means that a separate device is
140 * being used to store the metadata (superblock/bitmap) which
141 * would otherwise be contained on the same device as the data (bdev).
142 */
143 struct block_device *meta_bdev;
144 struct block_device *bdev; /* block device handle */
145 struct file *bdev_file; /* Handle from open for bdev */
146
147 struct page *sb_page, *bb_page;
148 int sb_loaded;
149 __u64 sb_events;
150 sector_t data_offset; /* start of data in array */
151 sector_t new_data_offset;/* only relevant while reshaping */
	sector_t sb_start;	/* offset of the superblock (in 512-byte sectors) */
153 int sb_size; /* bytes in the superblock */
154 int preferred_minor; /* autorun support */
155
156 struct kobject kobj;
157
158 /* A device can be in one of three states based on two flags:
159 * Not working: faulty==1 in_sync==0
160 * Fully working: faulty==0 in_sync==1
161 * Working, but not
162 * in sync with array
163 * faulty==0 in_sync==0
164 *
165 * It can never have faulty==1, in_sync==1
166 * This reduces the burden of testing multiple flags in many cases
167 */
168
169 unsigned long flags; /* bit set of 'enum flag_bits' bits. */
170 wait_queue_head_t blocked_wait;
171
172 int desc_nr; /* descriptor index in the superblock */
173 int raid_disk; /* role of device in array */
174 int new_raid_disk; /* role that the device will have in
175 * the array after a level-change completes.
176 */
177 int saved_raid_disk; /* role that device used to have in the
178 * array and could again if we did a partial
179 * resync from the bitmap
180 */
181 union {
182 sector_t recovery_offset;/* If this device has been partially
183 * recovered, this is where we were
184 * up to.
185 */
186 sector_t journal_tail; /* If this device is a journal device,
187 * this is the journal tail (journal
188 * recovery start point)
189 */
190 };
191
192 atomic_t nr_pending; /* number of pending requests.
193 * only maintained for arrays that
194 * support hot removal
195 */
196 atomic_t read_errors; /* number of consecutive read errors that
197 * we have tried to ignore.
198 */
199 time64_t last_read_error; /* monotonic time since our
200 * last read error
201 */
202 atomic_t corrected_errors; /* number of corrected read errors,
203 * for reporting to userspace and storing
204 * in superblock.
205 */
206
207 struct serial_in_rdev *serial; /* used for raid1 io serialization */
208
209 struct kernfs_node *sysfs_state; /* handle for 'state'
210 * sysfs entry */
211 /* handle for 'unacknowledged_bad_blocks' sysfs dentry */
212 struct kernfs_node *sysfs_unack_badblocks;
213 /* handle for 'bad_blocks' sysfs dentry */
214 struct kernfs_node *sysfs_badblocks;
215 struct badblocks badblocks;
216
217 struct {
218 short offset; /* Offset from superblock to start of PPL.
219 * Not used by external metadata. */
220 unsigned int size; /* Size in sectors of the PPL space */
221 sector_t sector; /* First sector of the PPL space */
222 } ppl;
223};
224enum flag_bits {
225 Faulty, /* device is known to have a fault */
226 In_sync, /* device is in_sync with rest of array */
227 Bitmap_sync, /* ..actually, not quite In_sync. Need a
228 * bitmap-based recovery to get fully in sync.
229 * The bit is only meaningful before device
230 * has been passed to pers->hot_add_disk.
231 */
232 WriteMostly, /* Avoid reading if at all possible */
233 AutoDetected, /* added by auto-detect */
234 Blocked, /* An error occurred but has not yet
235 * been acknowledged by the metadata
236 * handler, so don't allow writes
237 * until it is cleared */
238 WriteErrorSeen, /* A write error has been seen on this
239 * device
240 */
241 FaultRecorded, /* Intermediate state for clearing
242 * Blocked. The Fault is/will-be
243 * recorded in the metadata, but that
244 * metadata hasn't been stored safely
245 * on disk yet.
246 */
247 BlockedBadBlocks, /* A writer is blocked because they
248 * found an unacknowledged bad-block.
249 * This can safely be cleared at any
250 * time, and the writer will re-check.
251 * It may be set at any time, and at
252 * worst the writer will timeout and
253 * re-check. So setting it as
254 * accurately as possible is good, but
255 * not absolutely critical.
256 */
257 WantReplacement, /* This device is a candidate to be
258 * hot-replaced, either because it has
259 * reported some faults, or because
260 * of explicit request.
261 */
262 Replacement, /* This device is a replacement for
263 * a want_replacement device with same
264 * raid_disk number.
265 */
266 Candidate, /* For clustered environments only:
267 * This device is seen locally but not
268 * by the whole cluster
269 */
270 Journal, /* This device is used as journal for
271 * raid-5/6.
272 * Usually, this device should be faster
273 * than other devices in the array
274 */
275 ClusterRemove,
276 ExternalBbl, /* External metadata provides bad
277 * block management for a disk
278 */
279 FailFast, /* Minimal retries should be attempted on
280 * this device, so use REQ_FAILFAST_DEV.
281 * Also don't try to repair failed reads.
 * It is expected that no bad block log
 * is present.
284 */
285 LastDev, /* Seems to be the last working dev as
286 * it didn't fail, so don't use FailFast
287 * any more for metadata
288 */
289 CollisionCheck, /*
 * check if there is a collision between raid1
291 * serial bios.
292 */
293 Nonrot, /* non-rotational device (SSD) */
294};
295
296static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors,
297 sector_t *first_bad, sector_t *bad_sectors)
298{
299 if (unlikely(rdev->badblocks.count)) {
		int rv = badblocks_check(&rdev->badblocks,
					 rdev->data_offset + s, sectors,
					 first_bad, bad_sectors);
303 if (rv)
304 *first_bad -= rdev->data_offset;
305 return rv;
306 }
307 return 0;
308}
309
310static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
311 int sectors)
312{
313 sector_t first_bad;
314 sector_t bad_sectors;
315
	return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
317}
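
/*
 * Illustrative use (an assumption, modelled on the read-balance paths): skip
 * a device whose requested range overlaps a recorded bad block, e.g.:
 *
 *	if (rdev_has_badblock(rdev, this_sector, sectors))
 *		continue;	// try another copy of the data
 */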
318
319extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
320 int is_new);
321extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
322 int is_new);
323struct md_cluster_info;
324struct md_cluster_operations;
325
326/**
327 * enum mddev_flags - md device flags.
328 * @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
329 * @MD_CLOSING: If set, we are closing the array, do not open it then.
330 * @MD_JOURNAL_CLEAN: A raid with journal is already clean.
331 * @MD_HAS_JOURNAL: The raid array has journal feature set.
 * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only; this node has already taken
 *                            the resync lock and needs to release it.
334 * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as
335 * calls to md_error() will never cause the array to
336 * become failed.
337 * @MD_HAS_PPL: The raid array has PPL feature set.
338 * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
 * @MD_NOT_READY: do_md_run() is active, so 'array_state' must not report that
 *                the array is ready yet.
341 * @MD_BROKEN: This is used to stop writes and mark array as failed.
342 * @MD_DELETED: This device is being deleted
343 *
 * change UNSUPPORTED_MDDEV_FLAGS for each array type if a new flag is added
345 */
346enum mddev_flags {
347 MD_ARRAY_FIRST_USE,
348 MD_CLOSING,
349 MD_JOURNAL_CLEAN,
350 MD_HAS_JOURNAL,
351 MD_CLUSTER_RESYNC_LOCKED,
352 MD_FAILFAST_SUPPORTED,
353 MD_HAS_PPL,
354 MD_HAS_MULTIPLE_PPLS,
355 MD_NOT_READY,
356 MD_BROKEN,
357 MD_DELETED,
358};
359
360enum mddev_sb_flags {
361 MD_SB_CHANGE_DEVS, /* Some device status has changed */
362 MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */
363 MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */
364 MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
365};
366
367#define NR_SERIAL_INFOS 8
368/* record current range of serialize IOs */
369struct serial_info {
370 struct rb_node node;
371 sector_t start; /* start sector of rb node */
372 sector_t last; /* end sector of rb node */
373 sector_t _subtree_last; /* highest sector in subtree of rb node */
374};
375
376/*
377 * mddev->curr_resync stores the current sector of the resync but
378 * also has some overloaded values.
379 */
380enum {
381 /* No resync in progress */
382 MD_RESYNC_NONE = 0,
383 /* Yielded to allow another conflicting resync to commence */
384 MD_RESYNC_YIELDED = 1,
385 /* Delayed to check that there is no conflict with another sync */
386 MD_RESYNC_DELAYED = 2,
387 /* Any value greater than or equal to this is in an active resync */
388 MD_RESYNC_ACTIVE = 3,
389};
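
/*
 * Example (illustrative): callers compare curr_resync against these markers
 * rather than treating small values as real sector numbers, e.g.:
 *
 *	if (mddev->curr_resync >= MD_RESYNC_ACTIVE)
 *		pr_debug("resync is past sector %llu\n",
 *			 (unsigned long long)mddev->curr_resync);
 */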
390
391struct mddev {
392 void *private;
393 struct md_personality *pers;
394 dev_t unit;
395 int md_minor;
396 struct list_head disks;
397 unsigned long flags;
398 unsigned long sb_flags;
399
400 int suspended;
401 struct mutex suspend_mutex;
402 struct percpu_ref active_io;
403 int ro;
404 int sysfs_active; /* set when sysfs deletes
405 * are happening, so run/
406 * takeover/stop are not safe
407 */
408 struct gendisk *gendisk; /* mdraid gendisk */
409 struct gendisk *dm_gendisk; /* dm-raid gendisk */
410
411 struct kobject kobj;
412 int hold_active;
413#define UNTIL_IOCTL 1
414#define UNTIL_STOP 2
415
416 /* Superblock information */
417 int major_version,
418 minor_version,
419 patch_version;
420 int persistent;
421 int external; /* metadata is
422 * managed externally */
423 char metadata_type[17]; /* externally set*/
424 int chunk_sectors;
425 time64_t ctime, utime;
426 int level, layout;
427 char clevel[16];
428 int raid_disks;
429 int max_disks;
430 sector_t dev_sectors; /* used size of
431 * component devices */
432 sector_t array_sectors; /* exported array size */
433 int external_size; /* size managed
434 * externally */
435 __u64 events;
436 /* If the last 'event' was simply a clean->dirty transition, and
437 * we didn't write it to the spares, then it is safe and simple
438 * to just decrement the event count on a dirty->clean transition.
439 * So we record that possibility here.
440 */
441 int can_decrease_events;
442
443 char uuid[16];
444
445 /* If the array is being reshaped, we need to record the
446 * new shape and an indication of where we are up to.
447 * This is written to the superblock.
448 * If reshape_position is MaxSector, then no reshape is happening (yet).
449 */
450 sector_t reshape_position;
451 int delta_disks, new_level, new_layout;
452 int new_chunk_sectors;
453 int reshape_backwards;
454
455 struct md_thread __rcu *thread; /* management thread */
456 struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */
457
458 /*
459 * Set when a sync operation is started. It holds this value even
460 * when the sync thread is "frozen" (interrupted) or "idle" (stopped
461 * or finished). It is overwritten when a new sync operation is begun.
462 */
463 enum sync_action last_sync_action;
464 sector_t curr_resync; /* last block scheduled */
465 /* As resync requests can complete out of order, we cannot easily track
466 * how much resync has been completed. So we occasionally pause until
467 * everything completes, then set curr_resync_completed to curr_resync.
468 * As such it may be well behind the real resync mark, but it is a value
469 * we are certain of.
470 */
471 sector_t curr_resync_completed;
472 unsigned long resync_mark; /* a recent timestamp */
473 sector_t resync_mark_cnt;/* blocks written at resync_mark */
474 sector_t curr_mark_cnt; /* blocks scheduled now */
475
476 sector_t resync_max_sectors; /* may be set by personality */
477
478 atomic64_t resync_mismatches; /* count of sectors where
479 * parity/replica mismatch found
480 */
481
482 /* allow user-space to request suspension of IO to regions of the array */
483 sector_t suspend_lo;
484 sector_t suspend_hi;
485 /* if zero, use the system-wide default */
486 int sync_speed_min;
487 int sync_speed_max;
488 int sync_io_depth;
489
490 /* resync even though the same disks are shared among md-devices */
491 int parallel_resync;
492
493 int ok_start_degraded;
494
495 unsigned long recovery;
496 /* If a RAID personality determines that recovery (of a particular
497 * device) will fail due to a read error on the source device, it
498 * takes a copy of this number and does not attempt recovery again
499 * until this number changes.
500 */
501 int recovery_disabled;
502
	int				in_sync;	/* known to not need resync */
504 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
505 * that we are never stopping an array while it is open.
506 * 'reconfig_mutex' protects all other reconfiguration.
507 * These locks are separate due to conflicting interactions
508 * with disk->open_mutex.
509 * Lock ordering is:
510 * reconfig_mutex -> disk->open_mutex
511 * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open
512 */
513 struct mutex open_mutex;
514 struct mutex reconfig_mutex;
515 atomic_t active; /* general refcount */
516 atomic_t openers; /* number of active opens */
517
518 int changed; /* True if we might need to
519 * reread partition info */
520 int degraded; /* whether md should consider
521 * adding a spare
522 */
523
524 unsigned long normal_io_events; /* IO event timestamp */
525 atomic_t recovery_active; /* blocks scheduled, but not written */
526 wait_queue_head_t recovery_wait;
527 sector_t resync_offset;
528 sector_t resync_min; /* user requested sync
529 * starts here */
530 sector_t resync_max; /* resync should pause
531 * when it gets here */
532
533 struct kernfs_node *sysfs_state; /* handle for 'array_state'
534 * file in sysfs.
535 */
536 struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
537 struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */
538 struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
539 struct kernfs_node *sysfs_level; /*handle for 'level' */
540
541 /* used for delayed sysfs removal */
542 struct work_struct del_work;
543 /* used for register new sync thread */
544 struct work_struct sync_work;
545
546 /* "lock" protects:
547 * flush_bio transition from NULL to !NULL
548 * rdev superblocks, events
549 * clearing MD_CHANGE_*
550 * in_sync - and related safemode and MD_CHANGE changes
551 * pers (also protected by reconfig_mutex and pending IO).
552 * clearing ->bitmap
553 * clearing ->bitmap_info.file
554 * changing ->resync_{min,max}
555 * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
556 */
557 spinlock_t lock;
558 wait_queue_head_t sb_wait; /* for waiting on superblock updates */
559 atomic_t pending_writes; /* number of active superblock writes */
560
561 unsigned int safemode; /* if set, update "clean" superblock
562 * when no writes pending.
563 */
564 unsigned int safemode_delay;
565 struct timer_list safemode_timer;
566 struct percpu_ref writes_pending;
567 int sync_checkers; /* # of threads checking writes_pending */
568
569 enum md_submodule_id bitmap_id;
570 void *bitmap; /* the bitmap for the device */
571 struct bitmap_operations *bitmap_ops;
572 struct {
573 struct file *file; /* the bitmap file */
574 loff_t offset; /* offset from superblock of
575 * start of bitmap. May be
576 * negative, but not '0'
577 * For external metadata, offset
578 * from start of device.
579 */
580 unsigned long space; /* space available at this offset */
581 loff_t default_offset; /* this is the offset to use when
582 * hot-adding a bitmap. It should
583 * eventually be settable by sysfs.
584 */
585 unsigned long default_space; /* space available at
586 * default offset */
587 struct mutex mutex;
588 unsigned long chunksize;
589 unsigned long daemon_sleep; /* how many jiffies between updates? */
590 unsigned long max_write_behind; /* write-behind mode */
591 int external;
592 int nodes; /* Maximum number of nodes in the cluster */
593 char cluster_name[64]; /* Name of the cluster */
594 } bitmap_info;
595
596 atomic_t max_corr_read_errors; /* max read retries */
597 struct list_head all_mddevs;
598
599 const struct attribute_group *to_remove;
600
601 struct bio_set bio_set;
602 struct bio_set sync_set; /* for sync operations like
603 * metadata and bitmap writes
604 */
605 struct bio_set io_clone_set;
606
607 struct work_struct event_work; /* used by dm to report failure event */
608 mempool_t *serial_info_pool;
609 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
610 struct md_cluster_info *cluster_info;
611 struct md_cluster_operations *cluster_ops;
612 unsigned int good_device_nr; /* good device num within cluster raid */
613 unsigned int noio_flag; /* for memalloc scope API */
614
615 /*
 * Temporarily store rdevs that will finally be removed when
 * reconfig_mutex is unlocked; protected by reconfig_mutex.
618 */
619 struct list_head deleting;
620
621 /* The sequence number for sync thread */
622 atomic_t sync_seq;
623
624 bool has_superblocks:1;
625 bool fail_last_dev:1;
626 bool serialize_policy:1;
627};
628
629enum recovery_flags {
630 /* flags for sync thread running status */
631
632 /*
 * set when one of the sync actions is set and a new sync thread needs
 * to be registered, or when spares are just added/removed from the conf.
635 */
636 MD_RECOVERY_NEEDED,
637 /* sync thread is running, or about to be started */
638 MD_RECOVERY_RUNNING,
639 /* sync thread needs to be aborted for some reason */
640 MD_RECOVERY_INTR,
641 /* sync thread is done and is waiting to be unregistered */
642 MD_RECOVERY_DONE,
643 /* running sync thread must abort immediately, and not restart */
644 MD_RECOVERY_FROZEN,
645 /* waiting for pers->start() to finish */
646 MD_RECOVERY_WAIT,
	/* interrupted because of an IO error */
648 MD_RECOVERY_ERROR,
649
	/* flags that determine the sync action, see details in enum sync_action */
651
652 /* if just this flag is set, action is resync. */
653 MD_RECOVERY_SYNC,
654 /*
 * paired with MD_RECOVERY_SYNC; if MD_RECOVERY_CHECK is not set, the
 * action is repair, meaning the user requested a resync.
657 */
658 MD_RECOVERY_REQUESTED,
659 /*
660 * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is
661 * check.
662 */
663 MD_RECOVERY_CHECK,
664 /* recovery, or need to try it */
665 MD_RECOVERY_RECOVER,
666 /* reshape */
667 MD_RECOVERY_RESHAPE,
668 /* remote node is running resync thread */
669 MD_RESYNCING_REMOTE,
670 /* raid456 lazy initial recover */
671 MD_RECOVERY_LAZY_RECOVER,
672};
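
/*
 * Typical trigger sketch: most callers don't pick a sync action directly,
 * they flag that one may be needed and wake the management thread, which
 * works out the exact action from the flags above:
 *
 *	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 *	md_wakeup_thread(mddev->thread);
 */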
673
674enum md_ro_state {
675 MD_RDWR,
676 MD_RDONLY,
677 MD_AUTO_READ,
678 MD_MAX_STATE
679};
680
681static inline bool md_is_rdwr(struct mddev *mddev)
682{
683 return (mddev->ro == MD_RDWR);
684}
685
686static inline bool reshape_interrupted(struct mddev *mddev)
687{
	/* reshape never started */
689 if (mddev->reshape_position == MaxSector)
690 return false;
691
692 /* interrupted */
693 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
694 return true;
695
696 /* running reshape will be interrupted soon. */
697 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
698 test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
699 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
700 return true;
701
702 return false;
703}
704
705static inline int __must_check mddev_lock(struct mddev *mddev)
706{
707 int ret;
708
	ret = mutex_lock_interruptible(&mddev->reconfig_mutex);
710
711 /* MD_DELETED is set in do_md_stop with reconfig_mutex.
712 * So check it here.
713 */
714 if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
715 ret = -ENODEV;
		mutex_unlock(&mddev->reconfig_mutex);
717 }
718
719 return ret;
720}
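
/*
 * Usage sketch (as in the sysfs store handlers): take the lock, bail out on
 * a signal or a deleted array, and always pair with mddev_unlock(), e.g.:
 *
 *	err = mddev_lock(mddev);
 *	if (err)
 *		return err;
 *	// ... reconfigure the array ...
 *	mddev_unlock(mddev);
 */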
721
722/* Sometimes we need to take the lock in a situation where
723 * failure due to interrupts is not acceptable.
 * It doesn't need to check MD_DELETED here: the owner that holds the
 * lock here can't be stopped, and no path may call this function
 * after do_md_stop.
727 */
728static inline void mddev_lock_nointr(struct mddev *mddev)
729{
	mutex_lock(&mddev->reconfig_mutex);
731}
732
733static inline int mddev_trylock(struct mddev *mddev)
734{
735 int ret;
736
	ret = mutex_trylock(&mddev->reconfig_mutex);
738 if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
739 ret = -ENODEV;
		mutex_unlock(&mddev->reconfig_mutex);
741 }
742 return ret;
743}
744extern void mddev_unlock(struct mddev *mddev);
745
746struct md_personality
747{
748 struct md_submodule_head head;
749
750 bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
751 /*
 * start-up work that does NOT require md_thread; tasks that
 * require md_thread should go into start()
754 */
755 int (*run)(struct mddev *mddev);
	/* start-up work that requires md threads */
757 int (*start)(struct mddev *mddev);
758 void (*free)(struct mddev *mddev, void *priv);
759 void (*status)(struct seq_file *seq, struct mddev *mddev);
760 /* error_handler must set ->faulty and clear ->in_sync
761 * if appropriate, and should abort recovery if needed
762 */
763 void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
764 int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
765 int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
766 int (*spare_active) (struct mddev *mddev);
767 sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr,
768 sector_t max_sector, int *skipped);
769 int (*resize) (struct mddev *mddev, sector_t sectors);
770 sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
771 int (*check_reshape) (struct mddev *mddev);
772 int (*start_reshape) (struct mddev *mddev);
773 void (*finish_reshape) (struct mddev *mddev);
774 void (*update_reshape_pos) (struct mddev *mddev);
775 void (*prepare_suspend) (struct mddev *mddev);
776 /* quiesce suspends or resumes internal processing.
777 * 1 - stop new actions and wait for action io to complete
778 * 0 - return to normal behaviour
779 */
780 void (*quiesce) (struct mddev *mddev, int quiesce);
781 /* takeover is used to transition an array from one
782 * personality to another. The new personality must be able
783 * to handle the data in the current layout.
784 * e.g. 2drive raid1 -> 2drive raid5
785 * ndrive raid5 -> degraded n+1drive raid6 with special layout
786 * If the takeover succeeds, a new 'private' structure is returned.
787 * This needs to be installed and then ->run used to activate the
788 * array.
789 */
790 void *(*takeover) (struct mddev *mddev);
791 /* Changes the consistency policy of an active array. */
792 int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
793 /* convert io ranges from array to bitmap */
794 void (*bitmap_sector)(struct mddev *mddev, sector_t *offset,
795 unsigned long *sectors);
796};
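
/*
 * Registration sketch (illustrative; the "example_*" names are hypothetical):
 * a personality embeds a struct md_submodule_head and registers it at module
 * init time, e.g.:
 *
 *	static struct md_personality example_personality = {
 *		.head = {
 *			.type	= MD_PERSONALITY,
 *			.id	= ID_RAID0,
 *			.name	= "raid0",
 *			.owner	= THIS_MODULE,
 *		},
 *		.make_request	= example_make_request,
 *		.run		= example_run,
 *		.free		= example_free,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return register_md_submodule(&example_personality.head);
 *	}
 */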
797
798struct md_sysfs_entry {
799 struct attribute attr;
800 ssize_t (*show)(struct mddev *, char *);
801 ssize_t (*store)(struct mddev *, const char *, size_t);
802};
803
804static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
805{
806 if (sd)
		return sysfs_get_dirent(sd, name);
808 return sd;
809}
810static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
811{
812 if (sd)
		sysfs_notify_dirent(sd);
814}
815
816static inline char * mdname (struct mddev * mddev)
817{
818 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
819}
820
821static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
822{
823 char nm[20];
824 if (!test_bit(Replacement, &rdev->flags) &&
825 !test_bit(Journal, &rdev->flags) &&
826 mddev->kobj.sd) {
		sprintf(nm, "rd%d", rdev->raid_disk);
		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
829 } else
830 return 0;
831}
832
833static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
834{
835 char nm[20];
836 if (!test_bit(Replacement, &rdev->flags) &&
837 !test_bit(Journal, &rdev->flags) &&
838 mddev->kobj.sd) {
		sprintf(nm, "rd%d", rdev->raid_disk);
		sysfs_remove_link(&mddev->kobj, nm);
841 }
842}
843
844/*
845 * iterates through some rdev ringlist. It's safe to remove the
 * current 'rdev'. Don't touch 'tmp' though.
847 */
848#define rdev_for_each_list(rdev, tmp, head) \
849 list_for_each_entry_safe(rdev, tmp, head, same_set)
850
851/*
852 * iterates through the 'same array disks' ringlist
853 */
854#define rdev_for_each(rdev, mddev) \
855 list_for_each_entry(rdev, &((mddev)->disks), same_set)
856
857#define rdev_for_each_safe(rdev, tmp, mddev) \
858 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
859
860#define rdev_for_each_rcu(rdev, mddev) \
861 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
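
/*
 * Example (illustrative): walking the member devices of an array, normally
 * under reconfig_mutex (or under RCU for the _rcu variant):
 *
 *	struct md_rdev *rdev;
 *
 *	rdev_for_each(rdev, mddev) {
 *		if (test_bit(Faulty, &rdev->flags))
 *			continue;
 *		// ... use rdev ...
 *	}
 */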
862
863struct md_thread {
864 void (*run) (struct md_thread *thread);
865 struct mddev *mddev;
866 wait_queue_head_t wqueue;
867 unsigned long flags;
868 struct task_struct *tsk;
869 unsigned long timeout;
870 void *private;
871};
872
873struct md_io_clone {
874 struct mddev *mddev;
875 struct bio *orig_bio;
876 unsigned long start_time;
877 sector_t offset;
878 unsigned long sectors;
879 enum stat_group rw;
880 struct bio bio_clone;
881};
882
883#define THREAD_WAKEUP 0
884
885static inline void safe_put_page(struct page *p)
886{
	if (p) put_page(p);
888}
889
890int register_md_submodule(struct md_submodule_head *msh);
891void unregister_md_submodule(struct md_submodule_head *msh);
892
893extern struct md_thread *md_register_thread(
894 void (*run)(struct md_thread *thread),
895 struct mddev *mddev,
896 const char *name);
897extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp);
898extern void md_wakeup_thread(struct md_thread __rcu *thread);
899extern void md_check_recovery(struct mddev *mddev);
900extern void md_reap_sync_thread(struct mddev *mddev);
901extern enum sync_action md_sync_action(struct mddev *mddev);
902extern enum sync_action md_sync_action_by_name(const char *page);
903extern const char *md_sync_action_name(enum sync_action action);
904extern void md_write_start(struct mddev *mddev, struct bio *bi);
905extern void md_write_inc(struct mddev *mddev, struct bio *bi);
906extern void md_write_end(struct mddev *mddev);
907extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
908extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
909extern void md_finish_reshape(struct mddev *mddev);
910void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
911 struct bio *bio, sector_t start, sector_t size);
912void md_account_bio(struct mddev *mddev, struct bio **bio);
913void md_free_cloned_bio(struct bio *bio);
914
915extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
916void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
917 sector_t sector, int size, struct page *page,
918 unsigned int offset);
919extern int md_super_wait(struct mddev *mddev);
920extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
921 struct page *page, blk_opf_t opf, bool metadata_op);
922extern void md_do_sync(struct md_thread *thread);
923extern void md_new_event(void);
924extern void md_allow_write(struct mddev *mddev);
925extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
926extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
927extern int md_check_no_bitmap(struct mddev *mddev);
928extern int md_integrity_register(struct mddev *mddev);
929extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
930
931extern int mddev_init(struct mddev *mddev);
932extern void mddev_destroy(struct mddev *mddev);
933void md_init_stacking_limits(struct queue_limits *lim);
934struct mddev *md_alloc(dev_t dev, char *name);
935void mddev_put(struct mddev *mddev);
936extern int md_run(struct mddev *mddev);
937extern int md_start(struct mddev *mddev);
938extern void md_stop(struct mddev *mddev);
939extern void md_stop_writes(struct mddev *mddev);
940extern int md_rdev_init(struct md_rdev *rdev);
941extern void md_rdev_clear(struct md_rdev *rdev);
942
943extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
944extern int mddev_suspend(struct mddev *mddev, bool interruptible);
945extern void mddev_resume(struct mddev *mddev);
946extern void md_idle_sync_thread(struct mddev *mddev);
947extern void md_frozen_sync_thread(struct mddev *mddev);
948extern void md_unfrozen_sync_thread(struct mddev *mddev);
949
950extern void md_update_sb(struct mddev *mddev, int force);
951extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
952extern void mddev_destroy_serial_pool(struct mddev *mddev,
953 struct md_rdev *rdev);
954struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
955struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
956
957static inline bool is_rdev_broken(struct md_rdev *rdev)
958{
	return !disk_live(rdev->bdev->bd_disk);
960}
961
962static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
963{
964 int faulty = test_bit(Faulty, &rdev->flags);
	if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
968 }
969}
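
/*
 * Illustrative pattern (an assumption, modelled on how the personalities use
 * nr_pending): take a reference before issuing IO to an rdev and drop it in
 * the completion path, e.g.:
 *
 *	atomic_inc(&rdev->nr_pending);
 *	// ... submit bio to rdev->bdev ...
 *	rdev_dec_pending(rdev, mddev);	// from the bio end_io handler
 */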
970
971static inline int mddev_is_clustered(struct mddev *mddev)
972{
973 return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
974}
975
976/* clear unsupported mddev_flags */
977static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
978 unsigned long unsupported_flags)
979{
980 mddev->flags &= ~unsupported_flags;
981}
982
983static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
984{
985 if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
986 !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
987 mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0;
988}
989
990static inline int mddev_suspend_and_lock(struct mddev *mddev)
991{
992 int ret;
993
	ret = mddev_suspend(mddev, true);
995 if (ret)
996 return ret;
997
998 ret = mddev_lock(mddev);
999 if (ret)
1000 mddev_resume(mddev);
1001
1002 return ret;
1003}
1004
1005static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
1006{
	mddev_suspend(mddev, false);
	mutex_lock(&mddev->reconfig_mutex);
1009}
1010
1011static inline void mddev_unlock_and_resume(struct mddev *mddev)
1012{
1013 mddev_unlock(mddev);
1014 mddev_resume(mddev);
1015}
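
/*
 * Usage sketch: the suspend/lock helpers above are meant to be used as a
 * pair, e.g.:
 *
 *	err = mddev_suspend_and_lock(mddev);
 *	if (err)
 *		return err;
 *	// ... change configuration that affects in-flight IO ...
 *	mddev_unlock_and_resume(mddev);
 */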
1016
1017struct mdu_array_info_s;
1018struct mdu_disk_info_s;
1019
1020extern int mdp_major;
1021void md_autostart_arrays(int part);
1022int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
1023int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
1024int do_md_run(struct mddev *mddev);
1025#define MDDEV_STACK_INTEGRITY (1u << 0)
1026int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
1027 unsigned int flags);
1028int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev);
1029void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes);
1030
1031extern const struct block_device_operations md_fops;
1032
1033/*
 * MD devices can be used underneath by DM, in which case ->gendisk is NULL.
1035 */
1036static inline bool mddev_is_dm(struct mddev *mddev)
1037{
1038 return !mddev->gendisk;
1039}
1040
1041static inline bool raid_is_456(struct mddev *mddev)
1042{
1043 return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 ||
1044 mddev->level == ID_RAID6;
1045}
1046
1047static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
1048 sector_t sector)
1049{
1050 if (!mddev_is_dm(mddev))
		trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
1052}
1053
1054static inline bool rdev_blocked(struct md_rdev *rdev)
1055{
1056 /*
 * Blocked is set by the error handler and cleared by the daemon after
 * updating the superblock; meanwhile write IO should be blocked to
 * prevent reading old data after a power failure.
1060 */
1061 if (test_bit(Blocked, &rdev->flags))
1062 return true;
1063
1064 /*
 * A faulty device should not be accessed any more; there is no need
 * to wait for bad blocks to be acknowledged.
1067 */
1068 if (test_bit(Faulty, &rdev->flags))
1069 return false;
1070
1071 /* rdev is blocked by badblocks. */
1072 if (test_bit(BlockedBadBlocks, &rdev->flags))
1073 return true;
1074
1075 return false;
1076}
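
/*
 * Illustrative use (an assumption): a writer that finds a blocked rdev
 * typically takes a reference and waits for the metadata update to finish
 * before retrying the IO, e.g.:
 *
 *	if (rdev_blocked(rdev)) {
 *		atomic_inc(&rdev->nr_pending);
 *		md_wait_for_blocked_rdev(rdev, mddev);	// drops the reference
 *	}
 */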
1077
1078#define mddev_add_trace_msg(mddev, fmt, args...) \
1079do { \
1080 if (!mddev_is_dm(mddev)) \
1081 blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \
1082} while (0)
1083
1084#endif /* _MD_MD_H */
1085