1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 md.h : kernel internal structure of the Linux MD driver
4 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
5
6*/
7
8#ifndef _MD_MD_H
9#define _MD_MD_H
10
11#include <linux/blkdev.h>
12#include <linux/backing-dev.h>
13#include <linux/badblocks.h>
14#include <linux/kobject.h>
15#include <linux/list.h>
16#include <linux/mm.h>
17#include <linux/mutex.h>
18#include <linux/timer.h>
19#include <linux/wait.h>
20#include <linux/workqueue.h>
21#include <linux/raid/md_u.h>
22#include <trace/events/block.h>
23
24#define MaxSector (~(sector_t)0)
25
26enum md_submodule_type {
27 MD_PERSONALITY = 0,
28 MD_CLUSTER,
29 MD_BITMAP,
30};
31
32enum md_submodule_id {
33 ID_LINEAR = LEVEL_LINEAR,
34 ID_RAID0 = 0,
35 ID_RAID1 = 1,
36 ID_RAID4 = 4,
37 ID_RAID5 = 5,
38 ID_RAID6 = 6,
39 ID_RAID10 = 10,
40 ID_CLUSTER,
41 ID_BITMAP,
42 ID_LLBITMAP,
43 ID_BITMAP_NONE,
44};
45
46struct md_submodule_head {
47 enum md_submodule_type type;
48 enum md_submodule_id id;
49 const char *name;
50 struct module *owner;
51};
52
53/*
54 * These flags should really be called "NO_RETRY" rather than
55 * "FAILFAST" because they don't make any promise about time lapse,
56 * only about the number of retries, which will be zero.
57 * REQ_FAILFAST_DRIVER is not included because
58 * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
59 * seems to suggest that the errors it avoids retrying should usually
60 * be retried.
61 */
62#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
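
/*
 * Illustrative sketch (not an md helper): a personality that honours the
 * FailFast flag would typically OR these flags into a bio just before
 * submitting it, e.g.:
 *
 *	if (test_bit(FailFast, &rdev->flags))
 *		bio->bi_opf |= MD_FAILFAST;
 */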
63
64/* Status of sync thread. */
65enum sync_action {
66 /*
 * Represented by MD_RECOVERY_SYNC, started when:
 * 1) after assembly, sync data from the first rdev to the other copies;
 * this must be done before any other sync action and will only execute
 * once;
 * 2) the array is resized (note that this is not a reshape); sync data
 * for the new range.
73 */
74 ACTION_RESYNC,
75 /*
 * Represented by MD_RECOVERY_RECOVER, started when:
 * 1) for a new replacement, sync data based on the rdev being replaced
 * or on available copies from other rdevs;
 * 2) for a new member disk while the array is degraded, sync data from
 * the other rdevs;
 * 3) on reassembly after a power failure, or when re-adding a
 * hot-removed rdev, sync data from the first rdev to the other copies
 * based on the bitmap.
83 */
84 ACTION_RECOVER,
85 /*
 * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED |
 * MD_RECOVERY_CHECK, started when the user echoes "check" to the sysfs
 * api sync_action; used to check whether data copies from different
 * rdevs are the same. The number of mismatched sectors is exported to
 * the user via the sysfs api mismatch_cnt.
91 */
92 ACTION_CHECK,
93 /*
 * Represented by MD_RECOVERY_SYNC | MD_RECOVERY_REQUESTED, started when
 * the user echoes "repair" to the sysfs api sync_action; usually paired
 * with ACTION_CHECK, and used to force a data sync once the user has
 * found inconsistent data.
98 */
99 ACTION_REPAIR,
100 /*
 * Represented by MD_RECOVERY_RESHAPE, started when a new member disk is
 * added to the conf; note that this is different from spares or
 * replacements.
104 */
105 ACTION_RESHAPE,
106 /*
 * Represented by MD_RECOVERY_FROZEN; can be set via the sysfs api
 * sync_action or internally (e.g. when setting the array read-only),
 * and forbids the above actions.
110 */
111 ACTION_FROZEN,
112 /*
 * None of the above actions is active.
114 */
115 ACTION_IDLE,
116 NR_SYNC_ACTIONS,
117};
118
119/*
120 * The struct embedded in rdev is used to serialize IO.
121 */
122struct serial_in_rdev {
123 struct rb_root_cached serial_rb;
124 spinlock_t serial_lock;
125 wait_queue_head_t serial_io_wait;
126};
127
128/*
129 * MD's 'extended' device
130 */
131struct md_rdev {
132 struct list_head same_set; /* RAID devices within the same set */
133
	sector_t sectors;		/* Device size (in 512-byte sectors) */
135 struct mddev *mddev; /* RAID array if running */
136 unsigned long last_events; /* IO event timestamp */
137
138 /*
139 * If meta_bdev is non-NULL, it means that a separate device is
140 * being used to store the metadata (superblock/bitmap) which
141 * would otherwise be contained on the same device as the data (bdev).
142 */
143 struct block_device *meta_bdev;
144 struct block_device *bdev; /* block device handle */
145 struct file *bdev_file; /* Handle from open for bdev */
146
147 struct page *sb_page, *bb_page;
148 int sb_loaded;
149 __u64 sb_events;
150 sector_t data_offset; /* start of data in array */
151 sector_t new_data_offset;/* only relevant while reshaping */
	sector_t sb_start;	/* offset of the superblock (in 512-byte sectors) */
153 int sb_size; /* bytes in the superblock */
154 int preferred_minor; /* autorun support */
155
156 struct kobject kobj;
157
158 /* A device can be in one of three states based on two flags:
159 * Not working: faulty==1 in_sync==0
160 * Fully working: faulty==0 in_sync==1
161 * Working, but not
162 * in sync with array
163 * faulty==0 in_sync==0
164 *
165 * It can never have faulty==1, in_sync==1
166 * This reduces the burden of testing multiple flags in many cases
167 */
168
169 unsigned long flags; /* bit set of 'enum flag_bits' bits. */
170 wait_queue_head_t blocked_wait;
171
172 int desc_nr; /* descriptor index in the superblock */
173 int raid_disk; /* role of device in array */
174 int new_raid_disk; /* role that the device will have in
175 * the array after a level-change completes.
176 */
177 int saved_raid_disk; /* role that device used to have in the
178 * array and could again if we did a partial
179 * resync from the bitmap
180 */
181 union {
182 sector_t recovery_offset;/* If this device has been partially
183 * recovered, this is where we were
184 * up to.
185 */
186 sector_t journal_tail; /* If this device is a journal device,
187 * this is the journal tail (journal
188 * recovery start point)
189 */
190 };
191
192 atomic_t nr_pending; /* number of pending requests.
193 * only maintained for arrays that
194 * support hot removal
195 */
196 atomic_t read_errors; /* number of consecutive read errors that
197 * we have tried to ignore.
198 */
199 time64_t last_read_error; /* monotonic time since our
200 * last read error
201 */
202 atomic_t corrected_errors; /* number of corrected read errors,
203 * for reporting to userspace and storing
204 * in superblock.
205 */
206
207 struct serial_in_rdev *serial; /* used for raid1 io serialization */
208
209 struct kernfs_node *sysfs_state; /* handle for 'state'
210 * sysfs entry */
211 /* handle for 'unacknowledged_bad_blocks' sysfs dentry */
212 struct kernfs_node *sysfs_unack_badblocks;
213 /* handle for 'bad_blocks' sysfs dentry */
214 struct kernfs_node *sysfs_badblocks;
215 struct badblocks badblocks;
216
217 struct {
218 short offset; /* Offset from superblock to start of PPL.
219 * Not used by external metadata. */
220 unsigned int size; /* Size in sectors of the PPL space */
221 sector_t sector; /* First sector of the PPL space */
222 } ppl;
223};
224enum flag_bits {
225 Faulty, /* device is known to have a fault */
226 In_sync, /* device is in_sync with rest of array */
227 Bitmap_sync, /* ..actually, not quite In_sync. Need a
228 * bitmap-based recovery to get fully in sync.
229 * The bit is only meaningful before device
230 * has been passed to pers->hot_add_disk.
231 */
232 WriteMostly, /* Avoid reading if at all possible */
233 AutoDetected, /* added by auto-detect */
234 Blocked, /* An error occurred but has not yet
235 * been acknowledged by the metadata
236 * handler, so don't allow writes
237 * until it is cleared */
238 WriteErrorSeen, /* A write error has been seen on this
239 * device
240 */
241 FaultRecorded, /* Intermediate state for clearing
242 * Blocked. The Fault is/will-be
243 * recorded in the metadata, but that
244 * metadata hasn't been stored safely
245 * on disk yet.
246 */
247 BlockedBadBlocks, /* A writer is blocked because they
248 * found an unacknowledged bad-block.
249 * This can safely be cleared at any
250 * time, and the writer will re-check.
251 * It may be set at any time, and at
252 * worst the writer will timeout and
253 * re-check. So setting it as
254 * accurately as possible is good, but
255 * not absolutely critical.
256 */
257 WantReplacement, /* This device is a candidate to be
258 * hot-replaced, either because it has
259 * reported some faults, or because
260 * of explicit request.
261 */
262 Replacement, /* This device is a replacement for
263 * a want_replacement device with same
264 * raid_disk number.
265 */
266 Candidate, /* For clustered environments only:
267 * This device is seen locally but not
268 * by the whole cluster
269 */
270 Journal, /* This device is used as journal for
271 * raid-5/6.
272 * Usually, this device should be faster
273 * than other devices in the array
274 */
275 ClusterRemove,
276 ExternalBbl, /* External metadata provides bad
277 * block management for a disk
278 */
279 FailFast, /* Minimal retries should be attempted on
280 * this device, so use REQ_FAILFAST_DEV.
281 * Also don't try to repair failed reads.
 * It is expected that no bad block log
 * is present.
284 */
285 LastDev, /* Seems to be the last working dev as
286 * it didn't fail, so don't use FailFast
287 * any more for metadata
288 */
289 CollisionCheck, /*
 * check if there is a collision between raid1
291 * serial bios.
292 */
293 Nonrot, /* non-rotational device (SSD) */
294};
295
296static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors,
297 sector_t *first_bad, sector_t *bad_sectors)
298{
299 if (unlikely(rdev->badblocks.count)) {
		int rv = badblocks_check(&rdev->badblocks,
					 rdev->data_offset + s, sectors,
					 first_bad, bad_sectors);
303 if (rv)
304 *first_bad -= rdev->data_offset;
305 return rv;
306 }
307 return 0;
308}
309
310static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
311 int sectors)
312{
313 sector_t first_bad;
314 sector_t bad_sectors;
315
	return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
317}
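
/*
 * Illustrative use (an assumption, modelled on the read-balance paths): skip
 * a device whose requested range overlaps a recorded bad block, e.g.:
 *
 *	if (rdev_has_badblock(rdev, this_sector, sectors))
 *		continue;	// try another copy of the data
 */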
318
319extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
320 int is_new);
321extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
322 int is_new);
323struct md_cluster_info;
324struct md_cluster_operations;
325
326/**
327 * enum mddev_flags - md device flags.
328 * @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
329 * @MD_CLOSING: If set, we are closing the array, do not open it then.
330 * @MD_JOURNAL_CLEAN: A raid with journal is already clean.
331 * @MD_HAS_JOURNAL: The raid array has journal feature set.
 * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only; this node has already taken
 *                            the resync lock and needs to release it.
334 * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as
335 * calls to md_error() will never cause the array to
336 * become failed.
337 * @MD_HAS_PPL: The raid array has PPL feature set.
338 * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
 * @MD_NOT_READY: do_md_run() is active, so 'array_state' must not report that
 *                the array is ready yet.
341 * @MD_BROKEN: This is used to stop writes and mark array as failed.
342 * @MD_DELETED: This device is being deleted
343 *
 * change UNSUPPORTED_MDDEV_FLAGS for each array type if a new flag is added
345 */
346enum mddev_flags {
347 MD_ARRAY_FIRST_USE,
348 MD_CLOSING,
349 MD_JOURNAL_CLEAN,
350 MD_HAS_JOURNAL,
351 MD_CLUSTER_RESYNC_LOCKED,
352 MD_FAILFAST_SUPPORTED,
353 MD_HAS_PPL,
354 MD_HAS_MULTIPLE_PPLS,
355 MD_NOT_READY,
356 MD_BROKEN,
357 MD_DELETED,
358};
359
360enum mddev_sb_flags {
361 MD_SB_CHANGE_DEVS, /* Some device status has changed */
362 MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */
363 MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */
364 MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
365};
366
367#define NR_SERIAL_INFOS 8
368/* record current range of serialize IOs */
369struct serial_info {
370 struct rb_node node;
371 sector_t start; /* start sector of rb node */
372 sector_t last; /* end sector of rb node */
373 sector_t _subtree_last; /* highest sector in subtree of rb node */
374};
375
376/*
377 * mddev->curr_resync stores the current sector of the resync but
378 * also has some overloaded values.
379 */
380enum {
381 /* No resync in progress */
382 MD_RESYNC_NONE = 0,
383 /* Yielded to allow another conflicting resync to commence */
384 MD_RESYNC_YIELDED = 1,
385 /* Delayed to check that there is no conflict with another sync */
386 MD_RESYNC_DELAYED = 2,
387 /* Any value greater than or equal to this is in an active resync */
388 MD_RESYNC_ACTIVE = 3,
389};
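
/*
 * Example (illustrative): callers compare curr_resync against these markers
 * rather than treating small values as real sector numbers, e.g.:
 *
 *	if (mddev->curr_resync >= MD_RESYNC_ACTIVE)
 *		pr_debug("resync is past sector %llu\n",
 *			 (unsigned long long)mddev->curr_resync);
 */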
390
391struct mddev {
392 void *private;
393 struct md_personality *pers;
394 dev_t unit;
395 int md_minor;
396 struct list_head disks;
397 unsigned long flags;
398 unsigned long sb_flags;
399
400 int suspended;
401 struct mutex suspend_mutex;
402 struct percpu_ref active_io;
403 int ro;
404 int sysfs_active; /* set when sysfs deletes
405 * are happening, so run/
406 * takeover/stop are not safe
407 */
408 struct gendisk *gendisk; /* mdraid gendisk */
409 struct gendisk *dm_gendisk; /* dm-raid gendisk */
410
411 struct kobject kobj;
412 int hold_active;
413#define UNTIL_IOCTL 1
414#define UNTIL_STOP 2
415
416 /* Superblock information */
417 int major_version,
418 minor_version,
419 patch_version;
420 int persistent;
421 int external; /* metadata is
422 * managed externally */
423 char metadata_type[17]; /* externally set*/
424 int chunk_sectors;
425 time64_t ctime, utime;
426 int level, layout;
427 char clevel[16];
428 int raid_disks;
429 int max_disks;
430 sector_t dev_sectors; /* used size of
431 * component devices */
432 sector_t array_sectors; /* exported array size */
433 int external_size; /* size managed
434 * externally */
435 __u64 events;
436 /* If the last 'event' was simply a clean->dirty transition, and
437 * we didn't write it to the spares, then it is safe and simple
438 * to just decrement the event count on a dirty->clean transition.
439 * So we record that possibility here.
440 */
441 int can_decrease_events;
442
443 char uuid[16];
444
445 /* If the array is being reshaped, we need to record the
446 * new shape and an indication of where we are up to.
447 * This is written to the superblock.
448 * If reshape_position is MaxSector, then no reshape is happening (yet).
449 */
450 sector_t reshape_position;
451 int delta_disks, new_level, new_layout;
452 int new_chunk_sectors;
453 int reshape_backwards;
454
455 struct md_thread __rcu *thread; /* management thread */
456 struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */
457
458 /*
459 * Set when a sync operation is started. It holds this value even
460 * when the sync thread is "frozen" (interrupted) or "idle" (stopped
461 * or finished). It is overwritten when a new sync operation is begun.
462 */
463 enum sync_action last_sync_action;
464 sector_t curr_resync; /* last block scheduled */
465 /* As resync requests can complete out of order, we cannot easily track
466 * how much resync has been completed. So we occasionally pause until
467 * everything completes, then set curr_resync_completed to curr_resync.
468 * As such it may be well behind the real resync mark, but it is a value
469 * we are certain of.
470 */
471 sector_t curr_resync_completed;
472 unsigned long resync_mark; /* a recent timestamp */
473 sector_t resync_mark_cnt;/* blocks written at resync_mark */
474 sector_t curr_mark_cnt; /* blocks scheduled now */
475
476 sector_t resync_max_sectors; /* may be set by personality */
477
478 atomic64_t resync_mismatches; /* count of sectors where
479 * parity/replica mismatch found
480 */
481
482 /* allow user-space to request suspension of IO to regions of the array */
483 sector_t suspend_lo;
484 sector_t suspend_hi;
485 /* if zero, use the system-wide default */
486 int sync_speed_min;
487 int sync_speed_max;
488 int sync_io_depth;
489
490 /* resync even though the same disks are shared among md-devices */
491 int parallel_resync;
492
493 int ok_start_degraded;
494
495 unsigned long recovery;
496 /* If a RAID personality determines that recovery (of a particular
497 * device) will fail due to a read error on the source device, it
498 * takes a copy of this number and does not attempt recovery again
499 * until this number changes.
500 */
501 int recovery_disabled;
502
	int				in_sync;	/* known to not need resync */
504 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
505 * that we are never stopping an array while it is open.
506 * 'reconfig_mutex' protects all other reconfiguration.
507 * These locks are separate due to conflicting interactions
508 * with disk->open_mutex.
509 * Lock ordering is:
510 * reconfig_mutex -> disk->open_mutex
511 * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open
512 */
513 struct mutex open_mutex;
514 struct mutex reconfig_mutex;
515 atomic_t active; /* general refcount */
516 atomic_t openers; /* number of active opens */
517
518 int changed; /* True if we might need to
519 * reread partition info */
520 int degraded; /* whether md should consider
521 * adding a spare
522 */
523
524 unsigned long normal_io_events; /* IO event timestamp */
525 atomic_t recovery_active; /* blocks scheduled, but not written */
526 wait_queue_head_t recovery_wait;
527 sector_t resync_offset;
528 sector_t resync_min; /* user requested sync
529 * starts here */
530 sector_t resync_max; /* resync should pause
531 * when it gets here */
532
533 struct kernfs_node *sysfs_state; /* handle for 'array_state'
534 * file in sysfs.
535 */
536 struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
537 struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */
538 struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
539 struct kernfs_node *sysfs_level; /*handle for 'level' */
540
541 /* used for delayed sysfs removal */
542 struct work_struct del_work;
543 /* used for register new sync thread */
544 struct work_struct sync_work;
545
546 /* "lock" protects:
547 * flush_bio transition from NULL to !NULL
548 * rdev superblocks, events
549 * clearing MD_CHANGE_*
550 * in_sync - and related safemode and MD_CHANGE changes
551 * pers (also protected by reconfig_mutex and pending IO).
552 * clearing ->bitmap
553 * clearing ->bitmap_info.file
554 * changing ->resync_{min,max}
555 * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max})
556 */
557 spinlock_t lock;
558 wait_queue_head_t sb_wait; /* for waiting on superblock updates */
559 atomic_t pending_writes; /* number of active superblock writes */
560
561 unsigned int safemode; /* if set, update "clean" superblock
562 * when no writes pending.
563 */
564 unsigned int safemode_delay;
565 struct timer_list safemode_timer;
566 struct percpu_ref writes_pending;
567 int sync_checkers; /* # of threads checking writes_pending */
568
569 enum md_submodule_id bitmap_id;
570 void *bitmap; /* the bitmap for the device */
571 struct bitmap_operations *bitmap_ops;
572 struct {
573 struct file *file; /* the bitmap file */
574 loff_t offset; /* offset from superblock of
575 * start of bitmap. May be
576 * negative, but not '0'
577 * For external metadata, offset
578 * from start of device.
579 */
580 unsigned long space; /* space available at this offset */
581 loff_t default_offset; /* this is the offset to use when
582 * hot-adding a bitmap. It should
583 * eventually be settable by sysfs.
584 */
585 unsigned long default_space; /* space available at
586 * default offset */
587 struct mutex mutex;
588 unsigned long chunksize;
589 unsigned long daemon_sleep; /* how many jiffies between updates? */
590 unsigned long max_write_behind; /* write-behind mode */
591 int external;
592 int nodes; /* Maximum number of nodes in the cluster */
593 char cluster_name[64]; /* Name of the cluster */
594 } bitmap_info;
595
596 atomic_t max_corr_read_errors; /* max read retries */
597 struct list_head all_mddevs;
598
599 const struct attribute_group *to_remove;
600
601 struct bio_set bio_set;
602 struct bio_set sync_set; /* for sync operations like
603 * metadata and bitmap writes
604 */
605 struct bio_set io_clone_set;
606
607 struct work_struct event_work; /* used by dm to report failure event */
608 mempool_t *serial_info_pool;
609 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
610 struct md_cluster_info *cluster_info;
611 struct md_cluster_operations *cluster_ops;
612 unsigned int good_device_nr; /* good device num within cluster raid */
613 unsigned int noio_flag; /* for memalloc scope API */
614
615 /*
 * Temporarily store rdevs that will finally be removed when
 * reconfig_mutex is unlocked; protected by reconfig_mutex.
618 */
619 struct list_head deleting;
620
621 /* The sequence number for sync thread */
622 atomic_t sync_seq;
623
624 bool has_superblocks:1;
625 bool fail_last_dev:1;
626 bool serialize_policy:1;
627};
628
629enum recovery_flags {
630 /* flags for sync thread running status */
631
632 /*
 * set when one of the sync actions is set and a new sync thread needs
 * to be registered, or when spares are just added/removed from the conf.
635 */
636 MD_RECOVERY_NEEDED,
637 /* sync thread is running, or about to be started */
638 MD_RECOVERY_RUNNING,
639 /* sync thread needs to be aborted for some reason */
640 MD_RECOVERY_INTR,
641 /* sync thread is done and is waiting to be unregistered */
642 MD_RECOVERY_DONE,
643 /* running sync thread must abort immediately, and not restart */
644 MD_RECOVERY_FROZEN,
645 /* waiting for pers->start() to finish */
646 MD_RECOVERY_WAIT,
	/* interrupted because of an IO error */
648 MD_RECOVERY_ERROR,
649
	/* flags that determine the sync action, see details in enum sync_action */
651
652 /* if just this flag is set, action is resync. */
653 MD_RECOVERY_SYNC,
654 /*
 * paired with MD_RECOVERY_SYNC; if MD_RECOVERY_CHECK is not set, the
 * action is repair, meaning the user requested a resync.
657 */
658 MD_RECOVERY_REQUESTED,
659 /*
660 * paired with MD_RECOVERY_SYNC and MD_RECOVERY_REQUESTED, action is
661 * check.
662 */
663 MD_RECOVERY_CHECK,
664 /* recovery, or need to try it */
665 MD_RECOVERY_RECOVER,
666 /* reshape */
667 MD_RECOVERY_RESHAPE,
668 /* remote node is running resync thread */
669 MD_RESYNCING_REMOTE,
670 /* raid456 lazy initial recover */
671 MD_RECOVERY_LAZY_RECOVER,
672};
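
/*
 * Typical trigger sketch: most callers don't pick a sync action directly,
 * they flag that one may be needed and wake the management thread, which
 * works out the exact action from the flags above:
 *
 *	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 *	md_wakeup_thread(mddev->thread);
 */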
673
674enum md_ro_state {
675 MD_RDWR,
676 MD_RDONLY,
677 MD_AUTO_READ,
678 MD_MAX_STATE
679};
680
681static inline bool md_is_rdwr(struct mddev *mddev)
682{
683 return (mddev->ro == MD_RDWR);
684}
685
686static inline bool reshape_interrupted(struct mddev *mddev)
687{
	/* reshape never started */
689 if (mddev->reshape_position == MaxSector)
690 return false;
691
692 /* interrupted */
693 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
694 return true;
695
696 /* running reshape will be interrupted soon. */
697 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
698 test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
699 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
700 return true;
701
702 return false;
703}
704
705static inline int __must_check mddev_lock(struct mddev *mddev)
706{
707 int ret;
708
	ret = mutex_lock_interruptible(&mddev->reconfig_mutex);
710
711 /* MD_DELETED is set in do_md_stop with reconfig_mutex.
712 * So check it here.
713 */
714 if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
715 ret = -ENODEV;
		mutex_unlock(&mddev->reconfig_mutex);
717 }
718
719 return ret;
720}
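
/*
 * Usage sketch (as in the sysfs store handlers): take the lock, bail out on
 * a signal or a deleted array, and always pair with mddev_unlock(), e.g.:
 *
 *	err = mddev_lock(mddev);
 *	if (err)
 *		return err;
 *	// ... reconfigure the array ...
 *	mddev_unlock(mddev);
 */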
721
722/* Sometimes we need to take the lock in a situation where
723 * failure due to interrupts is not acceptable.
 * It doesn't need to check MD_DELETED here: the owner that holds the
 * lock here can't be stopped, and no path may call this function
 * after do_md_stop.
727 */
728static inline void mddev_lock_nointr(struct mddev *mddev)
729{
	mutex_lock(&mddev->reconfig_mutex);
731}
732
733static inline int mddev_trylock(struct mddev *mddev)
734{
735 int ret;
736
	ret = mutex_trylock(&mddev->reconfig_mutex);
738 if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
739 ret = -ENODEV;
		mutex_unlock(&mddev->reconfig_mutex);
741 }
742 return ret;
743}
744extern void mddev_unlock(struct mddev *mddev);
745
746struct md_personality
747{
748 struct md_submodule_head head;
749
750 bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
751 /*
 * start-up work that does NOT require md_thread; tasks that
 * require md_thread should go into start()
754 */
755 int (*run)(struct mddev *mddev);
	/* start-up work that requires md threads */
757 int (*start)(struct mddev *mddev);
758 void (*free)(struct mddev *mddev, void *priv);
759 void (*status)(struct seq_file *seq, struct mddev *mddev);
760 /* error_handler must set ->faulty and clear ->in_sync
761 * if appropriate, and should abort recovery if needed
762 */
763 void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
764 int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
765 int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
766 int (*spare_active) (struct mddev *mddev);
767 sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr,
768 sector_t max_sector, int *skipped);
769 int (*resize) (struct mddev *mddev, sector_t sectors);
770 sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
771 int (*check_reshape) (struct mddev *mddev);
772 int (*start_reshape) (struct mddev *mddev);
773 void (*finish_reshape) (struct mddev *mddev);
774 void (*update_reshape_pos) (struct mddev *mddev);
775 void (*prepare_suspend) (struct mddev *mddev);
776 /* quiesce suspends or resumes internal processing.
777 * 1 - stop new actions and wait for action io to complete
778 * 0 - return to normal behaviour
779 */
780 void (*quiesce) (struct mddev *mddev, int quiesce);
781 /* takeover is used to transition an array from one
782 * personality to another. The new personality must be able
783 * to handle the data in the current layout.
784 * e.g. 2drive raid1 -> 2drive raid5
785 * ndrive raid5 -> degraded n+1drive raid6 with special layout
786 * If the takeover succeeds, a new 'private' structure is returned.
787 * This needs to be installed and then ->run used to activate the
788 * array.
789 */
790 void *(*takeover) (struct mddev *mddev);
791 /* Changes the consistency policy of an active array. */
792 int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
793 /* convert io ranges from array to bitmap */
794 void (*bitmap_sector)(struct mddev *mddev, sector_t *offset,
795 unsigned long *sectors);
796};
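
/*
 * Registration sketch (illustrative; the "example_*" names are hypothetical):
 * a personality embeds a struct md_submodule_head and registers it at module
 * init time, e.g.:
 *
 *	static struct md_personality example_personality = {
 *		.head = {
 *			.type	= MD_PERSONALITY,
 *			.id	= ID_RAID0,
 *			.name	= "raid0",
 *			.owner	= THIS_MODULE,
 *		},
 *		.make_request	= example_make_request,
 *		.run		= example_run,
 *		.free		= example_free,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return register_md_submodule(&example_personality.head);
 *	}
 */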
797
798struct md_sysfs_entry {
799 struct attribute attr;
800 ssize_t (*show)(struct mddev *, char *);
801 ssize_t (*store)(struct mddev *, const char *, size_t);
802};
803
804static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
805{
806 if (sd)
		return sysfs_get_dirent(sd, name);
808 return sd;
809}
810static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
811{
812 if (sd)
		sysfs_notify_dirent(sd);
814}
815
816static inline char * mdname (struct mddev * mddev)
817{
818 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
819}
820
821static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
822{
823 char nm[20];
824 if (!test_bit(Replacement, &rdev->flags) &&
825 !test_bit(Journal, &rdev->flags) &&
826 mddev->kobj.sd) {
		sprintf(nm, "rd%d", rdev->raid_disk);
		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
829 } else
830 return 0;
831}
832
833static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
834{
835 char nm[20];
836 if (!test_bit(Replacement, &rdev->flags) &&
837 !test_bit(Journal, &rdev->flags) &&
838 mddev->kobj.sd) {
		sprintf(nm, "rd%d", rdev->raid_disk);
		sysfs_remove_link(&mddev->kobj, nm);
841 }
842}
843
844/*
845 * iterates through some rdev ringlist. It's safe to remove the
 * current 'rdev'. Don't touch 'tmp' though.
847 */
848#define rdev_for_each_list(rdev, tmp, head) \
849 list_for_each_entry_safe(rdev, tmp, head, same_set)
850
851/*
852 * iterates through the 'same array disks' ringlist
853 */
854#define rdev_for_each(rdev, mddev) \
855 list_for_each_entry(rdev, &((mddev)->disks), same_set)
856
857#define rdev_for_each_safe(rdev, tmp, mddev) \
858 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
859
860#define rdev_for_each_rcu(rdev, mddev) \
861 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
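
/*
 * Example (illustrative): walking the member devices of an array, normally
 * under reconfig_mutex (or under RCU for the _rcu variant):
 *
 *	struct md_rdev *rdev;
 *
 *	rdev_for_each(rdev, mddev) {
 *		if (test_bit(Faulty, &rdev->flags))
 *			continue;
 *		// ... use rdev ...
 *	}
 */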
862
863struct md_thread {
864 void (*run) (struct md_thread *thread);
865 struct mddev *mddev;
866 wait_queue_head_t wqueue;
867 unsigned long flags;
868 struct task_struct *tsk;
869 unsigned long timeout;
870 void *private;
871};
872
873struct md_io_clone {
874 struct mddev *mddev;
875 struct bio *orig_bio;
876 unsigned long start_time;
877 sector_t offset;
878 unsigned long sectors;
879 enum stat_group rw;
880 struct bio bio_clone;
881};
882
883#define THREAD_WAKEUP 0
884
885static inline void safe_put_page(struct page *p)
886{
	if (p) put_page(p);
888}
889
890int register_md_submodule(struct md_submodule_head *msh);
891void unregister_md_submodule(struct md_submodule_head *msh);
892
893extern struct md_thread *md_register_thread(
894 void (*run)(struct md_thread *thread),
895 struct mddev *mddev,
896 const char *name);
897extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp);
898extern void md_wakeup_thread(struct md_thread __rcu *thread);
899extern void md_check_recovery(struct mddev *mddev);
900extern void md_reap_sync_thread(struct mddev *mddev);
901extern enum sync_action md_sync_action(struct mddev *mddev);
902extern enum sync_action md_sync_action_by_name(const char *page);
903extern const char *md_sync_action_name(enum sync_action action);
904extern void md_write_start(struct mddev *mddev, struct bio *bi);
905extern void md_write_inc(struct mddev *mddev, struct bio *bi);
906extern void md_write_end(struct mddev *mddev);
907extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
908extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
909extern void md_finish_reshape(struct mddev *mddev);
910void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
911 struct bio *bio, sector_t start, sector_t size);
912void md_account_bio(struct mddev *mddev, struct bio **bio);
913void md_free_cloned_bio(struct bio *bio);
914
915extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
916void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
917 sector_t sector, int size, struct page *page,
918 unsigned int offset);
919extern int md_super_wait(struct mddev *mddev);
920extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
921 struct page *page, blk_opf_t opf, bool metadata_op);
922extern void md_do_sync(struct md_thread *thread);
923extern void md_new_event(void);
924extern void md_allow_write(struct mddev *mddev);
925extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
926extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
927extern int md_check_no_bitmap(struct mddev *mddev);
928extern int md_integrity_register(struct mddev *mddev);
929extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
930
931extern int mddev_init(struct mddev *mddev);
932extern void mddev_destroy(struct mddev *mddev);
933void md_init_stacking_limits(struct queue_limits *lim);
934struct mddev *md_alloc(dev_t dev, char *name);
935void mddev_put(struct mddev *mddev);
936extern int md_run(struct mddev *mddev);
937extern int md_start(struct mddev *mddev);
938extern void md_stop(struct mddev *mddev);
939extern void md_stop_writes(struct mddev *mddev);
940extern int md_rdev_init(struct md_rdev *rdev);
941extern void md_rdev_clear(struct md_rdev *rdev);
942
943extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
944extern int mddev_suspend(struct mddev *mddev, bool interruptible);
945extern void mddev_resume(struct mddev *mddev);
946extern void md_idle_sync_thread(struct mddev *mddev);
947extern void md_frozen_sync_thread(struct mddev *mddev);
948extern void md_unfrozen_sync_thread(struct mddev *mddev);
949
950extern void md_update_sb(struct mddev *mddev, int force);
951extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
952extern void mddev_destroy_serial_pool(struct mddev *mddev,
953 struct md_rdev *rdev);
954struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
955struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
956
957static inline bool is_rdev_broken(struct md_rdev *rdev)
958{
	return !disk_live(rdev->bdev->bd_disk);
960}
961
962static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
963{
964 int faulty = test_bit(Faulty, &rdev->flags);
	if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
968 }
969}
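
/*
 * Illustrative pattern (an assumption, modelled on how the personalities use
 * nr_pending): take a reference before issuing IO to an rdev and drop it in
 * the completion path, e.g.:
 *
 *	atomic_inc(&rdev->nr_pending);
 *	// ... submit bio to rdev->bdev ...
 *	rdev_dec_pending(rdev, mddev);	// from the bio end_io handler
 */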
970
971static inline int mddev_is_clustered(struct mddev *mddev)
972{
973 return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
974}
975
976/* clear unsupported mddev_flags */
977static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
978 unsigned long unsupported_flags)
979{
980 mddev->flags &= ~unsupported_flags;
981}
982
983static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
984{
985 if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
986 !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
987 mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0;
988}
989
990static inline int mddev_suspend_and_lock(struct mddev *mddev)
991{
992 int ret;
993
	ret = mddev_suspend(mddev, true);
995 if (ret)
996 return ret;
997
998 ret = mddev_lock(mddev);
999 if (ret)
1000 mddev_resume(mddev);
1001
1002 return ret;
1003}
1004
1005static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
1006{
	mddev_suspend(mddev, false);
	mutex_lock(&mddev->reconfig_mutex);
1009}
1010
1011static inline void mddev_unlock_and_resume(struct mddev *mddev)
1012{
1013 mddev_unlock(mddev);
1014 mddev_resume(mddev);
1015}
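
/*
 * Usage sketch: the suspend/lock helpers above are meant to be used as a
 * pair, e.g.:
 *
 *	err = mddev_suspend_and_lock(mddev);
 *	if (err)
 *		return err;
 *	// ... change configuration that affects in-flight IO ...
 *	mddev_unlock_and_resume(mddev);
 */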
1016
1017struct mdu_array_info_s;
1018struct mdu_disk_info_s;
1019
1020extern int mdp_major;
1021void md_autostart_arrays(int part);
1022int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
1023int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
1024int do_md_run(struct mddev *mddev);
1025#define MDDEV_STACK_INTEGRITY (1u << 0)
1026int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
1027 unsigned int flags);
1028int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev);
1029void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes);
1030
1031extern const struct block_device_operations md_fops;
1032
1033/*
 * MD devices can be used underneath by DM, in which case ->gendisk is NULL.
1035 */
1036static inline bool mddev_is_dm(struct mddev *mddev)
1037{
1038 return !mddev->gendisk;
1039}
1040
1041static inline bool raid_is_456(struct mddev *mddev)
1042{
1043 return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 ||
1044 mddev->level == ID_RAID6;
1045}
1046
1047static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
1048 sector_t sector)
1049{
1050 if (!mddev_is_dm(mddev))
		trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
1052}
1053
1054static inline bool rdev_blocked(struct md_rdev *rdev)
1055{
1056 /*
 * Blocked is set by the error handler and cleared by the daemon after
 * updating the superblock; meanwhile write IO should be blocked to
 * prevent reading old data after a power failure.
1060 */
1061 if (test_bit(Blocked, &rdev->flags))
1062 return true;
1063
1064 /*
 * A faulty device should not be accessed any more; there is no need
 * to wait for bad blocks to be acknowledged.
1067 */
1068 if (test_bit(Faulty, &rdev->flags))
1069 return false;
1070
1071 /* rdev is blocked by badblocks. */
1072 if (test_bit(BlockedBadBlocks, &rdev->flags))
1073 return true;
1074
1075 return false;
1076}
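
/*
 * Illustrative use (an assumption): a writer that finds a blocked rdev
 * typically takes a reference and waits for the metadata update to finish
 * before retrying the IO, e.g.:
 *
 *	if (rdev_blocked(rdev)) {
 *		atomic_inc(&rdev->nr_pending);
 *		md_wait_for_blocked_rdev(rdev, mddev);	// drops the reference
 *	}
 */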
1077
1078#define mddev_add_trace_msg(mddev, fmt, args...) \
1079do { \
1080 if (!mddev_is_dm(mddev)) \
1081 blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \
1082} while (0)
1083
1084#endif /* _MD_MD_H */
1085