| 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | /* |
| 3 | * linux/fs/buffer.c |
| 4 | * |
| 5 | * Copyright (C) 1991, 1992, 2002 Linus Torvalds |
| 6 | */ |
| 7 | |
| 8 | /* |
| 9 | * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 |
| 10 | * |
| 11 | * Removed a lot of unnecessary code and simplified things now that |
| 12 | * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 |
| 13 | * |
| 14 | * Speed up hash, lru, and free list operations. Use gfp() for allocating |
| 15 | * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM |
| 16 | * |
| 17 | * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
| 18 | * |
| 19 | * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> |
| 20 | */ |
| 21 | |
| 22 | #include <linux/kernel.h> |
| 23 | #include <linux/sched/signal.h> |
| 24 | #include <linux/syscalls.h> |
| 25 | #include <linux/fs.h> |
| 26 | #include <linux/iomap.h> |
| 27 | #include <linux/mm.h> |
| 28 | #include <linux/percpu.h> |
| 29 | #include <linux/slab.h> |
| 30 | #include <linux/capability.h> |
| 31 | #include <linux/blkdev.h> |
| 32 | #include <linux/file.h> |
| 33 | #include <linux/quotaops.h> |
| 34 | #include <linux/highmem.h> |
| 35 | #include <linux/export.h> |
| 36 | #include <linux/backing-dev.h> |
| 37 | #include <linux/writeback.h> |
| 38 | #include <linux/hash.h> |
| 39 | #include <linux/suspend.h> |
| 40 | #include <linux/buffer_head.h> |
| 41 | #include <linux/task_io_accounting_ops.h> |
| 42 | #include <linux/bio.h> |
| 43 | #include <linux/cpu.h> |
| 44 | #include <linux/bitops.h> |
| 45 | #include <linux/mpage.h> |
| 46 | #include <linux/bit_spinlock.h> |
| 47 | #include <linux/pagevec.h> |
| 48 | #include <linux/sched/mm.h> |
| 49 | #include <trace/events/block.h> |
| 50 | #include <linux/fscrypt.h> |
| 51 | #include <linux/fsverity.h> |
| 52 | #include <linux/sched/isolation.h> |
| 53 | |
| 54 | #include "internal.h" |
| 55 | |
| 56 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); |
| 57 | static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, |
| 58 | enum rw_hint hint, struct writeback_control *wbc); |
| 59 | |
| 60 | #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) |
| 61 | |
| 62 | inline void touch_buffer(struct buffer_head *bh) |
| 63 | { |
| 64 | trace_block_touch_buffer(bh); |
| 65 | folio_mark_accessed(bh->b_folio); |
| 66 | } |
| 67 | EXPORT_SYMBOL(touch_buffer); |
| 68 | |
| 69 | void __lock_buffer(struct buffer_head *bh) |
| 70 | { |
| 71 | wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
| 72 | } |
| 73 | EXPORT_SYMBOL(__lock_buffer); |
| 74 | |
| 75 | void unlock_buffer(struct buffer_head *bh) |
| 76 | { |
| 77 | clear_bit_unlock(BH_Lock, &bh->b_state);
| 78 | smp_mb__after_atomic();
| 79 | wake_up_bit(&bh->b_state, BH_Lock);
| 80 | } |
| 81 | EXPORT_SYMBOL(unlock_buffer); |
| 82 | |
| 83 | /* |
| 84 | * Return whether the folio has dirty or writeback buffers. If all the buffers
| 85 | * are unlocked and clean then the folio_test_dirty information is stale. If |
| 86 | * any of the buffers are locked, it is assumed they are locked for IO. |
| 87 | */ |
| 88 | void buffer_check_dirty_writeback(struct folio *folio, |
| 89 | bool *dirty, bool *writeback) |
| 90 | { |
| 91 | struct buffer_head *head, *bh; |
| 92 | *dirty = false; |
| 93 | *writeback = false; |
| 94 | |
| 95 | BUG_ON(!folio_test_locked(folio)); |
| 96 | |
| 97 | head = folio_buffers(folio); |
| 98 | if (!head) |
| 99 | return; |
| 100 | |
| 101 | if (folio_test_writeback(folio)) |
| 102 | *writeback = true; |
| 103 | |
| 104 | bh = head; |
| 105 | do { |
| 106 | if (buffer_locked(bh)) |
| 107 | *writeback = true; |
| 108 | |
| 109 | if (buffer_dirty(bh)) |
| 110 | *dirty = true; |
| 111 | |
| 112 | bh = bh->b_this_page; |
| 113 | } while (bh != head); |
| 114 | } |
| 115 | |
| 116 | /* |
| 117 | * Block until a buffer comes unlocked. This doesn't stop it |
| 118 | * from becoming locked again - you have to lock it yourself |
| 119 | * if you want to preserve its state. |
| 120 | */ |
| 121 | void __wait_on_buffer(struct buffer_head * bh) |
| 122 | { |
| 123 | wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
| 124 | } |
| 125 | EXPORT_SYMBOL(__wait_on_buffer); |
| 126 | |
| 127 | static void buffer_io_error(struct buffer_head *bh, char *msg) |
| 128 | { |
| 129 | if (!test_bit(BH_Quiet, &bh->b_state)) |
| 130 | printk_ratelimited(KERN_ERR |
| 131 | "Buffer I/O error on dev %pg, logical block %llu%s\n" , |
| 132 | bh->b_bdev, (unsigned long long)bh->b_blocknr, msg); |
| 133 | } |
| 134 | |
| 135 | /* |
| 136 | * End-of-IO handler helper function which does not touch the bh after |
| 137 | * unlocking it. |
| 138 | * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but |
| 139 | * a race there is benign: unlock_buffer() only uses the bh's address for
| 140 | * hashing after unlocking the buffer, so it doesn't actually touch the bh |
| 141 | * itself. |
| 142 | */ |
| 143 | static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) |
| 144 | { |
| 145 | if (uptodate) { |
| 146 | set_buffer_uptodate(bh); |
| 147 | } else { |
| 148 | /* This happens, due to failed read-ahead attempts. */ |
| 149 | clear_buffer_uptodate(bh); |
| 150 | } |
| 151 | unlock_buffer(bh); |
| 152 | } |
| 153 | |
| 154 | /* |
| 155 | * Default synchronous end-of-IO handler. Just mark it up-to-date and
| 156 | * unlock the buffer. |
| 157 | */ |
| 158 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate) |
| 159 | { |
| 160 | put_bh(bh); |
| 161 | __end_buffer_read_notouch(bh, uptodate); |
| 162 | } |
| 163 | EXPORT_SYMBOL(end_buffer_read_sync); |
| 164 | |
| 165 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate) |
| 166 | { |
| 167 | if (uptodate) { |
| 168 | set_buffer_uptodate(bh); |
| 169 | } else { |
| 170 | buffer_io_error(bh, ", lost sync page write");
| 171 | mark_buffer_write_io_error(bh); |
| 172 | clear_buffer_uptodate(bh); |
| 173 | } |
| 174 | unlock_buffer(bh); |
| 175 | put_bh(bh); |
| 176 | } |
| 177 | EXPORT_SYMBOL(end_buffer_write_sync); |
| 178 | |
| 179 | static struct buffer_head * |
| 180 | __find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic) |
| 181 | { |
| 182 | struct address_space *bd_mapping = bdev->bd_mapping; |
| 183 | const int blkbits = bd_mapping->host->i_blkbits; |
| 184 | struct buffer_head *ret = NULL; |
| 185 | pgoff_t index; |
| 186 | struct buffer_head *bh; |
| 187 | struct buffer_head *head; |
| 188 | struct folio *folio; |
| 189 | int all_mapped = 1; |
| 190 | static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); |
| 191 | |
| 192 | index = ((loff_t)block << blkbits) / PAGE_SIZE; |
| 193 | folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
| 194 | if (IS_ERR(folio))
| 195 | goto out; |
| 196 | |
| 197 | /* |
| 198 | * Folio lock protects the buffers. Callers that cannot block |
| 199 | * will fallback to serializing vs try_to_free_buffers() via |
| 200 | * the i_private_lock. |
| 201 | */ |
| 202 | if (atomic) |
| 203 | spin_lock(&bd_mapping->i_private_lock);
| 204 | else |
| 205 | folio_lock(folio); |
| 206 | |
| 207 | head = folio_buffers(folio); |
| 208 | if (!head) |
| 209 | goto out_unlock; |
| 210 | /* |
| 211 | * Upon a noref migration, the folio lock serializes here; |
| 212 | * otherwise bail. |
| 213 | */ |
| 214 | if (test_bit_acquire(BH_Migrate, &head->b_state)) { |
| 215 | WARN_ON(!atomic); |
| 216 | goto out_unlock; |
| 217 | } |
| 218 | |
| 219 | bh = head; |
| 220 | do { |
| 221 | if (!buffer_mapped(bh)) |
| 222 | all_mapped = 0; |
| 223 | else if (bh->b_blocknr == block) { |
| 224 | ret = bh; |
| 225 | get_bh(bh); |
| 226 | goto out_unlock; |
| 227 | } |
| 228 | bh = bh->b_this_page; |
| 229 | } while (bh != head); |
| 230 | |
| 231 | /* we might be here because some of the buffers on this page are |
| 232 | * not mapped. This is due to various races between |
| 233 | * file io on the block device and getblk. It gets dealt with |
| 234 | * elsewhere, don't buffer_error if we had some unmapped buffers |
| 235 | */ |
| 236 | ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
| 237 | if (all_mapped && __ratelimit(&last_warned)) { |
| 238 | printk("__find_get_block_slow() failed. block=%llu, " |
| 239 | "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, " |
| 240 | "device %pg blocksize: %d\n" , |
| 241 | (unsigned long long)block, |
| 242 | (unsigned long long)bh->b_blocknr, |
| 243 | bh->b_state, bh->b_size, bdev, |
| 244 | 1 << blkbits); |
| 245 | } |
| 246 | out_unlock: |
| 247 | if (atomic) |
| 248 | spin_unlock(&bd_mapping->i_private_lock);
| 249 | else |
| 250 | folio_unlock(folio); |
| 251 | folio_put(folio); |
| 252 | out: |
| 253 | return ret; |
| 254 | } |
| 255 | |
| 256 | static void end_buffer_async_read(struct buffer_head *bh, int uptodate) |
| 257 | { |
| 258 | unsigned long flags; |
| 259 | struct buffer_head *first; |
| 260 | struct buffer_head *tmp; |
| 261 | struct folio *folio; |
| 262 | int folio_uptodate = 1; |
| 263 | |
| 264 | BUG_ON(!buffer_async_read(bh)); |
| 265 | |
| 266 | folio = bh->b_folio; |
| 267 | if (uptodate) { |
| 268 | set_buffer_uptodate(bh); |
| 269 | } else { |
| 270 | clear_buffer_uptodate(bh); |
| 271 | buffer_io_error(bh, ", async page read");
| 272 | } |
| 273 | |
| 274 | /* |
| 275 | * Be _very_ careful from here on. Bad things can happen if |
| 276 | * two buffer heads end IO at almost the same time and both |
| 277 | * decide that the page is now completely done. |
| 278 | */ |
| 279 | first = folio_buffers(folio); |
| 280 | spin_lock_irqsave(&first->b_uptodate_lock, flags); |
| 281 | clear_buffer_async_read(bh); |
| 282 | unlock_buffer(bh); |
| 283 | tmp = bh; |
| 284 | do { |
| 285 | if (!buffer_uptodate(tmp))
| 286 | folio_uptodate = 0;
| 287 | if (buffer_async_read(tmp)) {
| 288 | BUG_ON(!buffer_locked(tmp)); |
| 289 | goto still_busy; |
| 290 | } |
| 291 | tmp = tmp->b_this_page; |
| 292 | } while (tmp != bh); |
| 293 | spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
| 294 | 
| 295 | folio_end_read(folio, folio_uptodate);
| 296 | return; |
| 297 | |
| 298 | still_busy: |
| 299 | spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
| 300 | } |
| 301 | |
| 302 | struct postprocess_bh_ctx { |
| 303 | struct work_struct work; |
| 304 | struct buffer_head *bh; |
| 305 | }; |
| 306 | |
| 307 | static void verify_bh(struct work_struct *work) |
| 308 | { |
| 309 | struct postprocess_bh_ctx *ctx = |
| 310 | container_of(work, struct postprocess_bh_ctx, work); |
| 311 | struct buffer_head *bh = ctx->bh; |
| 312 | bool valid; |
| 313 | |
| 314 | valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
| 315 | end_buffer_async_read(bh, valid);
| 316 | kfree(ctx);
| 317 | } |
| 318 | |
| 319 | static bool need_fsverity(struct buffer_head *bh) |
| 320 | { |
| 321 | struct folio *folio = bh->b_folio; |
| 322 | struct inode *inode = folio->mapping->host; |
| 323 | |
| 324 | return fsverity_active(inode) && |
| 325 | /* needed by ext4 */ |
| 326 | folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); |
| 327 | } |
| 328 | |
| 329 | static void decrypt_bh(struct work_struct *work) |
| 330 | { |
| 331 | struct postprocess_bh_ctx *ctx = |
| 332 | container_of(work, struct postprocess_bh_ctx, work); |
| 333 | struct buffer_head *bh = ctx->bh; |
| 334 | int err; |
| 335 | |
| 336 | err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
| 337 | bh_offset(bh));
| 338 | if (err == 0 && need_fsverity(bh)) { |
| 339 | /* |
| 340 | * We use different work queues for decryption and for verity |
| 341 | * because verity may require reading metadata pages that need |
| 342 | * decryption, and we shouldn't recurse to the same workqueue. |
| 343 | */ |
| 344 | INIT_WORK(&ctx->work, verify_bh); |
| 345 | fsverity_enqueue_verify_work(&ctx->work);
| 346 | return; |
| 347 | } |
| 348 | end_buffer_async_read(bh, err == 0);
| 349 | kfree(ctx);
| 350 | } |
| 351 | |
| 352 | /* |
| 353 | * I/O completion handler for block_read_full_folio() - pages |
| 354 | * which come unlocked at the end of I/O. |
| 355 | */ |
| 356 | static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) |
| 357 | { |
| 358 | struct inode *inode = bh->b_folio->mapping->host; |
| 359 | bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode); |
| 360 | bool verify = need_fsverity(bh); |
| 361 | |
| 362 | /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */ |
| 363 | if (uptodate && (decrypt || verify)) { |
| 364 | struct postprocess_bh_ctx *ctx = |
| 365 | kmalloc(sizeof(*ctx), GFP_ATOMIC); |
| 366 | |
| 367 | if (ctx) { |
| 368 | ctx->bh = bh; |
| 369 | if (decrypt) { |
| 370 | INIT_WORK(&ctx->work, decrypt_bh); |
| 371 | fscrypt_enqueue_decrypt_work(&ctx->work);
| 372 | } else {
| 373 | INIT_WORK(&ctx->work, verify_bh);
| 374 | fsverity_enqueue_verify_work(&ctx->work);
| 375 | } |
| 376 | return; |
| 377 | } |
| 378 | uptodate = 0; |
| 379 | } |
| 380 | end_buffer_async_read(bh, uptodate); |
| 381 | } |
| 382 | |
| 383 | /* |
| 384 | * Completion handler for block_write_full_folio() - folios which are unlocked |
| 385 | * during I/O, and which have the writeback flag cleared upon I/O completion. |
| 386 | */ |
| 387 | static void end_buffer_async_write(struct buffer_head *bh, int uptodate) |
| 388 | { |
| 389 | unsigned long flags; |
| 390 | struct buffer_head *first; |
| 391 | struct buffer_head *tmp; |
| 392 | struct folio *folio; |
| 393 | |
| 394 | BUG_ON(!buffer_async_write(bh)); |
| 395 | |
| 396 | folio = bh->b_folio; |
| 397 | if (uptodate) { |
| 398 | set_buffer_uptodate(bh); |
| 399 | } else { |
| 400 | buffer_io_error(bh, ", lost async page write");
| 401 | mark_buffer_write_io_error(bh); |
| 402 | clear_buffer_uptodate(bh); |
| 403 | } |
| 404 | |
| 405 | first = folio_buffers(folio); |
| 406 | spin_lock_irqsave(&first->b_uptodate_lock, flags); |
| 407 | |
| 408 | clear_buffer_async_write(bh); |
| 409 | unlock_buffer(bh); |
| 410 | tmp = bh->b_this_page; |
| 411 | while (tmp != bh) { |
| 412 | if (buffer_async_write(tmp)) {
| 413 | BUG_ON(!buffer_locked(tmp)); |
| 414 | goto still_busy; |
| 415 | } |
| 416 | tmp = tmp->b_this_page; |
| 417 | } |
| 418 | spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
| 419 | folio_end_writeback(folio); |
| 420 | return; |
| 421 | |
| 422 | still_busy: |
| 423 | spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
| 424 | } |
| 425 | |
| 426 | /* |
| 427 | * If a page's buffers are under async read-in (end_buffer_async_read
| 428 | * completion) then there is a possibility that another thread of |
| 429 | * control could lock one of the buffers after it has completed |
| 430 | * but while some of the other buffers have not completed. This |
| 431 | * locked buffer would confuse end_buffer_async_read() into not unlocking |
| 432 | * the page. So the absence of BH_Async_Read tells end_buffer_async_read() |
| 433 | * that this buffer is not under async I/O. |
| 434 | * |
| 435 | * The page comes unlocked when it has no locked buffer_async buffers |
| 436 | * left. |
| 437 | * |
| 438 | * PageLocked prevents anyone starting new async I/O reads any of |
| 439 | * the buffers. |
| 440 | * |
| 441 | * PageWriteback is used to prevent simultaneous writeout of the same |
| 442 | * page. |
| 443 | * |
| 444 | * PageLocked prevents anyone from starting writeback of a page which is |
| 445 | * under read I/O (PageWriteback is only ever set against a locked page). |
| 446 | */ |
| 447 | static void mark_buffer_async_read(struct buffer_head *bh) |
| 448 | { |
| 449 | bh->b_end_io = end_buffer_async_read_io; |
| 450 | set_buffer_async_read(bh); |
| 451 | } |
| 452 | |
| 453 | static void mark_buffer_async_write_endio(struct buffer_head *bh, |
| 454 | bh_end_io_t *handler) |
| 455 | { |
| 456 | bh->b_end_io = handler; |
| 457 | set_buffer_async_write(bh); |
| 458 | } |
| 459 | |
| 460 | void mark_buffer_async_write(struct buffer_head *bh) |
| 461 | { |
| 462 | mark_buffer_async_write_endio(bh, end_buffer_async_write);
| 463 | } |
| 464 | EXPORT_SYMBOL(mark_buffer_async_write); |
| 465 | |
| 466 | |
| 467 | /* |
| 468 | * fs/buffer.c contains helper functions for buffer-backed address space's |
| 469 | * fsync functions. A common requirement for buffer-based filesystems is |
| 470 | * that certain data from the backing blockdev needs to be written out for |
| 471 | * a successful fsync(). For example, ext2 indirect blocks need to be |
| 472 | * written back and waited upon before fsync() returns. |
| 473 | * |
| 474 | * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), |
| 475 | * inode_has_buffers() and invalidate_inode_buffers() are provided for the |
| 476 | * management of a list of dependent buffers at ->i_mapping->i_private_list. |
| 477 | * |
| 478 | * Locking is a little subtle: try_to_free_buffers() will remove buffers |
| 479 | * from their controlling inode's queue when they are being freed. But |
| 480 | * try_to_free_buffers() will be operating against the *blockdev* mapping |
| 481 | * at the time, not against the S_ISREG file which depends on those buffers. |
| 482 | * So the locking for i_private_list is via the i_private_lock in the address_space |
| 483 | * which backs the buffers. Which is different from the address_space |
| 484 | * against which the buffers are listed. So for a particular address_space, |
| 485 | * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, |
| 486 | * mapping->i_private_list will always be protected by the backing blockdev's |
| 487 | * ->i_private_lock. |
| 488 | * |
| 489 | * Which introduces a requirement: all buffers on an address_space's |
| 490 | * ->i_private_list must be from the same address_space: the blockdev's. |
| 491 | * |
| 492 | * address_spaces which do not place buffers at ->i_private_list via these |
| 493 | * utility functions are free to use i_private_lock and i_private_list for |
| 494 | * whatever they want. The only requirement is that list_empty(i_private_list) |
| 495 | * be true at clear_inode() time. |
| 496 | * |
| 497 | * FIXME: clear_inode should not call invalidate_inode_buffers(). The |
| 498 | * filesystems should do that. invalidate_inode_buffers() should just go |
| 499 | * BUG_ON(!list_empty). |
| 500 | * |
| 501 | * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should |
| 502 | * take an address_space, not an inode. And it should be called |
| 503 | * mark_buffer_dirty_fsync() to clearly define why those buffers are being |
| 504 | * queued up. |
| 505 | * |
| 506 | * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the |
| 507 | * list if it is already on a list. Because if the buffer is on a list, |
| 508 | * it *must* already be on the right one. If not, the filesystem is being |
| 509 | * silly. This will save a ton of locking. But first we have to ensure |
| 510 | * that buffers are taken *off* the old inode's list when they are freed |
| 511 | * (presumably in truncate). That requires careful auditing of all |
| 512 | * filesystems (do it inside bforget()). It could also be done by bringing |
| 513 | * b_inode back. |
| 514 | */ |
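
/*
 * Purely illustrative sketch (not lifted from any real filesystem) of how the
 * helpers described above fit together: a simple block-based filesystem that
 * allocates a metadata block for a regular file, dirties it against the inode,
 * and later fsync()s it might do roughly the following. The names meta_block
 * and err are placeholders invented for this example.
 *
 *	struct buffer_head *bh = sb_getblk(inode->i_sb, meta_block);
 *	if (!bh)
 *		return -ENOMEM;
 *	... fill in the new metadata in bh->b_data ...
 *	mark_buffer_dirty_inode(bh, inode);
 *	brelse(bh);
 *
 *	// later, from the filesystem's ->fsync():
 *	err = sync_mapping_buffers(inode->i_mapping);
 *
 * The buffers end up on inode->i_mapping->i_private_list but are protected by
 * the blockdev mapping's i_private_lock, exactly as described above.
 */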
| 515 | |
| 516 | /* |
| 517 | * The buffer's backing address_space's i_private_lock must be held |
| 518 | */ |
| 519 | static void __remove_assoc_queue(struct buffer_head *bh) |
| 520 | { |
| 521 | list_del_init(&bh->b_assoc_buffers);
| 522 | WARN_ON(!bh->b_assoc_map); |
| 523 | bh->b_assoc_map = NULL; |
| 524 | } |
| 525 | |
| 526 | int inode_has_buffers(struct inode *inode) |
| 527 | { |
| 528 | return !list_empty(&inode->i_data.i_private_list);
| 529 | } |
| 530 | |
| 531 | /* |
| 532 | * osync is designed to support O_SYNC io. It waits synchronously for |
| 533 | * all already-submitted IO to complete, but does not queue any new |
| 534 | * writes to the disk. |
| 535 | * |
| 536 | * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer |
| 537 | * as you dirty the buffers, and then use osync_inode_buffers to wait for |
| 538 | * completion. Any other dirty buffers which are not yet queued for |
| 539 | * write will not be flushed to disk by the osync. |
| 540 | */ |
| 541 | static int osync_buffers_list(spinlock_t *lock, struct list_head *list) |
| 542 | { |
| 543 | struct buffer_head *bh; |
| 544 | struct list_head *p; |
| 545 | int err = 0; |
| 546 | |
| 547 | spin_lock(lock); |
| 548 | repeat: |
| 549 | list_for_each_prev(p, list) { |
| 550 | bh = BH_ENTRY(p); |
| 551 | if (buffer_locked(bh)) { |
| 552 | get_bh(bh); |
| 553 | spin_unlock(lock); |
| 554 | wait_on_buffer(bh); |
| 555 | if (!buffer_uptodate(bh)) |
| 556 | err = -EIO; |
| 557 | brelse(bh); |
| 558 | spin_lock(lock); |
| 559 | goto repeat; |
| 560 | } |
| 561 | } |
| 562 | spin_unlock(lock); |
| 563 | return err; |
| 564 | } |
| 565 | |
| 566 | /** |
| 567 | * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers |
| 568 | * @mapping: the mapping which wants those buffers written |
| 569 | * |
| 570 | * Starts I/O against the buffers at mapping->i_private_list, and waits upon |
| 571 | * that I/O. |
| 572 | * |
| 573 | * Basically, this is a convenience function for fsync(). |
| 574 | * @mapping is a file or directory which needs those buffers to be written for |
| 575 | * a successful fsync(). |
| 576 | */ |
| 577 | int sync_mapping_buffers(struct address_space *mapping) |
| 578 | { |
| 579 | struct address_space *buffer_mapping = mapping->i_private_data; |
| 580 | |
| 581 | if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
| 582 | return 0;
| 583 | 
| 584 | return fsync_buffers_list(&buffer_mapping->i_private_lock,
| 585 | &mapping->i_private_list);
| 586 | } |
| 587 | EXPORT_SYMBOL(sync_mapping_buffers); |
| 588 | |
| 589 | /** |
| 590 | * generic_buffers_fsync_noflush - generic buffer fsync implementation |
| 591 | * for simple filesystems with no inode lock |
| 592 | * |
| 593 | * @file: file to synchronize |
| 594 | * @start: start offset in bytes |
| 595 | * @end: end offset in bytes (inclusive) |
| 596 | * @datasync: only synchronize essential metadata if true |
| 597 | * |
| 598 | * This is a generic implementation of the fsync method for simple |
| 599 | * filesystems which track all non-inode metadata in the buffers list |
| 600 | * hanging off the address_space structure. |
| 601 | */ |
| 602 | int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, |
| 603 | bool datasync) |
| 604 | { |
| 605 | struct inode *inode = file->f_mapping->host; |
| 606 | int err; |
| 607 | int ret; |
| 608 | |
| 609 | err = file_write_and_wait_range(file, start, end); |
| 610 | if (err) |
| 611 | return err; |
| 612 | |
| 613 | ret = sync_mapping_buffers(inode->i_mapping); |
| 614 | if (!(inode->i_state & I_DIRTY_ALL)) |
| 615 | goto out; |
| 616 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) |
| 617 | goto out; |
| 618 | |
| 619 | err = sync_inode_metadata(inode, 1);
| 620 | if (ret == 0) |
| 621 | ret = err; |
| 622 | |
| 623 | out: |
| 624 | /* check and advance again to catch errors after syncing out buffers */ |
| 625 | err = file_check_and_advance_wb_err(file); |
| 626 | if (ret == 0) |
| 627 | ret = err; |
| 628 | return ret; |
| 629 | } |
| 630 | EXPORT_SYMBOL(generic_buffers_fsync_noflush); |
| 631 | |
| 632 | /** |
| 633 | * generic_buffers_fsync - generic buffer fsync implementation |
| 634 | * for simple filesystems with no inode lock |
| 635 | * |
| 636 | * @file: file to synchronize |
| 637 | * @start: start offset in bytes |
| 638 | * @end: end offset in bytes (inclusive) |
| 639 | * @datasync: only synchronize essential metadata if true |
| 640 | * |
| 641 | * This is a generic implementation of the fsync method for simple |
| 642 | * filesystems which track all non-inode metadata in the buffers list |
| 643 | * hanging off the address_space structure. This also makes sure that |
| 644 | * a device cache flush operation is called at the end. |
| 645 | */ |
| 646 | int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, |
| 647 | bool datasync) |
| 648 | { |
| 649 | struct inode *inode = file->f_mapping->host; |
| 650 | int ret; |
| 651 | |
| 652 | ret = generic_buffers_fsync_noflush(file, start, end, datasync); |
| 653 | if (!ret) |
| 654 | ret = blkdev_issue_flush(inode->i_sb->s_bdev);
| 655 | return ret; |
| 656 | } |
| 657 | EXPORT_SYMBOL(generic_buffers_fsync); |
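
/*
 * For illustration only: a minimal filesystem that keeps all of its non-inode
 * metadata on the i_private_list could implement its ->fsync() as little more
 * than a wrapper around the helper above. "examplefs" is a made-up name; real
 * filesystems typically build on these helpers from inside a more elaborate
 * ->fsync() of their own.
 *
 *	static int examplefs_fsync(struct file *file, loff_t start,
 *				   loff_t end, int datasync)
 *	{
 *		return generic_buffers_fsync(file, start, end, datasync);
 *	}
 */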
| 658 | |
| 659 | /* |
| 660 | * Called when we've recently written block `bblock', and it is known that |
| 661 | * `bblock' was for a buffer_boundary() buffer. This means that the block at |
| 662 | * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's |
| 663 | * dirty, schedule it for IO, so that indirects merge nicely with their data.
| 664 | */ |
| 665 | void write_boundary_block(struct block_device *bdev, |
| 666 | sector_t bblock, unsigned blocksize) |
| 667 | { |
| 668 | struct buffer_head *bh; |
| 669 | |
| 670 | bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
| 671 | if (bh) {
| 672 | if (buffer_dirty(bh))
| 673 | write_dirty_buffer(bh, 0);
| 674 | put_bh(bh); |
| 675 | } |
| 676 | } |
| 677 | |
| 678 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) |
| 679 | { |
| 680 | struct address_space *mapping = inode->i_mapping; |
| 681 | struct address_space *buffer_mapping = bh->b_folio->mapping; |
| 682 | |
| 683 | mark_buffer_dirty(bh); |
| 684 | if (!mapping->i_private_data) { |
| 685 | mapping->i_private_data = buffer_mapping; |
| 686 | } else { |
| 687 | BUG_ON(mapping->i_private_data != buffer_mapping); |
| 688 | } |
| 689 | if (!bh->b_assoc_map) { |
| 690 | spin_lock(&buffer_mapping->i_private_lock);
| 691 | list_move_tail(&bh->b_assoc_buffers,
| 692 | &mapping->i_private_list);
| 693 | bh->b_assoc_map = mapping;
| 694 | spin_unlock(&buffer_mapping->i_private_lock);
| 695 | } |
| 696 | } |
| 697 | EXPORT_SYMBOL(mark_buffer_dirty_inode); |
| 698 | |
| 699 | /** |
| 700 | * block_dirty_folio - Mark a folio as dirty. |
| 701 | * @mapping: The address space containing this folio. |
| 702 | * @folio: The folio to mark dirty. |
| 703 | * |
| 704 | * Filesystems which use buffer_heads can use this function as their |
| 705 | * ->dirty_folio implementation. Some filesystems need to do a little |
| 706 | * work before calling this function. Filesystems which do not use |
| 707 | * buffer_heads should call filemap_dirty_folio() instead. |
| 708 | * |
| 709 | * If the folio has buffers, the uptodate buffers are set dirty, to |
| 710 | * preserve dirty-state coherency between the folio and the buffers. |
| 711 | * Buffers added to a dirty folio are created dirty. |
| 712 | * |
| 713 | * The buffers are dirtied before the folio is dirtied. There's a small |
| 714 | * race window in which writeback may see the folio cleanness but not the |
| 715 | * buffer dirtiness. That's fine. If this code were to set the folio |
| 716 | * dirty before the buffers, writeback could clear the folio dirty flag, |
| 717 | * see a bunch of clean buffers and we'd end up with dirty buffers/clean |
| 718 | * folio on the dirty folio list. |
| 719 | * |
| 720 | * We use i_private_lock to lock against try_to_free_buffers() while |
| 721 | * using the folio's buffer list. This also prevents clean buffers |
| 722 | * being added to the folio after it was set dirty. |
| 723 | * |
| 724 | * Context: May only be called from process context. Does not sleep. |
| 725 | * Caller must ensure that @folio cannot be truncated during this call, |
| 726 | * typically by holding the folio lock or having a page in the folio |
| 727 | * mapped and holding the page table lock. |
| 728 | * |
| 729 | * Return: True if the folio was dirtied; false if it was already dirtied. |
| 730 | */ |
| 731 | bool block_dirty_folio(struct address_space *mapping, struct folio *folio) |
| 732 | { |
| 733 | struct buffer_head *head; |
| 734 | bool newly_dirty; |
| 735 | |
| 736 | spin_lock(&mapping->i_private_lock);
| 737 | head = folio_buffers(folio); |
| 738 | if (head) { |
| 739 | struct buffer_head *bh = head; |
| 740 | |
| 741 | do { |
| 742 | set_buffer_dirty(bh); |
| 743 | bh = bh->b_this_page; |
| 744 | } while (bh != head); |
| 745 | } |
| 746 | /* |
| 747 | * Lock out page's memcg migration to keep PageDirty |
| 748 | * synchronized with per-memcg dirty page counters. |
| 749 | */ |
| 750 | newly_dirty = !folio_test_set_dirty(folio); |
| 751 | spin_unlock(&mapping->i_private_lock);
| 752 | 
| 753 | if (newly_dirty)
| 754 | __folio_mark_dirty(folio, mapping, 1);
| 755 | |
| 756 | if (newly_dirty) |
| 757 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
| 758 | |
| 759 | return newly_dirty; |
| 760 | } |
| 761 | EXPORT_SYMBOL(block_dirty_folio); |
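
/*
 * A hypothetical buffer_head-based filesystem would typically just plug this
 * in as its ->dirty_folio. The struct name and the elided entries below are
 * invented for illustration; block_invalidate_folio() is defined further down
 * in this file.
 *
 *	const struct address_space_operations examplefs_aops = {
 *		.dirty_folio		= block_dirty_folio,
 *		.invalidate_folio	= block_invalidate_folio,
 *		...
 *	};
 */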
| 762 | |
| 763 | /* |
| 764 | * Write out and wait upon a list of buffers. |
| 765 | * |
| 766 | * We have conflicting pressures: we want to make sure that all |
| 767 | * initially dirty buffers get waited on, but that any subsequently |
| 768 | * dirtied buffers don't. After all, we don't want fsync to last |
| 769 | * forever if somebody is actively writing to the file. |
| 770 | * |
| 771 | * Do this in two main stages: first we copy dirty buffers to a |
| 772 | * temporary inode list, queueing the writes as we go. Then we clean |
| 773 | * up, waiting for those writes to complete. |
| 774 | * |
| 775 | * During this second stage, any subsequent updates to the file may end |
| 776 | * up refiling the buffer on the original inode's dirty list again, so |
| 777 | * there is a chance we will end up with a buffer queued for write but |
| 778 | * not yet completed on that list. So, as a final cleanup we go through |
| 779 | * the osync code to catch these locked, dirty buffers without requeuing |
| 780 | * any newly dirty buffers for write. |
| 781 | */ |
| 782 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) |
| 783 | { |
| 784 | struct buffer_head *bh; |
| 785 | struct address_space *mapping; |
| 786 | int err = 0, err2; |
| 787 | struct blk_plug plug; |
| 788 | LIST_HEAD(tmp); |
| 789 | |
| 790 | blk_start_plug(&plug); |
| 791 | |
| 792 | spin_lock(lock); |
| 793 | while (!list_empty(list)) {
| 794 | bh = BH_ENTRY(list->next); |
| 795 | mapping = bh->b_assoc_map; |
| 796 | __remove_assoc_queue(bh); |
| 797 | /* Avoid race with mark_buffer_dirty_inode() which does |
| 798 | * a lockless check and we rely on seeing the dirty bit */ |
| 799 | smp_mb(); |
| 800 | if (buffer_dirty(bh) || buffer_locked(bh)) { |
| 801 | list_add(&bh->b_assoc_buffers, &tmp);
| 802 | bh->b_assoc_map = mapping; |
| 803 | if (buffer_dirty(bh)) { |
| 804 | get_bh(bh); |
| 805 | spin_unlock(lock); |
| 806 | /* |
| 807 | * Ensure any pending I/O completes so that |
| 808 | * write_dirty_buffer() actually writes the |
| 809 | * current contents - it is a noop if I/O is |
| 810 | * still in flight on potentially older |
| 811 | * contents. |
| 812 | */ |
| 813 | write_dirty_buffer(bh, REQ_SYNC); |
| 814 | |
| 815 | /* |
| 816 | * Kick off IO for the previous mapping. Note |
| 817 | * that we will not run the very last mapping, |
| 818 | * wait_on_buffer() will do that for us |
| 819 | * through sync_buffer(). |
| 820 | */ |
| 821 | brelse(bh); |
| 822 | spin_lock(lock); |
| 823 | } |
| 824 | } |
| 825 | } |
| 826 | |
| 827 | spin_unlock(lock); |
| 828 | blk_finish_plug(&plug); |
| 829 | spin_lock(lock); |
| 830 | |
| 831 | while (!list_empty(&tmp)) {
| 832 | bh = BH_ENTRY(tmp.prev); |
| 833 | get_bh(bh); |
| 834 | mapping = bh->b_assoc_map; |
| 835 | __remove_assoc_queue(bh); |
| 836 | /* Avoid race with mark_buffer_dirty_inode() which does |
| 837 | * a lockless check and we rely on seeing the dirty bit */ |
| 838 | smp_mb(); |
| 839 | if (buffer_dirty(bh)) { |
| 840 | list_add(&bh->b_assoc_buffers,
| 841 | &mapping->i_private_list);
| 842 | bh->b_assoc_map = mapping; |
| 843 | } |
| 844 | spin_unlock(lock); |
| 845 | wait_on_buffer(bh); |
| 846 | if (!buffer_uptodate(bh)) |
| 847 | err = -EIO; |
| 848 | brelse(bh); |
| 849 | spin_lock(lock); |
| 850 | } |
| 851 | |
| 852 | spin_unlock(lock); |
| 853 | err2 = osync_buffers_list(lock, list); |
| 854 | if (err) |
| 855 | return err; |
| 856 | else |
| 857 | return err2; |
| 858 | } |
| 859 | |
| 860 | /* |
| 861 | * Invalidate any and all dirty buffers on a given inode. We are |
| 862 | * probably unmounting the fs, but that doesn't mean we have already |
| 863 | * done a sync(). Just drop the buffers from the inode list. |
| 864 | * |
| 865 | * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which |
| 866 | * assumes that all the buffers are against the blockdev. |
| 867 | */ |
| 868 | void invalidate_inode_buffers(struct inode *inode) |
| 869 | { |
| 870 | if (inode_has_buffers(inode)) { |
| 871 | struct address_space *mapping = &inode->i_data; |
| 872 | struct list_head *list = &mapping->i_private_list; |
| 873 | struct address_space *buffer_mapping = mapping->i_private_data; |
| 874 | |
| 875 | spin_lock(&buffer_mapping->i_private_lock);
| 876 | while (!list_empty(list))
| 877 | __remove_assoc_queue(BH_ENTRY(list->next));
| 878 | spin_unlock(&buffer_mapping->i_private_lock);
| 879 | } |
| 880 | } |
| 881 | EXPORT_SYMBOL(invalidate_inode_buffers); |
| 882 | |
| 883 | /* |
| 884 | * Remove any clean buffers from the inode's buffer list. This is called |
| 885 | * when we're trying to free the inode itself. Those buffers can pin it. |
| 886 | * |
| 887 | * Returns true if all buffers were removed. |
| 888 | */ |
| 889 | int remove_inode_buffers(struct inode *inode) |
| 890 | { |
| 891 | int ret = 1; |
| 892 | |
| 893 | if (inode_has_buffers(inode)) { |
| 894 | struct address_space *mapping = &inode->i_data; |
| 895 | struct list_head *list = &mapping->i_private_list; |
| 896 | struct address_space *buffer_mapping = mapping->i_private_data; |
| 897 | |
| 898 | spin_lock(&buffer_mapping->i_private_lock);
| 899 | while (!list_empty(list)) {
| 900 | struct buffer_head *bh = BH_ENTRY(list->next); |
| 901 | if (buffer_dirty(bh)) { |
| 902 | ret = 0; |
| 903 | break; |
| 904 | } |
| 905 | __remove_assoc_queue(bh); |
| 906 | } |
| 907 | spin_unlock(&buffer_mapping->i_private_lock);
| 908 | } |
| 909 | return ret; |
| 910 | } |
| 911 | |
| 912 | /* |
| 913 | * Create the appropriate buffers when given a folio for data area and |
| 914 | * the size of each buffer. Use the bh->b_this_page linked list to
| 915 | * follow the buffers created. Return NULL if unable to create more |
| 916 | * buffers. |
| 917 | * |
| 918 | * The retry flag is used to differentiate async IO (paging, swapping) |
| 919 | * which may not fail from ordinary buffer allocations. |
| 920 | */ |
| 921 | struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, |
| 922 | gfp_t gfp) |
| 923 | { |
| 924 | struct buffer_head *bh, *head; |
| 925 | long offset; |
| 926 | struct mem_cgroup *memcg, *old_memcg; |
| 927 | |
| 928 | /* The folio lock pins the memcg */ |
| 929 | memcg = folio_memcg(folio); |
| 930 | old_memcg = set_active_memcg(memcg); |
| 931 | |
| 932 | head = NULL; |
| 933 | offset = folio_size(folio); |
| 934 | while ((offset -= size) >= 0) { |
| 935 | bh = alloc_buffer_head(gfp);
| 936 | if (!bh) |
| 937 | goto no_grow; |
| 938 | |
| 939 | bh->b_this_page = head; |
| 940 | bh->b_blocknr = -1; |
| 941 | head = bh; |
| 942 | |
| 943 | bh->b_size = size; |
| 944 | |
| 945 | /* Link the buffer to its folio */ |
| 946 | folio_set_bh(bh, folio, offset); |
| 947 | } |
| 948 | out: |
| 949 | set_active_memcg(old_memcg); |
| 950 | return head; |
| 951 | /* |
| 952 | * In case anything failed, we just free everything we got. |
| 953 | */ |
| 954 | no_grow: |
| 955 | if (head) { |
| 956 | do { |
| 957 | bh = head; |
| 958 | head = head->b_this_page; |
| 959 | free_buffer_head(bh); |
| 960 | } while (head); |
| 961 | } |
| 962 | |
| 963 | goto out; |
| 964 | } |
| 965 | EXPORT_SYMBOL_GPL(folio_alloc_buffers); |
| 966 | |
| 967 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size) |
| 968 | { |
| 969 | gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; |
| 970 | |
| 971 | return folio_alloc_buffers(page_folio(page), size, gfp); |
| 972 | } |
| 973 | EXPORT_SYMBOL_GPL(alloc_page_buffers); |
| 974 | |
| 975 | static inline void link_dev_buffers(struct folio *folio, |
| 976 | struct buffer_head *head) |
| 977 | { |
| 978 | struct buffer_head *bh, *tail; |
| 979 | |
| 980 | bh = head; |
| 981 | do { |
| 982 | tail = bh; |
| 983 | bh = bh->b_this_page; |
| 984 | } while (bh); |
| 985 | tail->b_this_page = head; |
| 986 | folio_attach_private(folio, head);
| 987 | } |
| 988 | |
| 989 | static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) |
| 990 | { |
| 991 | sector_t retval = ~((sector_t)0); |
| 992 | loff_t sz = bdev_nr_bytes(bdev); |
| 993 | |
| 994 | if (sz) { |
| 995 | unsigned int sizebits = blksize_bits(size); |
| 996 | retval = (sz >> sizebits); |
| 997 | } |
| 998 | return retval; |
| 999 | } |
| 1000 | |
| 1001 | /* |
| 1002 | * Initialise the state of a blockdev folio's buffers. |
| 1003 | */ |
| 1004 | static sector_t folio_init_buffers(struct folio *folio, |
| 1005 | struct block_device *bdev, unsigned size) |
| 1006 | { |
| 1007 | struct buffer_head *head = folio_buffers(folio); |
| 1008 | struct buffer_head *bh = head; |
| 1009 | bool uptodate = folio_test_uptodate(folio); |
| 1010 | sector_t block = div_u64(folio_pos(folio), size);
| 1011 | sector_t end_block = blkdev_max_block(bdev, size); |
| 1012 | |
| 1013 | do { |
| 1014 | if (!buffer_mapped(bh)) { |
| 1015 | bh->b_end_io = NULL; |
| 1016 | bh->b_private = NULL; |
| 1017 | bh->b_bdev = bdev; |
| 1018 | bh->b_blocknr = block; |
| 1019 | if (uptodate) |
| 1020 | set_buffer_uptodate(bh); |
| 1021 | if (block < end_block) |
| 1022 | set_buffer_mapped(bh); |
| 1023 | } |
| 1024 | block++; |
| 1025 | bh = bh->b_this_page; |
| 1026 | } while (bh != head); |
| 1027 | |
| 1028 | /* |
| 1029 | * Caller needs to validate requested block against end of device. |
| 1030 | */ |
| 1031 | return end_block; |
| 1032 | } |
| 1033 | |
| 1034 | /* |
| 1035 | * Create the page-cache folio that contains the requested block. |
| 1036 | * |
| 1037 | * This is used purely for blockdev mappings. |
| 1038 | * |
| 1039 | * Returns false if we have a failure which cannot be cured by retrying |
| 1040 | * without sleeping. Returns true if we succeeded, or the caller should retry. |
| 1041 | */ |
| 1042 | static bool grow_dev_folio(struct block_device *bdev, sector_t block, |
| 1043 | pgoff_t index, unsigned size, gfp_t gfp) |
| 1044 | { |
| 1045 | struct address_space *mapping = bdev->bd_mapping; |
| 1046 | struct folio *folio; |
| 1047 | struct buffer_head *bh; |
| 1048 | sector_t end_block = 0; |
| 1049 | |
| 1050 | folio = __filemap_get_folio(mapping, index, |
| 1051 | FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); |
| 1052 | if (IS_ERR(folio))
| 1053 | return false; |
| 1054 | |
| 1055 | bh = folio_buffers(folio); |
| 1056 | if (bh) { |
| 1057 | if (bh->b_size == size) { |
| 1058 | end_block = folio_init_buffers(folio, bdev, size); |
| 1059 | goto unlock; |
| 1060 | } |
| 1061 | |
| 1062 | /* |
| 1063 | * Retrying may succeed; for example the folio may finish |
| 1064 | * writeback, or buffers may be cleaned. This should not |
| 1065 | * happen very often; maybe we have old buffers attached to |
| 1066 | * this blockdev's page cache and we're trying to change |
| 1067 | * the block size? |
| 1068 | */ |
| 1069 | if (!try_to_free_buffers(folio)) { |
| 1070 | end_block = ~0ULL; |
| 1071 | goto unlock; |
| 1072 | } |
| 1073 | } |
| 1074 | |
| 1075 | bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); |
| 1076 | if (!bh) |
| 1077 | goto unlock; |
| 1078 | |
| 1079 | /* |
| 1080 | * Link the folio to the buffers and initialise them. Take the |
| 1081 | * lock to be atomic wrt __find_get_block(), which does not |
| 1082 | * run under the folio lock. |
| 1083 | */ |
| 1084 | spin_lock(&mapping->i_private_lock);
| 1085 | link_dev_buffers(folio, bh);
| 1086 | end_block = folio_init_buffers(folio, bdev, size);
| 1087 | spin_unlock(&mapping->i_private_lock);
| 1088 | unlock: |
| 1089 | folio_unlock(folio); |
| 1090 | folio_put(folio); |
| 1091 | return block < end_block; |
| 1092 | } |
| 1093 | |
| 1094 | /* |
| 1095 | * Create buffers for the specified block device block's folio. If |
| 1096 | * that folio was dirty, the buffers are set dirty also. Returns false |
| 1097 | * if we've hit a permanent error. |
| 1098 | */ |
| 1099 | static bool grow_buffers(struct block_device *bdev, sector_t block, |
| 1100 | unsigned size, gfp_t gfp) |
| 1101 | { |
| 1102 | loff_t pos; |
| 1103 | |
| 1104 | /* |
| 1105 | * Check for a block which lies outside our maximum possible |
| 1106 | * pagecache index. |
| 1107 | */ |
| 1108 | if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) { |
| 1109 | printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n" , |
| 1110 | __func__, (unsigned long long)block, |
| 1111 | bdev); |
| 1112 | return false; |
| 1113 | } |
| 1114 | |
| 1115 | /* Create a folio with the proper size buffers */ |
| 1116 | return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
| 1117 | } |
| 1118 | |
| 1119 | static struct buffer_head * |
| 1120 | __getblk_slow(struct block_device *bdev, sector_t block, |
| 1121 | unsigned size, gfp_t gfp) |
| 1122 | { |
| 1123 | bool blocking = gfpflags_allow_blocking(gfp);
| 1124 | |
| 1125 | if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) { |
| 1126 | printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n" , |
| 1127 | size, bdev_logical_block_size(bdev)); |
| 1128 | return NULL; |
| 1129 | } |
| 1130 | |
| 1131 | for (;;) { |
| 1132 | struct buffer_head *bh; |
| 1133 | |
| 1134 | if (!grow_buffers(bdev, block, size, gfp)) |
| 1135 | return NULL; |
| 1136 | |
| 1137 | if (blocking) |
| 1138 | bh = __find_get_block_nonatomic(bdev, block, size); |
| 1139 | else |
| 1140 | bh = __find_get_block(bdev, block, size); |
| 1141 | if (bh) |
| 1142 | return bh; |
| 1143 | } |
| 1144 | } |
| 1145 | |
| 1146 | /* |
| 1147 | * The relationship between dirty buffers and dirty pages: |
| 1148 | * |
| 1149 | * Whenever a page has any dirty buffers, the page's dirty bit is set, and |
| 1150 | * the page is tagged dirty in the page cache. |
| 1151 | * |
| 1152 | * At all times, the dirtiness of the buffers represents the dirtiness of |
| 1153 | * subsections of the page. If the page has buffers, the page dirty bit is |
| 1154 | * merely a hint about the true dirty state. |
| 1155 | * |
| 1156 | * When a page is set dirty in its entirety, all its buffers are marked dirty |
| 1157 | * (if the page has buffers). |
| 1158 | * |
| 1159 | * When a buffer is marked dirty, its page is dirtied, but the page's other |
| 1160 | * buffers are not. |
| 1161 | * |
| 1162 | * Also. When blockdev buffers are explicitly read with bread(), they |
| 1163 | * individually become uptodate. But their backing page remains not |
| 1164 | * uptodate - even if all of its buffers are uptodate. A subsequent |
| 1165 | * block_read_full_folio() against that folio will discover all the uptodate |
| 1166 | * buffers, will set the folio uptodate and will perform no I/O. |
| 1167 | */ |
| 1168 | |
| 1169 | /** |
| 1170 | * mark_buffer_dirty - mark a buffer_head as needing writeout |
| 1171 | * @bh: the buffer_head to mark dirty |
| 1172 | * |
| 1173 | * mark_buffer_dirty() will set the dirty bit against the buffer, then set |
| 1174 | * its backing page dirty, then tag the page as dirty in the page cache |
| 1175 | * and then attach the address_space's inode to its superblock's dirty |
| 1176 | * inode list. |
| 1177 | * |
| 1178 | * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, |
| 1179 | * i_pages lock and mapping->host->i_lock. |
| 1180 | */ |
| 1181 | void mark_buffer_dirty(struct buffer_head *bh) |
| 1182 | { |
| 1183 | WARN_ON_ONCE(!buffer_uptodate(bh)); |
| 1184 | |
| 1185 | trace_block_dirty_buffer(bh); |
| 1186 | |
| 1187 | /* |
| 1188 | * Very *carefully* optimize the it-is-already-dirty case. |
| 1189 | * |
| 1190 | * Don't let the final "is it dirty" escape to before we |
| 1191 | * perhaps modified the buffer. |
| 1192 | */ |
| 1193 | if (buffer_dirty(bh)) { |
| 1194 | smp_mb(); |
| 1195 | if (buffer_dirty(bh)) |
| 1196 | return; |
| 1197 | } |
| 1198 | |
| 1199 | if (!test_set_buffer_dirty(bh)) { |
| 1200 | struct folio *folio = bh->b_folio; |
| 1201 | struct address_space *mapping = NULL; |
| 1202 | |
| 1203 | if (!folio_test_set_dirty(folio)) { |
| 1204 | mapping = folio->mapping; |
| 1205 | if (mapping) |
| 1206 | __folio_mark_dirty(folio, mapping, 0);
| 1207 | } |
| 1208 | if (mapping) |
| 1209 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
| 1210 | } |
| 1211 | } |
| 1212 | EXPORT_SYMBOL(mark_buffer_dirty); |
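
/*
 * Typical (illustrative) metadata-update pattern built on mark_buffer_dirty();
 * "off", "src" and "len" are placeholders, and serialising against concurrent
 * users of the block is the caller's problem:
 *
 *	memcpy(bh->b_data + off, src, len);
 *	mark_buffer_dirty(bh);
 *	// or, when the update must reach the disk before continuing:
 *	// err = sync_dirty_buffer(bh);
 */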
| 1213 | |
| 1214 | void mark_buffer_write_io_error(struct buffer_head *bh) |
| 1215 | { |
| 1216 | set_buffer_write_io_error(bh); |
| 1217 | /* FIXME: do we need to set this in both places? */ |
| 1218 | if (bh->b_folio && bh->b_folio->mapping) |
| 1219 | mapping_set_error(bh->b_folio->mapping, -EIO);
| 1220 | if (bh->b_assoc_map)
| 1221 | mapping_set_error(bh->b_assoc_map, -EIO);
| 1222 | } |
| 1223 | EXPORT_SYMBOL(mark_buffer_write_io_error); |
| 1224 | |
| 1225 | /** |
| 1226 | * __brelse - Release a buffer. |
| 1227 | * @bh: The buffer to release. |
| 1228 | * |
| 1229 | * This variant of brelse() can be called if @bh is guaranteed to not be NULL. |
| 1230 | */ |
| 1231 | void __brelse(struct buffer_head *bh) |
| 1232 | { |
| 1233 | if (atomic_read(&bh->b_count)) {
| 1234 | put_bh(bh); |
| 1235 | return; |
| 1236 | } |
| 1237 | WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
| 1238 | } |
| 1239 | EXPORT_SYMBOL(__brelse); |
| 1240 | |
| 1241 | /** |
| 1242 | * __bforget - Discard any dirty data in a buffer. |
| 1243 | * @bh: The buffer to forget. |
| 1244 | * |
| 1245 | * This variant of bforget() can be called if @bh is guaranteed to not |
| 1246 | * be NULL. |
| 1247 | */ |
| 1248 | void __bforget(struct buffer_head *bh) |
| 1249 | { |
| 1250 | clear_buffer_dirty(bh); |
| 1251 | if (bh->b_assoc_map) { |
| 1252 | struct address_space *buffer_mapping = bh->b_folio->mapping; |
| 1253 | |
| 1254 | spin_lock(&buffer_mapping->i_private_lock);
| 1255 | list_del_init(&bh->b_assoc_buffers);
| 1256 | bh->b_assoc_map = NULL;
| 1257 | spin_unlock(&buffer_mapping->i_private_lock);
| 1258 | } |
| 1259 | __brelse(bh); |
| 1260 | } |
| 1261 | EXPORT_SYMBOL(__bforget); |
| 1262 | |
| 1263 | static struct buffer_head *__bread_slow(struct buffer_head *bh) |
| 1264 | { |
| 1265 | lock_buffer(bh); |
| 1266 | if (buffer_uptodate(bh)) { |
| 1267 | unlock_buffer(bh); |
| 1268 | return bh; |
| 1269 | } else { |
| 1270 | get_bh(bh); |
| 1271 | bh->b_end_io = end_buffer_read_sync; |
| 1272 | submit_bh(REQ_OP_READ, bh); |
| 1273 | wait_on_buffer(bh); |
| 1274 | if (buffer_uptodate(bh)) |
| 1275 | return bh; |
| 1276 | } |
| 1277 | brelse(bh); |
| 1278 | return NULL; |
| 1279 | } |
| 1280 | |
| 1281 | /* |
| 1282 | * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). |
| 1283 | * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their |
| 1284 | * refcount elevated by one when they're in an LRU. A buffer can only appear |
| 1285 | * once in a particular CPU's LRU. A single buffer can be present in multiple |
| 1286 | * CPU's LRUs at the same time. |
| 1287 | * |
| 1288 | * This is a transparent caching front-end to sb_bread(), sb_getblk() and |
| 1289 | * sb_find_get_block(). |
| 1290 | * |
| 1291 | * The LRUs themselves only need locking against invalidate_bh_lrus. We use |
| 1292 | * a local interrupt disable for that. |
| 1293 | */ |
| 1294 | |
| 1295 | #define BH_LRU_SIZE 16 |
| 1296 | |
| 1297 | struct bh_lru { |
| 1298 | struct buffer_head *bhs[BH_LRU_SIZE]; |
| 1299 | }; |
| 1300 | |
| 1301 | static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; |
| 1302 | |
| 1303 | #ifdef CONFIG_SMP |
| 1304 | #define bh_lru_lock() local_irq_disable() |
| 1305 | #define bh_lru_unlock() local_irq_enable() |
| 1306 | #else |
| 1307 | #define bh_lru_lock() preempt_disable() |
| 1308 | #define bh_lru_unlock() preempt_enable() |
| 1309 | #endif |
| 1310 | |
| 1311 | static inline void check_irqs_on(void) |
| 1312 | { |
| 1313 | #ifdef irqs_disabled |
| 1314 | BUG_ON(irqs_disabled()); |
| 1315 | #endif |
| 1316 | } |
| 1317 | |
| 1318 | /* |
| 1319 | * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is |
| 1320 | * inserted at the front, and the buffer_head at the back if any is evicted. |
| 1321 | * Or, if already in the LRU it is moved to the front. |
| 1322 | */ |
| 1323 | static void bh_lru_install(struct buffer_head *bh) |
| 1324 | { |
| 1325 | struct buffer_head *evictee = bh; |
| 1326 | struct bh_lru *b; |
| 1327 | int i; |
| 1328 | |
| 1329 | check_irqs_on(); |
| 1330 | bh_lru_lock(); |
| 1331 | |
| 1332 | /* |
| 1333 | * The reference that a buffer_head holds while it sits in a bh_lru
| 1334 | * prevents its folio from being dropped (try_to_free_buffers fails),
| 1335 | * which in turn can make page migration fail.
| 1336 | * Skip putting the upcoming bh into the bh_lru until migration is done.
| 1337 | */ |
| 1338 | if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) { |
| 1339 | bh_lru_unlock(); |
| 1340 | return; |
| 1341 | } |
| 1342 | |
| 1343 | b = this_cpu_ptr(&bh_lrus); |
| 1344 | for (i = 0; i < BH_LRU_SIZE; i++) { |
| 1345 | swap(evictee, b->bhs[i]); |
| 1346 | if (evictee == bh) { |
| 1347 | bh_lru_unlock(); |
| 1348 | return; |
| 1349 | } |
| 1350 | } |
| 1351 | |
| 1352 | get_bh(bh); |
| 1353 | bh_lru_unlock(); |
| 1354 | brelse(evictee);
| 1355 | } |
| 1356 | |
| 1357 | /* |
| 1358 | * Look up the bh in this cpu's LRU. If it's there, move it to the head. |
| 1359 | */ |
| 1360 | static struct buffer_head * |
| 1361 | lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) |
| 1362 | { |
| 1363 | struct buffer_head *ret = NULL; |
| 1364 | unsigned int i; |
| 1365 | |
| 1366 | check_irqs_on(); |
| 1367 | bh_lru_lock(); |
| 1368 | if (cpu_is_isolated(smp_processor_id())) { |
| 1369 | bh_lru_unlock(); |
| 1370 | return NULL; |
| 1371 | } |
| 1372 | for (i = 0; i < BH_LRU_SIZE; i++) { |
| 1373 | struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); |
| 1374 | |
| 1375 | if (bh && bh->b_blocknr == block && bh->b_bdev == bdev && |
| 1376 | bh->b_size == size) { |
| 1377 | if (i) { |
| 1378 | while (i) { |
| 1379 | __this_cpu_write(bh_lrus.bhs[i], |
| 1380 | __this_cpu_read(bh_lrus.bhs[i - 1])); |
| 1381 | i--; |
| 1382 | } |
| 1383 | __this_cpu_write(bh_lrus.bhs[0], bh); |
| 1384 | } |
| 1385 | get_bh(bh); |
| 1386 | ret = bh; |
| 1387 | break; |
| 1388 | } |
| 1389 | } |
| 1390 | bh_lru_unlock(); |
| 1391 | return ret; |
| 1392 | } |
| 1393 | |
| 1394 | /* |
| 1395 | * Perform a pagecache lookup for the matching buffer. If it's there, refresh |
| 1396 | * it in the LRU and mark it as accessed. If it is not present then return |
| 1397 | * NULL. For callers in atomic context, the lookup may also return NULL if
| 1398 | * the buffer is being migrated; in that case the folio is not marked accessed.
| 1399 | */ |
| 1400 | static struct buffer_head * |
| 1401 | find_get_block_common(struct block_device *bdev, sector_t block, |
| 1402 | unsigned size, bool atomic) |
| 1403 | { |
| 1404 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); |
| 1405 | |
| 1406 | if (bh == NULL) { |
| 1407 | /* __find_get_block_slow will mark the page accessed */ |
| 1408 | bh = __find_get_block_slow(bdev, block, atomic); |
| 1409 | if (bh) |
| 1410 | bh_lru_install(bh); |
| 1411 | } else |
| 1412 | touch_buffer(bh); |
| 1413 | |
| 1414 | return bh; |
| 1415 | } |
| 1416 | |
| 1417 | struct buffer_head * |
| 1418 | __find_get_block(struct block_device *bdev, sector_t block, unsigned size) |
| 1419 | { |
| 1420 | return find_get_block_common(bdev, block, size, true);
| 1421 | } |
| 1422 | EXPORT_SYMBOL(__find_get_block); |
| 1423 | |
| 1424 | /* same as __find_get_block() but allows sleeping contexts */ |
| 1425 | struct buffer_head * |
| 1426 | __find_get_block_nonatomic(struct block_device *bdev, sector_t block, |
| 1427 | unsigned size) |
| 1428 | { |
| 1429 | return find_get_block_common(bdev, block, size, false);
| 1430 | } |
| 1431 | EXPORT_SYMBOL(__find_get_block_nonatomic); |
| 1432 | |
| 1433 | /** |
| 1434 | * bdev_getblk - Get a buffer_head in a block device's buffer cache. |
| 1435 | * @bdev: The block device. |
| 1436 | * @block: The block number. |
| 1437 | * @size: The size of buffer_heads for this @bdev. |
| 1438 | * @gfp: The memory allocation flags to use. |
| 1439 | * |
| 1440 | * The returned buffer head has its reference count incremented, but is |
| 1441 | * not locked. The caller should call brelse() when it has finished |
| 1442 | * with the buffer. The buffer may not be uptodate. If needed, the |
| 1443 | * caller can bring it uptodate either by reading it or overwriting it. |
| 1444 | * |
| 1445 | * Return: The buffer head, or NULL if memory could not be allocated. |
| 1446 | */ |
| 1447 | struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, |
| 1448 | unsigned size, gfp_t gfp) |
| 1449 | { |
| 1450 | struct buffer_head *bh; |
| 1451 | |
| 1452 | if (gfpflags_allow_blocking(gfp))
| 1453 | bh = __find_get_block_nonatomic(bdev, block, size); |
| 1454 | else |
| 1455 | bh = __find_get_block(bdev, block, size); |
| 1456 | |
| 1457 | might_alloc(gfp);
| 1458 | if (bh) |
| 1459 | return bh; |
| 1460 | |
| 1461 | return __getblk_slow(bdev, block, size, gfp); |
| 1462 | } |
| 1463 | EXPORT_SYMBOL(bdev_getblk); |
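
/*
 * Rough usage sketch, locking and error handling elided: grab the buffer for
 * a block the caller intends to overwrite completely, so no read from disk is
 * needed. The gfp value is just one plausible choice, not a recommendation:
 *
 *	bh = bdev_getblk(bdev, block, size, GFP_NOFS | __GFP_MOVABLE);
 *	if (!bh)
 *		return -ENOMEM;
 *	memset(bh->b_data, 0, bh->b_size);
 *	set_buffer_uptodate(bh);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 */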
| 1464 | |
| 1465 | /* |
| 1466 | * Do async read-ahead on a buffer.
| 1467 | */ |
| 1468 | void __breadahead(struct block_device *bdev, sector_t block, unsigned size) |
| 1469 | { |
| 1470 | struct buffer_head *bh = bdev_getblk(bdev, block, size, |
| 1471 | GFP_NOWAIT | __GFP_MOVABLE); |
| 1472 | |
| 1473 | if (likely(bh)) { |
| 1474 | bh_readahead(bh, REQ_RAHEAD); |
| 1475 | brelse(bh); |
| 1476 | } |
| 1477 | } |
| 1478 | EXPORT_SYMBOL(__breadahead); |
| 1479 | |
| 1480 | /** |
| 1481 | * __bread_gfp() - Read a block. |
| 1482 | * @bdev: The block device to read from. |
| 1483 | * @block: Block number in units of block size. |
| 1484 | * @size: The block size of this device in bytes. |
| 1485 | * @gfp: Not page allocation flags; see below. |
| 1486 | * |
| 1487 | * You are not expected to call this function. You should use one of |
| 1488 | * sb_bread(), sb_bread_unmovable() or __bread(). |
| 1489 | * |
| 1490 | * Read a specified block, and return the buffer head that refers to it. |
| 1491 | * If @gfp is 0, the memory will be allocated using the block device's |
| 1492 | * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be |
| 1493 | * allocated from a movable area. Do not pass in a complete set of |
| 1494 | * GFP flags. |
| 1495 | * |
| 1496 | * The returned buffer head has its refcount increased. The caller should |
| 1497 | * call brelse() when it has finished with the buffer. |
| 1498 | * |
| 1499 | * Context: May sleep waiting for I/O. |
| 1500 | * Return: NULL if the block was unreadable. |
| 1501 | */ |
| 1502 | struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block, |
| 1503 | unsigned size, gfp_t gfp) |
| 1504 | { |
| 1505 | struct buffer_head *bh; |
| 1506 | |
| 1507 | gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
| 1508 | |
| 1509 | /* |
| 1510 | * Prefer looping in the allocator rather than here, at least that |
| 1511 | * code knows what it's doing. |
| 1512 | */ |
| 1513 | gfp |= __GFP_NOFAIL; |
| 1514 | |
| 1515 | bh = bdev_getblk(bdev, block, size, gfp); |
| 1516 | |
| 1517 | if (likely(bh) && !buffer_uptodate(bh)) |
| 1518 | bh = __bread_slow(bh); |
| 1519 | return bh; |
| 1520 | } |
| 1521 | EXPORT_SYMBOL(__bread_gfp); |
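| | 
| | /*
| | * Illustrative sketch (assumption, not part of this file): filesystems
| | * normally reach this through sb_bread().  Reading one metadata block
| | * and copying a structure out of it looks roughly like:
| | *
| | *	struct buffer_head *bh = sb_bread(sb, demo_blocknr);
| | *
| | *	if (!bh)
| | *		return -EIO;		// block was unreadable
| | *	memcpy(&demo_raw, bh->b_data + demo_offset, sizeof(demo_raw));
| | *	brelse(bh);
| | *
| | * "demo_blocknr", "demo_offset" and "demo_raw" are placeholders.
| | */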
| 1522 | |
| 1523 | static void __invalidate_bh_lrus(struct bh_lru *b) |
| 1524 | { |
| 1525 | int i; |
| 1526 | |
| 1527 | for (i = 0; i < BH_LRU_SIZE; i++) { |
| 1528 | brelse(b->bhs[i]);
| 1529 | b->bhs[i] = NULL; |
| 1530 | } |
| 1531 | } |
| 1532 | /* |
| 1533 | * invalidate_bh_lrus() is called rarely - but not only at unmount. |
| 1534 | * This doesn't race because it runs on each CPU either in IRQ context
| 1535 | * or with preemption disabled.
| 1536 | */ |
| 1537 | static void invalidate_bh_lru(void *arg) |
| 1538 | { |
| 1539 | struct bh_lru *b = &get_cpu_var(bh_lrus); |
| 1540 | |
| 1541 | __invalidate_bh_lrus(b); |
| 1542 | put_cpu_var(bh_lrus); |
| 1543 | } |
| 1544 | |
| 1545 | bool has_bh_in_lru(int cpu, void *dummy) |
| 1546 | { |
| 1547 | struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); |
| 1548 | int i; |
| 1549 | |
| 1550 | for (i = 0; i < BH_LRU_SIZE; i++) { |
| 1551 | if (b->bhs[i]) |
| 1552 | return true; |
| 1553 | } |
| 1554 | |
| 1555 | return false; |
| 1556 | } |
| 1557 | |
| 1558 | void invalidate_bh_lrus(void) |
| 1559 | { |
| 1560 | on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
| 1561 | } |
| 1562 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); |
| 1563 | |
| 1564 | /* |
| 1565 | * It's called from workqueue context so we need a bh_lru_lock to close |
| 1566 | * the race with preemption/irq. |
| 1567 | */ |
| 1568 | void invalidate_bh_lrus_cpu(void) |
| 1569 | { |
| 1570 | struct bh_lru *b; |
| 1571 | |
| 1572 | bh_lru_lock(); |
| 1573 | b = this_cpu_ptr(&bh_lrus); |
| 1574 | __invalidate_bh_lrus(b); |
| 1575 | bh_lru_unlock(); |
| 1576 | } |
| 1577 | |
| 1578 | void folio_set_bh(struct buffer_head *bh, struct folio *folio, |
| 1579 | unsigned long offset) |
| 1580 | { |
| 1581 | bh->b_folio = folio; |
| 1582 | BUG_ON(offset >= folio_size(folio)); |
| 1583 | if (folio_test_highmem(folio)) |
| 1584 | /* |
| 1585 | * This catches illegal uses and preserves the offset: |
| 1586 | */ |
| 1587 | bh->b_data = (char *)(0 + offset); |
| 1588 | else |
| 1589 | bh->b_data = folio_address(folio) + offset; |
| 1590 | } |
| 1591 | EXPORT_SYMBOL(folio_set_bh); |
| 1592 | |
| 1593 | /* |
| 1594 | * Called when truncating a buffer on a page completely. |
| 1595 | */ |
| 1596 | |
| 1597 | /* Bits that are cleared during an invalidate */ |
| 1598 | #define BUFFER_FLAGS_DISCARD \ |
| 1599 | (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ |
| 1600 | 1 << BH_Delay | 1 << BH_Unwritten) |
| 1601 | |
| 1602 | static void discard_buffer(struct buffer_head * bh) |
| 1603 | { |
| 1604 | unsigned long b_state; |
| 1605 | |
| 1606 | lock_buffer(bh); |
| 1607 | clear_buffer_dirty(bh); |
| 1608 | bh->b_bdev = NULL; |
| 1609 | b_state = READ_ONCE(bh->b_state); |
| 1610 | do { |
| 1611 | } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state, |
| 1612 | b_state & ~BUFFER_FLAGS_DISCARD)); |
| 1613 | unlock_buffer(bh); |
| 1614 | } |
| 1615 | |
| 1616 | /** |
| 1617 | * block_invalidate_folio - Invalidate part or all of a buffer-backed folio. |
| 1618 | * @folio: The folio which is affected. |
| 1619 | * @offset: start of the range to invalidate |
| 1620 | * @length: length of the range to invalidate |
| 1621 | * |
| 1622 | * block_invalidate_folio() is called when all or part of the folio has been |
| 1623 | * invalidated by a truncate operation. |
| 1624 | * |
| 1625 | * block_invalidate_folio() does not have to release all buffers, but it must |
| 1626 | * ensure that no dirty buffer is left outside @offset and that no I/O |
| 1627 | * is underway against any of the blocks which are outside the truncation |
| 1628 | * point, because the caller is about to free (and possibly reuse) those
| 1629 | * blocks on-disk. |
| 1630 | */ |
| 1631 | void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) |
| 1632 | { |
| 1633 | struct buffer_head *head, *bh, *next; |
| 1634 | size_t curr_off = 0; |
| 1635 | size_t stop = length + offset; |
| 1636 | |
| 1637 | BUG_ON(!folio_test_locked(folio)); |
| 1638 | |
| 1639 | /* |
| 1640 | * Check for overflow |
| 1641 | */ |
| 1642 | BUG_ON(stop > folio_size(folio) || stop < length); |
| 1643 | |
| 1644 | head = folio_buffers(folio); |
| 1645 | if (!head) |
| 1646 | return; |
| 1647 | |
| 1648 | bh = head; |
| 1649 | do { |
| 1650 | size_t next_off = curr_off + bh->b_size; |
| 1651 | next = bh->b_this_page; |
| 1652 | |
| 1653 | /* |
| 1654 | * Are we still fully in range ? |
| 1655 | */ |
| 1656 | if (next_off > stop) |
| 1657 | goto out; |
| 1658 | |
| 1659 | /* |
| 1660 | * is this block fully invalidated? |
| 1661 | */ |
| 1662 | if (offset <= curr_off) |
| 1663 | discard_buffer(bh); |
| 1664 | curr_off = next_off; |
| 1665 | bh = next; |
| 1666 | } while (bh != head); |
| 1667 | |
| 1668 | /* |
| 1669 | * We release buffers only if the entire folio is being invalidated. |
| 1670 | * The get_block cached value has been unconditionally invalidated, |
| 1671 | * so real IO is not possible anymore. |
| 1672 | */ |
| 1673 | if (length == folio_size(folio)) |
| 1674 | filemap_release_folio(folio, 0);
| 1675 | out: |
| 1676 | folio_clear_mappedtodisk(folio); |
| 1677 | } |
| 1678 | EXPORT_SYMBOL(block_invalidate_folio); |
| 1679 | |
| 1680 | /* |
| 1681 | * We attach and possibly dirty the buffers atomically wrt |
| 1682 | * block_dirty_folio() via i_private_lock. try_to_free_buffers |
| 1683 | * is already excluded via the folio lock. |
| 1684 | */ |
| 1685 | struct buffer_head *create_empty_buffers(struct folio *folio, |
| 1686 | unsigned long blocksize, unsigned long b_state) |
| 1687 | { |
| 1688 | struct buffer_head *bh, *head, *tail; |
| 1689 | gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; |
| 1690 | |
| 1691 | head = folio_alloc_buffers(folio, blocksize, gfp); |
| 1692 | bh = head; |
| 1693 | do { |
| 1694 | bh->b_state |= b_state; |
| 1695 | tail = bh; |
| 1696 | bh = bh->b_this_page; |
| 1697 | } while (bh); |
| 1698 | tail->b_this_page = head; |
| 1699 | |
| 1700 | spin_lock(&folio->mapping->i_private_lock);
| 1701 | if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { |
| 1702 | bh = head; |
| 1703 | do { |
| 1704 | if (folio_test_dirty(folio)) |
| 1705 | set_buffer_dirty(bh); |
| 1706 | if (folio_test_uptodate(folio)) |
| 1707 | set_buffer_uptodate(bh); |
| 1708 | bh = bh->b_this_page; |
| 1709 | } while (bh != head); |
| 1710 | } |
| 1711 | folio_attach_private(folio, head);
| 1712 | spin_unlock(&folio->mapping->i_private_lock);
| 1713 | |
| 1714 | return head; |
| 1715 | } |
| 1716 | EXPORT_SYMBOL(create_empty_buffers); |
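| | 
| | /*
| | * Usage note (sketch only): the head returned above anchors a circular,
| | * singly linked ring through b_this_page.  Callers usually walk it as:
| | *
| | *	head = folio_buffers(folio);
| | *	if (!head)
| | *		head = create_empty_buffers(folio, 1 << inode->i_blkbits, 0);
| | *	bh = head;
| | *	do {
| | *		// inspect or map "bh" here
| | *		bh = bh->b_this_page;
| | *	} while (bh != head);
| | *
| | * This is the same pattern folio_create_buffers() below wraps up.
| | */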
| 1717 | |
| 1718 | /** |
| 1719 | * clean_bdev_aliases: clean a range of buffers in block device |
| 1720 | * @bdev: Block device to clean buffers in |
| 1721 | * @block: Start of a range of blocks to clean |
| 1722 | * @len: Number of blocks to clean |
| 1723 | * |
| 1724 | * We are taking a range of blocks for data and we don't want writeback of any |
| 1725 | * buffer-cache aliases starting from return from this function and until the |
| 1726 | * moment when something will explicitly mark the buffer dirty (hopefully that |
| 1727 | * will not happen until we free that block ;-) We don't even need to mark
| 1728 | * it not-uptodate - nobody can expect anything from a newly allocated buffer |
| 1729 | * anyway. We used to use unmap_buffer() for such invalidation, but that was |
| 1730 | * wrong. We definitely don't want to mark the alias unmapped, for example - it |
| 1731 | * would confuse anyone who might pick it up with bread() afterwards...
| 1732 | * |
| 1733 | * Also.. Note that bforget() doesn't lock the buffer. So there can be |
| 1734 | * writeout I/O going on against recently-freed buffers. We don't wait on that |
| 1735 | * I/O in bforget() - it's more efficient to wait on the I/O only if we really |
| 1736 | * need to. That happens here. |
| 1737 | */ |
| 1738 | void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) |
| 1739 | { |
| 1740 | struct address_space *bd_mapping = bdev->bd_mapping; |
| 1741 | const int blkbits = bd_mapping->host->i_blkbits; |
| 1742 | struct folio_batch fbatch; |
| 1743 | pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE; |
| 1744 | pgoff_t end; |
| 1745 | int i, count; |
| 1746 | struct buffer_head *bh; |
| 1747 | struct buffer_head *head; |
| 1748 | |
| 1749 | end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE; |
| 1750 | folio_batch_init(&fbatch);
| 1751 | while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
| 1752 | count = folio_batch_count(&fbatch);
| 1753 | for (i = 0; i < count; i++) { |
| 1754 | struct folio *folio = fbatch.folios[i]; |
| 1755 | |
| 1756 | if (!folio_buffers(folio)) |
| 1757 | continue; |
| 1758 | /* |
| 1759 | * We use folio lock instead of bd_mapping->i_private_lock |
| 1760 | * to pin buffers here since we can afford to sleep and |
| 1761 | * it scales better than a global spinlock.
| 1762 | */ |
| 1763 | folio_lock(folio); |
| 1764 | /* Recheck when the folio is locked which pins bhs */ |
| 1765 | head = folio_buffers(folio); |
| 1766 | if (!head) |
| 1767 | goto unlock_page; |
| 1768 | bh = head; |
| 1769 | do { |
| 1770 | if (!buffer_mapped(bh) || (bh->b_blocknr < block)) |
| 1771 | goto next; |
| 1772 | if (bh->b_blocknr >= block + len) |
| 1773 | break; |
| 1774 | clear_buffer_dirty(bh); |
| 1775 | wait_on_buffer(bh); |
| 1776 | clear_buffer_req(bh); |
| 1777 | next: |
| 1778 | bh = bh->b_this_page; |
| 1779 | } while (bh != head); |
| 1780 | unlock_page: |
| 1781 | folio_unlock(folio); |
| 1782 | } |
| 1783 | folio_batch_release(&fbatch);
| 1784 | cond_resched(); |
| 1785 | /* End of range already reached? */ |
| 1786 | if (index > end || !index) |
| 1787 | break; |
| 1788 | } |
| 1789 | } |
| 1790 | EXPORT_SYMBOL(clean_bdev_aliases); |
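| | 
| | /*
| | * Illustrative sketch: a get_block implementation that has just
| | * allocated a block typically kills any stale blockdev alias before the
| | * buffer is written back, either one buffer at a time:
| | *
| | *	if (buffer_new(bh))
| | *		clean_bdev_bh_alias(bh);
| | *
| | * or for a whole freshly allocated extent of "len" blocks:
| | *
| | *	clean_bdev_aliases(sb->s_bdev, first_block, len);
| | *
| | * (The extent form is an assumption about a typical caller, not a rule
| | * imposed by this file.)
| | */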
| 1791 | |
| 1792 | static struct buffer_head *folio_create_buffers(struct folio *folio, |
| 1793 | struct inode *inode, |
| 1794 | unsigned int b_state) |
| 1795 | { |
| 1796 | struct buffer_head *bh; |
| 1797 | |
| 1798 | BUG_ON(!folio_test_locked(folio)); |
| 1799 | |
| 1800 | bh = folio_buffers(folio); |
| 1801 | if (!bh) |
| 1802 | bh = create_empty_buffers(folio, |
| 1803 | 1 << READ_ONCE(inode->i_blkbits), b_state); |
| 1804 | return bh; |
| 1805 | } |
| 1806 | |
| 1807 | /* |
| 1808 | * NOTE! All mapped/uptodate combinations are valid: |
| 1809 | * |
| 1810 | * Mapped Uptodate Meaning |
| 1811 | * |
| 1812 | * No No "unknown" - must do get_block() |
| 1813 | * No Yes "hole" - zero-filled |
| 1814 | * Yes No "allocated" - allocated on disk, not read in |
| 1815 | * Yes Yes "valid" - allocated and up-to-date in memory. |
| 1816 | * |
| 1817 | * "Dirty" is valid only with the last case (mapped+uptodate). |
| 1818 | */ |
| 1819 | |
| 1820 | /* |
| 1821 | * While block_write_full_folio is writing back the dirty buffers under |
| 1822 | * the page lock, whoever dirtied the buffers may decide to clean them |
| 1823 | * again at any time. We handle that by only looking at the buffer |
| 1824 | * state inside lock_buffer(). |
| 1825 | * |
| 1826 | * If block_write_full_folio() is called for regular writeback |
| 1827 | * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a |
| 1828 | * locked buffer. This only can happen if someone has written the buffer |
| 1829 | * directly, with submit_bh(). At the address_space level PageWriteback |
| 1830 | * prevents this contention from occurring. |
| 1831 | * |
| 1832 | * If block_write_full_folio() is called with wbc->sync_mode == |
| 1833 | * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this |
| 1834 | * causes the writes to be flagged as synchronous writes. |
| 1835 | */ |
| 1836 | int __block_write_full_folio(struct inode *inode, struct folio *folio, |
| 1837 | get_block_t *get_block, struct writeback_control *wbc) |
| 1838 | { |
| 1839 | int err; |
| 1840 | sector_t block; |
| 1841 | sector_t last_block; |
| 1842 | struct buffer_head *bh, *head; |
| 1843 | size_t blocksize; |
| 1844 | int nr_underway = 0; |
| 1845 | blk_opf_t write_flags = wbc_to_write_flags(wbc); |
| 1846 | |
| 1847 | head = folio_create_buffers(folio, inode, |
| 1848 | (1 << BH_Dirty) | (1 << BH_Uptodate));
| 1849 | |
| 1850 | /* |
| 1851 | * Be very careful. We have no exclusion from block_dirty_folio |
| 1852 | * here, and the (potentially unmapped) buffers may become dirty at |
| 1853 | * any time. If a buffer becomes dirty here after we've inspected it |
| 1854 | * then we just miss that fact, and the folio stays dirty. |
| 1855 | * |
| 1856 | * Buffers outside i_size may be dirtied by block_dirty_folio; |
| 1857 | * handle that here by just cleaning them. |
| 1858 | */ |
| 1859 | |
| 1860 | bh = head; |
| 1861 | blocksize = bh->b_size; |
| 1862 | |
| 1863 | block = div_u64(folio_pos(folio), blocksize);
| 1864 | last_block = div_u64(i_size_read(inode) - 1, blocksize);
| 1865 | |
| 1866 | /* |
| 1867 | * Get all the dirty buffers mapped to disk addresses and |
| 1868 | * handle any aliases from the underlying blockdev's mapping. |
| 1869 | */ |
| 1870 | do { |
| 1871 | if (block > last_block) { |
| 1872 | /* |
| 1873 | * mapped buffers outside i_size will occur, because |
| 1874 | * this folio can be outside i_size when there is a |
| 1875 | * truncate in progress. |
| 1876 | */ |
| 1877 | /* |
| 1878 | * The buffer was zeroed by block_write_full_folio() |
| 1879 | */ |
| 1880 | clear_buffer_dirty(bh); |
| 1881 | set_buffer_uptodate(bh); |
| 1882 | } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && |
| 1883 | buffer_dirty(bh)) { |
| 1884 | WARN_ON(bh->b_size != blocksize); |
| 1885 | err = get_block(inode, block, bh, 1); |
| 1886 | if (err) |
| 1887 | goto recover; |
| 1888 | clear_buffer_delay(bh); |
| 1889 | if (buffer_new(bh)) { |
| 1890 | /* blockdev mappings never come here */ |
| 1891 | clear_buffer_new(bh); |
| 1892 | clean_bdev_bh_alias(bh); |
| 1893 | } |
| 1894 | } |
| 1895 | bh = bh->b_this_page; |
| 1896 | block++; |
| 1897 | } while (bh != head); |
| 1898 | |
| 1899 | do { |
| 1900 | if (!buffer_mapped(bh)) |
| 1901 | continue; |
| 1902 | /* |
| 1903 | * If it's a fully non-blocking write attempt and we cannot |
| 1904 | * lock the buffer then redirty the folio. Note that this can |
| 1905 | * potentially cause a busy-wait loop from writeback threads |
| 1906 | * and kswapd activity, but those code paths have their own |
| 1907 | * higher-level throttling. |
| 1908 | */ |
| 1909 | if (wbc->sync_mode != WB_SYNC_NONE) { |
| 1910 | lock_buffer(bh); |
| 1911 | } else if (!trylock_buffer(bh)) { |
| 1912 | folio_redirty_for_writepage(wbc, folio); |
| 1913 | continue; |
| 1914 | } |
| 1915 | if (test_clear_buffer_dirty(bh)) { |
| 1916 | mark_buffer_async_write_endio(bh, |
| 1917 | end_buffer_async_write);
| 1918 | } else { |
| 1919 | unlock_buffer(bh); |
| 1920 | } |
| 1921 | } while ((bh = bh->b_this_page) != head); |
| 1922 | |
| 1923 | /* |
| 1924 | * The folio and its buffers are protected by the writeback flag, |
| 1925 | * so we can drop the bh refcounts early. |
| 1926 | */ |
| 1927 | BUG_ON(folio_test_writeback(folio)); |
| 1928 | folio_start_writeback(folio); |
| 1929 | |
| 1930 | do { |
| 1931 | struct buffer_head *next = bh->b_this_page; |
| 1932 | if (buffer_async_write(bh)) { |
| 1933 | submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
| 1934 | inode->i_write_hint, wbc);
| 1935 | nr_underway++; |
| 1936 | } |
| 1937 | bh = next; |
| 1938 | } while (bh != head); |
| 1939 | folio_unlock(folio); |
| 1940 | |
| 1941 | err = 0; |
| 1942 | done: |
| 1943 | if (nr_underway == 0) { |
| 1944 | /* |
| 1945 | * The folio was marked dirty, but the buffers were |
| 1946 | * clean. Someone wrote them back by hand with |
| 1947 | * write_dirty_buffer/submit_bh. A rare case. |
| 1948 | */ |
| 1949 | folio_end_writeback(folio); |
| 1950 | |
| 1951 | /* |
| 1952 | * The folio and buffer_heads can be released at any time from |
| 1953 | * here on. |
| 1954 | */ |
| 1955 | } |
| 1956 | return err; |
| 1957 | |
| 1958 | recover: |
| 1959 | /* |
| 1960 | * ENOSPC, or some other error. We may already have added some |
| 1961 | * blocks to the file, so we need to write these out to avoid |
| 1962 | * exposing stale data. |
| 1963 | * The folio is currently locked and not marked for writeback |
| 1964 | */ |
| 1965 | bh = head; |
| 1966 | /* Recovery: lock and submit the mapped buffers */ |
| 1967 | do { |
| 1968 | if (buffer_mapped(bh) && buffer_dirty(bh) && |
| 1969 | !buffer_delay(bh)) { |
| 1970 | lock_buffer(bh); |
| 1971 | mark_buffer_async_write_endio(bh, |
| 1972 | end_buffer_async_write);
| 1973 | } else { |
| 1974 | /* |
| 1975 | * The buffer may have been set dirty during |
| 1976 | * attachment to a dirty folio. |
| 1977 | */ |
| 1978 | clear_buffer_dirty(bh); |
| 1979 | } |
| 1980 | } while ((bh = bh->b_this_page) != head); |
| 1981 | BUG_ON(folio_test_writeback(folio)); |
| 1982 | mapping_set_error(folio->mapping, err);
| 1983 | folio_start_writeback(folio); |
| 1984 | do { |
| 1985 | struct buffer_head *next = bh->b_this_page; |
| 1986 | if (buffer_async_write(bh)) { |
| 1987 | clear_buffer_dirty(bh); |
| 1988 | submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
| 1989 | inode->i_write_hint, wbc);
| 1990 | nr_underway++; |
| 1991 | } |
| 1992 | bh = next; |
| 1993 | } while (bh != head); |
| 1994 | folio_unlock(folio); |
| 1995 | goto done; |
| 1996 | } |
| 1997 | EXPORT_SYMBOL(__block_write_full_folio); |
| 1998 | |
| 1999 | /* |
| 2000 | * If a folio has any new buffers, zero them out here, and mark them uptodate |
| 2001 | * and dirty so they'll be written out (in order to prevent uninitialised |
| 2002 | * block data from leaking). And clear the new bit. |
| 2003 | */ |
| 2004 | void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) |
| 2005 | { |
| 2006 | size_t block_start, block_end; |
| 2007 | struct buffer_head *head, *bh; |
| 2008 | |
| 2009 | BUG_ON(!folio_test_locked(folio)); |
| 2010 | head = folio_buffers(folio); |
| 2011 | if (!head) |
| 2012 | return; |
| 2013 | |
| 2014 | bh = head; |
| 2015 | block_start = 0; |
| 2016 | do { |
| 2017 | block_end = block_start + bh->b_size; |
| 2018 | |
| 2019 | if (buffer_new(bh)) { |
| 2020 | if (block_end > from && block_start < to) { |
| 2021 | if (!folio_test_uptodate(folio)) { |
| 2022 | size_t start, xend; |
| 2023 | |
| 2024 | start = max(from, block_start); |
| 2025 | xend = min(to, block_end); |
| 2026 | |
| 2027 | folio_zero_segment(folio, start, xend); |
| 2028 | set_buffer_uptodate(bh); |
| 2029 | } |
| 2030 | |
| 2031 | clear_buffer_new(bh); |
| 2032 | mark_buffer_dirty(bh); |
| 2033 | } |
| 2034 | } |
| 2035 | |
| 2036 | block_start = block_end; |
| 2037 | bh = bh->b_this_page; |
| 2038 | } while (bh != head); |
| 2039 | } |
| 2040 | EXPORT_SYMBOL(folio_zero_new_buffers); |
| 2041 | |
| 2042 | static int |
| 2043 | iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, |
| 2044 | const struct iomap *iomap) |
| 2045 | { |
| 2046 | loff_t offset = (loff_t)block << inode->i_blkbits; |
| 2047 | |
| 2048 | bh->b_bdev = iomap->bdev; |
| 2049 | |
| 2050 | /* |
| 2051 | * Block points to offset in file we need to map, iomap contains |
| 2052 | * the offset at which the map starts. If the map ends before the |
| 2053 | * current block, then do not map the buffer and let the caller |
| 2054 | * handle it. |
| 2055 | */ |
| 2056 | if (offset >= iomap->offset + iomap->length) |
| 2057 | return -EIO; |
| 2058 | |
| 2059 | switch (iomap->type) { |
| 2060 | case IOMAP_HOLE: |
| 2061 | /* |
| 2062 | * If the buffer is not up to date or beyond the current EOF, |
| 2063 | * we need to mark it as new to ensure sub-block zeroing is |
| 2064 | * executed if necessary. |
| 2065 | */ |
| 2066 | if (!buffer_uptodate(bh) || |
| 2067 | (offset >= i_size_read(inode))) |
| 2068 | set_buffer_new(bh); |
| 2069 | return 0; |
| 2070 | case IOMAP_DELALLOC: |
| 2071 | if (!buffer_uptodate(bh) || |
| 2072 | (offset >= i_size_read(inode))) |
| 2073 | set_buffer_new(bh); |
| 2074 | set_buffer_uptodate(bh); |
| 2075 | set_buffer_mapped(bh); |
| 2076 | set_buffer_delay(bh); |
| 2077 | return 0; |
| 2078 | case IOMAP_UNWRITTEN: |
| 2079 | /* |
| 2080 | * For unwritten regions, we always need to ensure that regions |
| 2081 | * in the block we are not writing to are zeroed. Mark the |
| 2082 | * buffer as new to ensure this. |
| 2083 | */ |
| 2084 | set_buffer_new(bh); |
| 2085 | set_buffer_unwritten(bh); |
| 2086 | fallthrough; |
| 2087 | case IOMAP_MAPPED: |
| 2088 | if ((iomap->flags & IOMAP_F_NEW) || |
| 2089 | offset >= i_size_read(inode)) { |
| 2090 | /* |
| 2091 | * This can happen if truncating the block device races |
| 2092 | * with the check in the caller as i_size updates on |
| 2093 | * block devices aren't synchronized by i_rwsem for |
| 2094 | * block devices. |
| 2095 | */ |
| 2096 | if (S_ISBLK(inode->i_mode)) |
| 2097 | return -EIO; |
| 2098 | set_buffer_new(bh); |
| 2099 | } |
| 2100 | bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> |
| 2101 | inode->i_blkbits; |
| 2102 | set_buffer_mapped(bh); |
| 2103 | return 0; |
| 2104 | default: |
| 2105 | WARN_ON_ONCE(1); |
| 2106 | return -EIO; |
| 2107 | } |
| 2108 | } |
| 2109 | |
| 2110 | int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, |
| 2111 | get_block_t *get_block, const struct iomap *iomap) |
| 2112 | { |
| 2113 | size_t from = offset_in_folio(folio, pos); |
| 2114 | size_t to = from + len; |
| 2115 | struct inode *inode = folio->mapping->host; |
| 2116 | size_t block_start, block_end; |
| 2117 | sector_t block; |
| 2118 | int err = 0; |
| 2119 | size_t blocksize; |
| 2120 | struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; |
| 2121 | |
| 2122 | BUG_ON(!folio_test_locked(folio)); |
| 2123 | BUG_ON(to > folio_size(folio)); |
| 2124 | BUG_ON(from > to); |
| 2125 | |
| 2126 | head = folio_create_buffers(folio, inode, 0);
| 2127 | blocksize = head->b_size; |
| 2128 | block = div_u64(folio_pos(folio), blocksize);
| 2129 | |
| 2130 | for (bh = head, block_start = 0; bh != head || !block_start; |
| 2131 | block++, block_start=block_end, bh = bh->b_this_page) { |
| 2132 | block_end = block_start + blocksize; |
| 2133 | if (block_end <= from || block_start >= to) { |
| 2134 | if (folio_test_uptodate(folio)) { |
| 2135 | if (!buffer_uptodate(bh)) |
| 2136 | set_buffer_uptodate(bh); |
| 2137 | } |
| 2138 | continue; |
| 2139 | } |
| 2140 | if (buffer_new(bh)) |
| 2141 | clear_buffer_new(bh); |
| 2142 | if (!buffer_mapped(bh)) { |
| 2143 | WARN_ON(bh->b_size != blocksize); |
| 2144 | if (get_block) |
| 2145 | err = get_block(inode, block, bh, 1); |
| 2146 | else |
| 2147 | err = iomap_to_bh(inode, block, bh, iomap); |
| 2148 | if (err) |
| 2149 | break; |
| 2150 | |
| 2151 | if (buffer_new(bh)) { |
| 2152 | clean_bdev_bh_alias(bh); |
| 2153 | if (folio_test_uptodate(folio)) { |
| 2154 | clear_buffer_new(bh); |
| 2155 | set_buffer_uptodate(bh); |
| 2156 | mark_buffer_dirty(bh); |
| 2157 | continue; |
| 2158 | } |
| 2159 | if (block_end > to || block_start < from) |
| 2160 | folio_zero_segments(folio, |
| 2161 | to, block_end,
| 2162 | block_start, from);
| 2163 | continue; |
| 2164 | } |
| 2165 | } |
| 2166 | if (folio_test_uptodate(folio)) { |
| 2167 | if (!buffer_uptodate(bh)) |
| 2168 | set_buffer_uptodate(bh); |
| 2169 | continue; |
| 2170 | } |
| 2171 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && |
| 2172 | !buffer_unwritten(bh) && |
| 2173 | (block_start < from || block_end > to)) { |
| 2174 | bh_read_nowait(bh, 0);
| 2175 | *wait_bh++=bh; |
| 2176 | } |
| 2177 | } |
| 2178 | /* |
| 2179 | * If we issued read requests - let them complete. |
| 2180 | */ |
| 2181 | while(wait_bh > wait) { |
| 2182 | wait_on_buffer(*--wait_bh);
| 2183 | if (!buffer_uptodate(*wait_bh))
| 2184 | err = -EIO; |
| 2185 | } |
| 2186 | if (unlikely(err)) |
| 2187 | folio_zero_new_buffers(folio, from, to); |
| 2188 | return err; |
| 2189 | } |
| 2190 | |
| 2191 | int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, |
| 2192 | get_block_t *get_block) |
| 2193 | { |
| 2194 | return __block_write_begin_int(folio, pos, len, get_block, NULL); |
| 2195 | } |
| 2196 | EXPORT_SYMBOL(__block_write_begin); |
| 2197 | |
| 2198 | void block_commit_write(struct folio *folio, size_t from, size_t to) |
| 2199 | { |
| 2200 | size_t block_start, block_end; |
| 2201 | bool partial = false; |
| 2202 | unsigned blocksize; |
| 2203 | struct buffer_head *bh, *head; |
| 2204 | |
| 2205 | bh = head = folio_buffers(folio); |
| 2206 | if (!bh) |
| 2207 | return; |
| 2208 | blocksize = bh->b_size; |
| 2209 | |
| 2210 | block_start = 0; |
| 2211 | do { |
| 2212 | block_end = block_start + blocksize; |
| 2213 | if (block_end <= from || block_start >= to) { |
| 2214 | if (!buffer_uptodate(bh)) |
| 2215 | partial = true; |
| 2216 | } else { |
| 2217 | set_buffer_uptodate(bh); |
| 2218 | mark_buffer_dirty(bh); |
| 2219 | } |
| 2220 | if (buffer_new(bh)) |
| 2221 | clear_buffer_new(bh); |
| 2222 | |
| 2223 | block_start = block_end; |
| 2224 | bh = bh->b_this_page; |
| 2225 | } while (bh != head); |
| 2226 | |
| 2227 | /* |
| 2228 | * If this is a partial write which happened to make all buffers |
| 2229 | * uptodate then we can optimize away a bogus read_folio() for |
| 2230 | * the next read(). Here we 'discover' whether the folio went |
| 2231 | * uptodate as a result of this (potentially partial) write. |
| 2232 | */ |
| 2233 | if (!partial) |
| 2234 | folio_mark_uptodate(folio); |
| 2235 | } |
| 2236 | EXPORT_SYMBOL(block_commit_write); |
| 2237 | |
| 2238 | /* |
| 2239 | * block_write_begin takes care of the basic task of block allocation and |
| 2240 | * bringing partial write blocks uptodate first. |
| 2241 | * |
| 2242 | * The filesystem needs to handle block truncation upon failure. |
| 2243 | */ |
| 2244 | int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, |
| 2245 | struct folio **foliop, get_block_t *get_block) |
| 2246 | { |
| 2247 | pgoff_t index = pos >> PAGE_SHIFT; |
| 2248 | struct folio *folio; |
| 2249 | int status; |
| 2250 | |
| 2251 | folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, |
| 2252 | mapping_gfp_mask(mapping));
| 2253 | if (IS_ERR(folio))
| 2254 | return PTR_ERR(folio);
| 2255 | |
| 2256 | status = __block_write_begin_int(folio, pos, len, get_block, NULL); |
| 2257 | if (unlikely(status)) { |
| 2258 | folio_unlock(folio); |
| 2259 | folio_put(folio); |
| 2260 | folio = NULL; |
| 2261 | } |
| 2262 | |
| 2263 | *foliop = folio; |
| 2264 | return status; |
| 2265 | } |
| 2266 | EXPORT_SYMBOL(block_write_begin); |
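| | 
| | /*
| | * Illustrative sketch: a minimal ->write_begin() built on the helper
| | * above.  "demo_get_block" is a hypothetical get_block_t supplied by
| | * the filesystem; on failure the filesystem is still expected to trim
| | * any blocks instantiated beyond i_size (see the comment above).
| | *
| | *	static int demo_write_begin(const struct kiocb *iocb,
| | *			struct address_space *mapping, loff_t pos,
| | *			unsigned len, struct folio **foliop, void **fsdata)
| | *	{
| | *		return block_write_begin(mapping, pos, len, foliop,
| | *					 demo_get_block);
| | *	}
| | */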
| 2267 | |
| 2268 | int block_write_end(loff_t pos, unsigned len, unsigned copied, |
| 2269 | struct folio *folio) |
| 2270 | { |
| 2271 | size_t start = pos - folio_pos(folio); |
| 2272 | |
| 2273 | if (unlikely(copied < len)) { |
| 2274 | /* |
| 2275 | * The buffers that were written will now be uptodate, so |
| 2276 | * we don't have to worry about a read_folio reading them |
| 2277 | * and overwriting a partial write. However if we have |
| 2278 | * encountered a short write and only partially written |
| 2279 | * into a buffer, it will not be marked uptodate, so a |
| 2280 | * read_folio might come in and destroy our partial write. |
| 2281 | * |
| 2282 | * Do the simplest thing, and just treat any short write to a |
| 2283 | * non uptodate folio as a zero-length write, and force the |
| 2284 | * caller to redo the whole thing. |
| 2285 | */ |
| 2286 | if (!folio_test_uptodate(folio)) |
| 2287 | copied = 0; |
| 2288 | |
| 2289 | folio_zero_new_buffers(folio, start+copied, start+len); |
| 2290 | } |
| 2291 | flush_dcache_folio(folio); |
| 2292 | |
| 2293 | /* This could be a short (even 0-length) commit */ |
| 2294 | block_commit_write(folio, start, start + copied); |
| 2295 | |
| 2296 | return copied; |
| 2297 | } |
| 2298 | EXPORT_SYMBOL(block_write_end); |
| 2299 | |
| 2300 | int generic_write_end(const struct kiocb *iocb, struct address_space *mapping, |
| 2301 | loff_t pos, unsigned len, unsigned copied, |
| 2302 | struct folio *folio, void *fsdata) |
| 2303 | { |
| 2304 | struct inode *inode = mapping->host; |
| 2305 | loff_t old_size = inode->i_size; |
| 2306 | bool i_size_changed = false; |
| 2307 | |
| 2308 | copied = block_write_end(pos, len, copied, folio); |
| 2309 | |
| 2310 | /* |
| 2311 | * No need to use i_size_read() here, the i_size cannot change under us |
| 2312 | * because we hold i_rwsem. |
| 2313 | * |
| 2314 | * But it's important to update i_size while still holding folio lock: |
| 2315 | * page writeout could otherwise come in and zero beyond i_size. |
| 2316 | */ |
| 2317 | if (pos + copied > inode->i_size) { |
| 2318 | i_size_write(inode, pos + copied);
| 2319 | i_size_changed = true; |
| 2320 | } |
| 2321 | |
| 2322 | folio_unlock(folio); |
| 2323 | folio_put(folio); |
| 2324 | |
| 2325 | if (old_size < pos) |
| 2326 | pagecache_isize_extended(inode, old_size, pos);
| 2327 | /* |
| 2328 | * Don't mark the inode dirty under page lock. First, it unnecessarily |
| 2329 | * makes the holding time of page lock longer. Second, it forces lock |
| 2330 | * ordering of page lock and transaction start for journaling |
| 2331 | * filesystems. |
| 2332 | */ |
| 2333 | if (i_size_changed) |
| 2334 | mark_inode_dirty(inode); |
| 2335 | return copied; |
| 2336 | } |
| 2337 | EXPORT_SYMBOL(generic_write_end); |
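| | 
| | /*
| | * Illustrative sketch: together with block_write_begin() this gives the
| | * classic buffered-write wiring in a filesystem's address_space
| | * operations (all "demo_*" names are placeholders, not an API defined
| | * here):
| | *
| | *	static const struct address_space_operations demo_aops = {
| | *		.dirty_folio		= block_dirty_folio,
| | *		.invalidate_folio	= block_invalidate_folio,
| | *		.read_folio		= demo_read_folio,
| | *		.write_begin		= demo_write_begin,
| | *		.write_end		= generic_write_end,
| | *		.bmap			= demo_bmap,
| | *	};
| | */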
| 2338 | |
| 2339 | /* |
| 2340 | * block_is_partially_uptodate checks whether buffers within a folio are |
| 2341 | * uptodate or not. |
| 2342 | * |
| 2343 | * Returns true if all buffers which correspond to the specified part |
| 2344 | * of the folio are uptodate. |
| 2345 | */ |
| 2346 | bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) |
| 2347 | { |
| 2348 | unsigned block_start, block_end, blocksize; |
| 2349 | unsigned to; |
| 2350 | struct buffer_head *bh, *head; |
| 2351 | bool ret = true; |
| 2352 | |
| 2353 | head = folio_buffers(folio); |
| 2354 | if (!head) |
| 2355 | return false; |
| 2356 | blocksize = head->b_size; |
| 2357 | to = min_t(unsigned, folio_size(folio) - from, count); |
| 2358 | to = from + to; |
| 2359 | if (from < blocksize && to > folio_size(folio) - blocksize) |
| 2360 | return false; |
| 2361 | |
| 2362 | bh = head; |
| 2363 | block_start = 0; |
| 2364 | do { |
| 2365 | block_end = block_start + blocksize; |
| 2366 | if (block_end > from && block_start < to) { |
| 2367 | if (!buffer_uptodate(bh)) { |
| 2368 | ret = false; |
| 2369 | break; |
| 2370 | } |
| 2371 | if (block_end >= to) |
| 2372 | break; |
| 2373 | } |
| 2374 | block_start = block_end; |
| 2375 | bh = bh->b_this_page; |
| 2376 | } while (bh != head); |
| 2377 | |
| 2378 | return ret; |
| 2379 | } |
| 2380 | EXPORT_SYMBOL(block_is_partially_uptodate); |
| 2381 | |
| 2382 | /* |
| 2383 | * Generic "read_folio" function for block devices that have the normal |
| 2384 | * get_block functionality. This is most of the block device filesystems. |
| 2385 | * Reads the folio asynchronously --- the unlock_buffer() and |
| 2386 | * set/clear_buffer_uptodate() functions propagate buffer state into the |
| 2387 | * folio once IO has completed. |
| 2388 | */ |
| 2389 | int block_read_full_folio(struct folio *folio, get_block_t *get_block) |
| 2390 | { |
| 2391 | struct inode *inode = folio->mapping->host; |
| 2392 | sector_t iblock, lblock; |
| 2393 | struct buffer_head *bh, *head, *prev = NULL; |
| 2394 | size_t blocksize; |
| 2395 | int fully_mapped = 1; |
| 2396 | bool page_error = false; |
| 2397 | loff_t limit = i_size_read(inode); |
| 2398 | |
| 2399 | /* This is needed for ext4. */ |
| 2400 | if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) |
| 2401 | limit = inode->i_sb->s_maxbytes; |
| 2402 | |
| 2403 | head = folio_create_buffers(folio, inode, 0);
| 2404 | blocksize = head->b_size; |
| 2405 | |
| 2406 | iblock = div_u64(folio_pos(folio), blocksize);
| 2407 | lblock = div_u64(limit + blocksize - 1, blocksize);
| 2408 | bh = head; |
| 2409 | |
| 2410 | do { |
| 2411 | if (buffer_uptodate(bh)) |
| 2412 | continue; |
| 2413 | |
| 2414 | if (!buffer_mapped(bh)) { |
| 2415 | int err = 0; |
| 2416 | |
| 2417 | fully_mapped = 0; |
| 2418 | if (iblock < lblock) { |
| 2419 | WARN_ON(bh->b_size != blocksize); |
| 2420 | err = get_block(inode, iblock, bh, 0); |
| 2421 | if (err) |
| 2422 | page_error = true; |
| 2423 | } |
| 2424 | if (!buffer_mapped(bh)) { |
| 2425 | folio_zero_range(folio, bh_offset(bh),
| 2426 | blocksize);
| 2427 | if (!err) |
| 2428 | set_buffer_uptodate(bh); |
| 2429 | continue; |
| 2430 | } |
| 2431 | /* |
| 2432 | * get_block() might have updated the buffer |
| 2433 | * synchronously |
| 2434 | */ |
| 2435 | if (buffer_uptodate(bh)) |
| 2436 | continue; |
| 2437 | } |
| 2438 | |
| 2439 | lock_buffer(bh); |
| 2440 | if (buffer_uptodate(bh)) { |
| 2441 | unlock_buffer(bh); |
| 2442 | continue; |
| 2443 | } |
| 2444 | |
| 2445 | mark_buffer_async_read(bh); |
| 2446 | if (prev) |
| 2447 | submit_bh(REQ_OP_READ, prev); |
| 2448 | prev = bh; |
| 2449 | } while (iblock++, (bh = bh->b_this_page) != head); |
| 2450 | |
| 2451 | if (fully_mapped) |
| 2452 | folio_set_mappedtodisk(folio); |
| 2453 | |
| 2454 | /* |
| 2455 | * All buffers are uptodate or get_block() returned an error |
| 2456 | * when trying to map them - we must finish the read because |
| 2457 | * end_buffer_async_read() will never be called on any buffer |
| 2458 | * in this folio. |
| 2459 | */ |
| 2460 | if (prev) |
| 2461 | submit_bh(REQ_OP_READ, prev); |
| 2462 | else |
| 2463 | folio_end_read(folio, !page_error);
| 2464 | |
| 2465 | return 0; |
| 2466 | } |
| 2467 | EXPORT_SYMBOL(block_read_full_folio); |
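| | 
| | /*
| | * Illustrative sketch: a buffer_head based filesystem usually implements
| | * ->read_folio() as a one-line wrapper around the helper above
| | * ("demo_get_block" is hypothetical):
| | *
| | *	static int demo_read_folio(struct file *file, struct folio *folio)
| | *	{
| | *		return block_read_full_folio(folio, demo_get_block);
| | *	}
| | */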
| 2468 | |
| 2469 | /* utility function for filesystems that need to do work on expanding |
| 2470 | * truncates. Uses filesystem pagecache writes to allow the filesystem to |
| 2471 | * deal with the hole. |
| 2472 | */ |
| 2473 | int generic_cont_expand_simple(struct inode *inode, loff_t size) |
| 2474 | { |
| 2475 | struct address_space *mapping = inode->i_mapping; |
| 2476 | const struct address_space_operations *aops = mapping->a_ops; |
| 2477 | struct folio *folio; |
| 2478 | void *fsdata = NULL; |
| 2479 | int err; |
| 2480 | |
| 2481 | err = inode_newsize_ok(inode, size);
| 2482 | if (err) |
| 2483 | goto out; |
| 2484 | |
| 2485 | err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata); |
| 2486 | if (err) |
| 2487 | goto out; |
| 2488 | |
| 2489 | err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata); |
| 2490 | BUG_ON(err > 0); |
| 2491 | |
| 2492 | out: |
| 2493 | return err; |
| 2494 | } |
| 2495 | EXPORT_SYMBOL(generic_cont_expand_simple); |
| 2496 | |
| 2497 | static int cont_expand_zero(const struct kiocb *iocb, |
| 2498 | struct address_space *mapping, |
| 2499 | loff_t pos, loff_t *bytes) |
| 2500 | { |
| 2501 | struct inode *inode = mapping->host; |
| 2502 | const struct address_space_operations *aops = mapping->a_ops; |
| 2503 | unsigned int blocksize = i_blocksize(inode);
| 2504 | struct folio *folio; |
| 2505 | void *fsdata = NULL; |
| 2506 | pgoff_t index, curidx; |
| 2507 | loff_t curpos; |
| 2508 | unsigned zerofrom, offset, len; |
| 2509 | int err = 0; |
| 2510 | |
| 2511 | index = pos >> PAGE_SHIFT; |
| 2512 | offset = pos & ~PAGE_MASK; |
| 2513 | |
| 2514 | while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) { |
| 2515 | zerofrom = curpos & ~PAGE_MASK; |
| 2516 | if (zerofrom & (blocksize-1)) { |
| 2517 | *bytes |= (blocksize-1); |
| 2518 | (*bytes)++; |
| 2519 | } |
| 2520 | len = PAGE_SIZE - zerofrom; |
| 2521 | |
| 2522 | err = aops->write_begin(iocb, mapping, curpos, len, |
| 2523 | &folio, &fsdata); |
| 2524 | if (err) |
| 2525 | goto out; |
| 2526 | folio_zero_range(folio, offset_in_folio(folio, curpos), len);
| 2527 | err = aops->write_end(iocb, mapping, curpos, len, len, |
| 2528 | folio, fsdata); |
| 2529 | if (err < 0) |
| 2530 | goto out; |
| 2531 | BUG_ON(err != len); |
| 2532 | err = 0; |
| 2533 | |
| 2534 | balance_dirty_pages_ratelimited(mapping); |
| 2535 | |
| 2536 | if (fatal_signal_pending(current)) { |
| 2537 | err = -EINTR; |
| 2538 | goto out; |
| 2539 | } |
| 2540 | } |
| 2541 | |
| 2542 | /* page covers the boundary, find the boundary offset */ |
| 2543 | if (index == curidx) { |
| 2544 | zerofrom = curpos & ~PAGE_MASK; |
| 2545 | /* if we will expand the thing last block will be filled */ |
| 2546 | if (offset <= zerofrom) { |
| 2547 | goto out; |
| 2548 | } |
| 2549 | if (zerofrom & (blocksize-1)) { |
| 2550 | *bytes |= (blocksize-1); |
| 2551 | (*bytes)++; |
| 2552 | } |
| 2553 | len = offset - zerofrom; |
| 2554 | |
| 2555 | err = aops->write_begin(iocb, mapping, curpos, len, |
| 2556 | &folio, &fsdata); |
| 2557 | if (err) |
| 2558 | goto out; |
| 2559 | folio_zero_range(folio, offset_in_folio(folio, curpos), len);
| 2560 | err = aops->write_end(iocb, mapping, curpos, len, len, |
| 2561 | folio, fsdata); |
| 2562 | if (err < 0) |
| 2563 | goto out; |
| 2564 | BUG_ON(err != len); |
| 2565 | err = 0; |
| 2566 | } |
| 2567 | out: |
| 2568 | return err; |
| 2569 | } |
| 2570 | |
| 2571 | /* |
| 2572 | * For moronic filesystems that do not allow holes in files.
| 2573 | * We may have to extend the file. |
| 2574 | */ |
| 2575 | int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, |
| 2576 | loff_t pos, unsigned len, struct folio **foliop, |
| 2577 | void **fsdata, get_block_t *get_block, loff_t *bytes) |
| 2578 | { |
| 2579 | struct inode *inode = mapping->host; |
| 2580 | unsigned int blocksize = i_blocksize(inode);
| 2581 | unsigned int zerofrom; |
| 2582 | int err; |
| 2583 | |
| 2584 | err = cont_expand_zero(iocb, mapping, pos, bytes); |
| 2585 | if (err) |
| 2586 | return err; |
| 2587 | |
| 2588 | zerofrom = *bytes & ~PAGE_MASK; |
| 2589 | if (pos+len > *bytes && zerofrom & (blocksize-1)) { |
| 2590 | *bytes |= (blocksize-1); |
| 2591 | (*bytes)++; |
| 2592 | } |
| 2593 | |
| 2594 | return block_write_begin(mapping, pos, len, foliop, get_block); |
| 2595 | } |
| 2596 | EXPORT_SYMBOL(cont_write_begin); |
| 2597 | |
| 2598 | /* |
| 2599 | * block_page_mkwrite() is not allowed to change the file size as it gets |
| 2600 | * called from a page fault handler when a page is first dirtied. Hence we must |
| 2601 | * be careful to check for EOF conditions here. We set the page up correctly |
| 2602 | * for a written page which means we get ENOSPC checking when writing into |
| 2603 | * holes and correct delalloc and unwritten extent mapping on filesystems that |
| 2604 | * support these features. |
| 2605 | * |
| 2606 | * We are not allowed to take the i_rwsem here so we have to play games to |
| 2607 | * protect against truncate races as the page could now be beyond EOF. Because |
| 2608 | * truncate writes the inode size before removing pages, once we have the |
| 2609 | * page lock we can determine safely if the page is beyond EOF. If it is not |
| 2610 | * beyond EOF, then the page is guaranteed safe against truncation until we |
| 2611 | * unlock the page. |
| 2612 | * |
| 2613 | * Direct callers of this function should protect against filesystem freezing |
| 2614 | * using sb_start_pagefault() - sb_end_pagefault() functions. |
| 2615 | */ |
| 2616 | int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, |
| 2617 | get_block_t get_block) |
| 2618 | { |
| 2619 | struct folio *folio = page_folio(vmf->page); |
| 2620 | struct inode *inode = file_inode(vma->vm_file);
| 2621 | unsigned long end; |
| 2622 | loff_t size; |
| 2623 | int ret; |
| 2624 | |
| 2625 | folio_lock(folio); |
| 2626 | size = i_size_read(inode); |
| 2627 | if ((folio->mapping != inode->i_mapping) || |
| 2628 | (folio_pos(folio) >= size)) { |
| 2629 | /* We overload EFAULT to mean page got truncated */ |
| 2630 | ret = -EFAULT; |
| 2631 | goto out_unlock; |
| 2632 | } |
| 2633 | |
| 2634 | end = folio_size(folio); |
| 2635 | /* folio is wholly or partially inside EOF */ |
| 2636 | if (folio_pos(folio) + end > size) |
| 2637 | end = size - folio_pos(folio); |
| 2638 | |
| 2639 | ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
| 2640 | if (unlikely(ret)) |
| 2641 | goto out_unlock; |
| 2642 | |
| 2643 | block_commit_write(folio, 0, end); |
| 2644 | |
| 2645 | folio_mark_dirty(folio); |
| 2646 | folio_wait_stable(folio); |
| 2647 | return 0; |
| 2648 | out_unlock: |
| 2649 | folio_unlock(folio); |
| 2650 | return ret; |
| 2651 | } |
| 2652 | EXPORT_SYMBOL(block_page_mkwrite); |
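| | 
| | /*
| | * Illustrative sketch of a caller honouring the freeze protection noted
| | * above.  "demo_get_block" is hypothetical and the final conversion to
| | * a vm_fault_t via vmf_fs_error() is an assumption about the usual
| | * idiom, not something this file mandates:
| | *
| | *	static vm_fault_t demo_page_mkwrite(struct vm_fault *vmf)
| | *	{
| | *		struct inode *inode = file_inode(vmf->vma->vm_file);
| | *		int err;
| | *
| | *		sb_start_pagefault(inode->i_sb);
| | *		err = block_page_mkwrite(vmf->vma, vmf, demo_get_block);
| | *		sb_end_pagefault(inode->i_sb);
| | *		return vmf_fs_error(err);
| | *	}
| | */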
| 2653 | |
| 2654 | int block_truncate_page(struct address_space *mapping, |
| 2655 | loff_t from, get_block_t *get_block) |
| 2656 | { |
| 2657 | pgoff_t index = from >> PAGE_SHIFT; |
| 2658 | unsigned blocksize; |
| 2659 | sector_t iblock; |
| 2660 | size_t offset, length, pos; |
| 2661 | struct inode *inode = mapping->host; |
| 2662 | struct folio *folio; |
| 2663 | struct buffer_head *bh; |
| 2664 | int err = 0; |
| 2665 | |
| 2666 | blocksize = i_blocksize(inode);
| 2667 | length = from & (blocksize - 1); |
| 2668 | |
| 2669 | /* Block boundary? Nothing to do */ |
| 2670 | if (!length) |
| 2671 | return 0; |
| 2672 | |
| 2673 | length = blocksize - length; |
| 2674 | iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits; |
| 2675 | |
| 2676 | folio = filemap_grab_folio(mapping, index); |
| 2677 | if (IS_ERR(folio))
| 2678 | return PTR_ERR(folio);
| 2679 | |
| 2680 | bh = folio_buffers(folio); |
| 2681 | if (!bh) |
| 2682 | bh = create_empty_buffers(folio, blocksize, 0); |
| 2683 | |
| 2684 | /* Find the buffer that contains "offset" */ |
| 2685 | offset = offset_in_folio(folio, from); |
| 2686 | pos = blocksize; |
| 2687 | while (offset >= pos) { |
| 2688 | bh = bh->b_this_page; |
| 2689 | iblock++; |
| 2690 | pos += blocksize; |
| 2691 | } |
| 2692 | |
| 2693 | if (!buffer_mapped(bh)) { |
| 2694 | WARN_ON(bh->b_size != blocksize); |
| 2695 | err = get_block(inode, iblock, bh, 0); |
| 2696 | if (err) |
| 2697 | goto unlock; |
| 2698 | /* unmapped? It's a hole - nothing to do */ |
| 2699 | if (!buffer_mapped(bh)) |
| 2700 | goto unlock; |
| 2701 | } |
| 2702 | |
| 2703 | /* Ok, it's mapped. Make sure it's up-to-date */ |
| 2704 | if (folio_test_uptodate(folio)) |
| 2705 | set_buffer_uptodate(bh); |
| 2706 | |
| 2707 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { |
| 2708 | err = bh_read(bh, 0);
| 2709 | /* Uhhuh. Read error. Complain and punt. */ |
| 2710 | if (err < 0) |
| 2711 | goto unlock; |
| 2712 | } |
| 2713 | |
| 2714 | folio_zero_range(folio, offset, length);
| 2715 | mark_buffer_dirty(bh); |
| 2716 | |
| 2717 | unlock: |
| 2718 | folio_unlock(folio); |
| 2719 | folio_put(folio); |
| 2720 | |
| 2721 | return err; |
| 2722 | } |
| 2723 | EXPORT_SYMBOL(block_truncate_page); |
| 2724 | |
| 2725 | /* |
| 2726 | * The generic write folio function for buffer-backed address_spaces |
| 2727 | */ |
| 2728 | int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, |
| 2729 | void *get_block) |
| 2730 | { |
| 2731 | struct inode * const inode = folio->mapping->host; |
| 2732 | loff_t i_size = i_size_read(inode); |
| 2733 | |
| 2734 | /* Is the folio fully inside i_size? */ |
| 2735 | if (folio_pos(folio) + folio_size(folio) <= i_size) |
| 2736 | return __block_write_full_folio(inode, folio, get_block, wbc); |
| 2737 | |
| 2738 | /* Is the folio fully outside i_size? (truncate in progress) */ |
| 2739 | if (folio_pos(folio) >= i_size) { |
| 2740 | folio_unlock(folio); |
| 2741 | return 0; /* don't care */ |
| 2742 | } |
| 2743 | |
| 2744 | /* |
| 2745 | * The folio straddles i_size. It must be zeroed out on each and every |
| 2746 | * writeback invocation because it may be mmapped. "A file is mapped |
| 2747 | * in multiples of the page size. For a file that is not a multiple of |
| 2748 | * the page size, the remaining memory is zeroed when mapped, and |
| 2749 | * writes to that region are not written out to the file." |
| 2750 | */ |
| 2751 | folio_zero_segment(folio, offset_in_folio(folio, i_size), |
| 2752 | folio_size(folio));
| 2753 | return __block_write_full_folio(inode, folio, get_block, wbc); |
| 2754 | } |
| 2755 | |
| 2756 | sector_t generic_block_bmap(struct address_space *mapping, sector_t block, |
| 2757 | get_block_t *get_block) |
| 2758 | { |
| 2759 | struct inode *inode = mapping->host; |
| 2760 | struct buffer_head tmp = { |
| 2761 | .b_size = i_blocksize(inode),
| 2762 | }; |
| 2763 | |
| 2764 | get_block(inode, block, &tmp, 0); |
| 2765 | return tmp.b_blocknr; |
| 2766 | } |
| 2767 | EXPORT_SYMBOL(generic_block_bmap); |
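| | 
| | /*
| | * Illustrative sketch: the matching ->bmap() method is normally just a
| | * thin wrapper ("demo_get_block" being the filesystem's mapping
| | * callback):
| | *
| | *	static sector_t demo_bmap(struct address_space *mapping, sector_t block)
| | *	{
| | *		return generic_block_bmap(mapping, block, demo_get_block);
| | *	}
| | */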
| 2768 | |
| 2769 | static void end_bio_bh_io_sync(struct bio *bio) |
| 2770 | { |
| 2771 | struct buffer_head *bh = bio->bi_private; |
| 2772 | |
| 2773 | if (unlikely(bio_flagged(bio, BIO_QUIET))) |
| 2774 | set_bit(BH_Quiet, &bh->b_state);
| 2775 | |
| 2776 | bh->b_end_io(bh, !bio->bi_status); |
| 2777 | bio_put(bio); |
| 2778 | } |
| 2779 | |
| 2780 | static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, |
| 2781 | enum rw_hint write_hint, |
| 2782 | struct writeback_control *wbc) |
| 2783 | { |
| 2784 | const enum req_op op = opf & REQ_OP_MASK; |
| 2785 | struct bio *bio; |
| 2786 | |
| 2787 | BUG_ON(!buffer_locked(bh)); |
| 2788 | BUG_ON(!buffer_mapped(bh)); |
| 2789 | BUG_ON(!bh->b_end_io); |
| 2790 | BUG_ON(buffer_delay(bh)); |
| 2791 | BUG_ON(buffer_unwritten(bh)); |
| 2792 | |
| 2793 | /* |
| 2794 | * Only clear out a write error when rewriting |
| 2795 | */ |
| 2796 | if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) |
| 2797 | clear_buffer_write_io_error(bh); |
| 2798 | |
| 2799 | if (buffer_meta(bh)) |
| 2800 | opf |= REQ_META; |
| 2801 | if (buffer_prio(bh)) |
| 2802 | opf |= REQ_PRIO; |
| 2803 | |
| 2804 | bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
| 2805 | |
| 2806 | fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
| 2807 | |
| 2808 | bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
| 2809 | bio->bi_write_hint = write_hint; |
| 2810 | |
| 2811 | bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
| 2812 | |
| 2813 | bio->bi_end_io = end_bio_bh_io_sync; |
| 2814 | bio->bi_private = bh; |
| 2815 | |
| 2816 | /* Take care of bh's that straddle the end of the device */ |
| 2817 | guard_bio_eod(bio); |
| 2818 | |
| 2819 | if (wbc) { |
| 2820 | wbc_init_bio(wbc, bio); |
| 2821 | wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
| 2822 | } |
| 2823 | |
| 2824 | submit_bio(bio); |
| 2825 | } |
| 2826 | |
| 2827 | void submit_bh(blk_opf_t opf, struct buffer_head *bh) |
| 2828 | { |
| 2829 | submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
| 2830 | } |
| 2831 | EXPORT_SYMBOL(submit_bh); |
| 2832 | |
| 2833 | void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) |
| 2834 | { |
| 2835 | lock_buffer(bh); |
| 2836 | if (!test_clear_buffer_dirty(bh)) { |
| 2837 | unlock_buffer(bh); |
| 2838 | return; |
| 2839 | } |
| 2840 | bh->b_end_io = end_buffer_write_sync; |
| 2841 | get_bh(bh); |
| 2842 | submit_bh(REQ_OP_WRITE | op_flags, bh); |
| 2843 | } |
| 2844 | EXPORT_SYMBOL(write_dirty_buffer); |
| 2845 | |
| 2846 | /* |
| 2847 | * For a data-integrity writeout, we need to wait upon any in-progress I/O |
| 2848 | * and then start new I/O and then wait upon it. The caller must have a ref on |
| 2849 | * the buffer_head. |
| 2850 | */ |
| 2851 | int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) |
| 2852 | { |
| 2853 | WARN_ON(atomic_read(&bh->b_count) < 1); |
| 2854 | lock_buffer(bh); |
| 2855 | if (test_clear_buffer_dirty(bh)) { |
| 2856 | /* |
| 2857 | * The bh should be mapped, but it might not be if the |
| 2858 | * device was hot-removed. Not much we can do but fail the I/O. |
| 2859 | */ |
| 2860 | if (!buffer_mapped(bh)) { |
| 2861 | unlock_buffer(bh); |
| 2862 | return -EIO; |
| 2863 | } |
| 2864 | |
| 2865 | get_bh(bh); |
| 2866 | bh->b_end_io = end_buffer_write_sync; |
| 2867 | submit_bh(REQ_OP_WRITE | op_flags, bh); |
| 2868 | wait_on_buffer(bh); |
| 2869 | if (!buffer_uptodate(bh)) |
| 2870 | return -EIO; |
| 2871 | } else { |
| 2872 | unlock_buffer(bh); |
| 2873 | } |
| 2874 | return 0; |
| 2875 | } |
| 2876 | EXPORT_SYMBOL(__sync_dirty_buffer); |
| 2877 | |
| 2878 | int sync_dirty_buffer(struct buffer_head *bh) |
| 2879 | { |
| 2880 | return __sync_dirty_buffer(bh, REQ_SYNC); |
| 2881 | } |
| 2882 | EXPORT_SYMBOL(sync_dirty_buffer); |
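| | 
| | /*
| | * Illustrative sketch: the usual read-modify-write cycle for a metadata
| | * block, with a synchronous flush when integrity matters.  "sb",
| | * "demo_blocknr" and the modification itself are placeholders:
| | *
| | *	struct buffer_head *bh = sb_bread(sb, demo_blocknr);
| | *
| | *	if (!bh)
| | *		return -EIO;
| | *	// ... modify bh->b_data ...
| | *	mark_buffer_dirty(bh);
| | *	err = sync_dirty_buffer(bh);	// waits for the write; 0 or -EIO
| | *	brelse(bh);
| | */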
| 2883 | |
| 2884 | static inline int buffer_busy(struct buffer_head *bh) |
| 2885 | { |
| 2886 | return atomic_read(&bh->b_count) |
| 2887 | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); |
| 2888 | } |
| 2889 | |
| 2890 | static bool |
| 2891 | drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free) |
| 2892 | { |
| 2893 | struct buffer_head *head = folio_buffers(folio); |
| 2894 | struct buffer_head *bh; |
| 2895 | |
| 2896 | bh = head; |
| 2897 | do { |
| 2898 | if (buffer_busy(bh)) |
| 2899 | goto failed; |
| 2900 | bh = bh->b_this_page; |
| 2901 | } while (bh != head); |
| 2902 | |
| 2903 | do { |
| 2904 | struct buffer_head *next = bh->b_this_page; |
| 2905 | |
| 2906 | if (bh->b_assoc_map) |
| 2907 | __remove_assoc_queue(bh); |
| 2908 | bh = next; |
| 2909 | } while (bh != head); |
| 2910 | *buffers_to_free = head; |
| 2911 | folio_detach_private(folio); |
| 2912 | return true; |
| 2913 | failed: |
| 2914 | return false; |
| 2915 | } |
| 2916 | |
| 2917 | /** |
| 2918 | * try_to_free_buffers - Release buffers attached to this folio. |
| 2919 | * @folio: The folio. |
| 2920 | * |
| 2921 | * If any buffers are in use (dirty, under writeback, elevated refcount), |
| 2922 | * no buffers will be freed. |
| 2923 | * |
| 2924 | * If the folio is dirty but all the buffers are clean then we need to |
| 2925 | * be sure to mark the folio clean as well. This is because the folio |
| 2926 | * may be against a block device, and a later reattachment of buffers |
| 2927 | * to a dirty folio will set *all* buffers dirty. Which would corrupt |
| 2928 | * filesystem data on the same device. |
| 2929 | * |
| 2930 | * The same applies to regular filesystem folios: if all the buffers are |
| 2931 | * clean then we set the folio clean and proceed. To do that, we require |
| 2932 | * total exclusion from block_dirty_folio(). That is obtained with |
| 2933 | * i_private_lock. |
| 2934 | * |
| 2935 | * Exclusion against try_to_free_buffers may be obtained by either |
| 2936 | * locking the folio or by holding its mapping's i_private_lock. |
| 2937 | * |
| 2938 | * Context: Process context. @folio must be locked. Will not sleep. |
| 2939 | * Return: true if all buffers attached to this folio were freed. |
| 2940 | */ |
| 2941 | bool try_to_free_buffers(struct folio *folio) |
| 2942 | { |
| 2943 | struct address_space * const mapping = folio->mapping; |
| 2944 | struct buffer_head *buffers_to_free = NULL; |
| 2945 | bool ret = false;
| 2946 | |
| 2947 | BUG_ON(!folio_test_locked(folio)); |
| 2948 | if (folio_test_writeback(folio)) |
| 2949 | return false; |
| 2950 | |
| 2951 | if (mapping == NULL) { /* can this still happen? */ |
| 2952 | ret = drop_buffers(folio, &buffers_to_free);
| 2953 | goto out; |
| 2954 | } |
| 2955 | |
| 2956 | spin_lock(&mapping->i_private_lock);
| 2957 | ret = drop_buffers(folio, &buffers_to_free);
| 2958 | |
| 2959 | /* |
| 2960 | * If the filesystem writes its buffers by hand (eg ext3) |
| 2961 | * then we can have clean buffers against a dirty folio. We |
| 2962 | * clean the folio here; otherwise the VM will never notice |
| 2963 | * that the filesystem did any IO at all. |
| 2964 | * |
| 2965 | * Also, during truncate, discard_buffer will have marked all |
| 2966 | * the folio's buffers clean. We discover that here and clean |
| 2967 | * the folio also. |
| 2968 | * |
| 2969 | * i_private_lock must be held over this entire operation in order |
| 2970 | * to synchronise against block_dirty_folio and prevent the |
| 2971 | * dirty bit from being lost. |
| 2972 | */ |
| 2973 | if (ret) |
| 2974 | folio_cancel_dirty(folio); |
| 2975 | spin_unlock(&mapping->i_private_lock);
| 2976 | out: |
| 2977 | if (buffers_to_free) { |
| 2978 | struct buffer_head *bh = buffers_to_free; |
| 2979 | |
| 2980 | do { |
| 2981 | struct buffer_head *next = bh->b_this_page; |
| 2982 | free_buffer_head(bh); |
| 2983 | bh = next; |
| 2984 | } while (bh != buffers_to_free); |
| 2985 | } |
| 2986 | return ret; |
| 2987 | } |
| 2988 | EXPORT_SYMBOL(try_to_free_buffers); |
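| | 
| | /*
| | * Illustrative sketch: for plain buffer_head filesystems ->release_folio()
| | * commonly delegates straight to the helper above:
| | *
| | *	static bool demo_release_folio(struct folio *folio, gfp_t gfp)
| | *	{
| | *		return try_to_free_buffers(folio);
| | *	}
| | */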
| 2989 | |
| 2990 | /* |
| 2991 | * Buffer-head allocation |
| 2992 | */ |
| 2993 | static struct kmem_cache *bh_cachep __ro_after_init; |
| 2994 | |
| 2995 | /* |
| 2996 | * Once the number of bh's in the machine exceeds this level, we start |
| 2997 | * stripping them in writeback. |
| 2998 | */ |
| 2999 | static unsigned long max_buffer_heads __ro_after_init; |
| 3000 | |
| 3001 | int buffer_heads_over_limit; |
| 3002 | |
| 3003 | struct bh_accounting { |
| 3004 | int nr; /* Number of live bh's */ |
| 3005 | int ratelimit; /* Limit cacheline bouncing */ |
| 3006 | }; |
| 3007 | |
| 3008 | static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; |
| 3009 | |
| 3010 | static void recalc_bh_state(void) |
| 3011 | { |
| 3012 | int i; |
| 3013 | int tot = 0; |
| 3014 | |
| 3015 | if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096) |
| 3016 | return; |
| 3017 | __this_cpu_write(bh_accounting.ratelimit, 0); |
| 3018 | for_each_online_cpu(i) |
| 3019 | tot += per_cpu(bh_accounting, i).nr; |
| 3020 | buffer_heads_over_limit = (tot > max_buffer_heads); |
| 3021 | } |
| 3022 | |
| 3023 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) |
| 3024 | { |
| 3025 | struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); |
| 3026 | if (ret) { |
| 3027 | INIT_LIST_HEAD(&ret->b_assoc_buffers);
| 3028 | spin_lock_init(&ret->b_uptodate_lock); |
| 3029 | preempt_disable(); |
| 3030 | __this_cpu_inc(bh_accounting.nr); |
| 3031 | recalc_bh_state(); |
| 3032 | preempt_enable(); |
| 3033 | } |
| 3034 | return ret; |
| 3035 | } |
| 3036 | EXPORT_SYMBOL(alloc_buffer_head); |
| 3037 | |
| 3038 | void free_buffer_head(struct buffer_head *bh) |
| 3039 | { |
| 3040 | BUG_ON(!list_empty(&bh->b_assoc_buffers)); |
| 3041 | kmem_cache_free(bh_cachep, bh);
| 3042 | preempt_disable(); |
| 3043 | __this_cpu_dec(bh_accounting.nr); |
| 3044 | recalc_bh_state(); |
| 3045 | preempt_enable(); |
| 3046 | } |
| 3047 | EXPORT_SYMBOL(free_buffer_head); |
| 3048 | |
| 3049 | static int buffer_exit_cpu_dead(unsigned int cpu) |
| 3050 | { |
| 3051 | int i; |
| 3052 | struct bh_lru *b = &per_cpu(bh_lrus, cpu); |
| 3053 | |
| 3054 | for (i = 0; i < BH_LRU_SIZE; i++) { |
| 3055 | brelse(b->bhs[i]);
| 3056 | b->bhs[i] = NULL; |
| 3057 | } |
| 3058 | this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); |
| 3059 | per_cpu(bh_accounting, cpu).nr = 0; |
| 3060 | return 0; |
| 3061 | } |
| 3062 | |
| 3063 | /** |
| 3064 | * bh_uptodate_or_lock - Test whether the buffer is uptodate |
| 3065 | * @bh: struct buffer_head |
| 3066 | * |
| 3067 | * Return true if the buffer is up-to-date and false, |
| 3068 | * with the buffer locked, if not. |
| 3069 | */ |
| 3070 | int bh_uptodate_or_lock(struct buffer_head *bh) |
| 3071 | { |
| 3072 | if (!buffer_uptodate(bh)) { |
| 3073 | lock_buffer(bh); |
| 3074 | if (!buffer_uptodate(bh)) |
| 3075 | return 0; |
| 3076 | unlock_buffer(bh); |
| 3077 | } |
| 3078 | return 1; |
| 3079 | } |
| 3080 | EXPORT_SYMBOL(bh_uptodate_or_lock); |
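| | 
| | /*
| | * Usage note (sketch): paired with __bh_read() below this forms the
| | * common "read only if not already uptodate" pattern - essentially what
| | * the bh_read() inline in <linux/buffer_head.h> does:
| | *
| | *	if (!bh_uptodate_or_lock(bh))
| | *		err = __bh_read(bh, 0, true);	// bh is locked at this point
| | */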
| 3081 | |
| 3082 | /** |
| 3083 | * __bh_read - Submit read for a locked buffer |
| 3084 | * @bh: struct buffer_head |
| 3085 | * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ |
| 3086 | * @wait: wait until the read finishes
| 3087 | *
| 3088 | * Returns zero on success (or when not waiting), and -EIO on error.
| 3089 | */ |
| 3090 | int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait) |
| 3091 | { |
| 3092 | int ret = 0; |
| 3093 | |
| 3094 | BUG_ON(!buffer_locked(bh)); |
| 3095 | |
| 3096 | get_bh(bh); |
| 3097 | bh->b_end_io = end_buffer_read_sync; |
| 3098 | submit_bh(REQ_OP_READ | op_flags, bh); |
| 3099 | if (wait) { |
| 3100 | wait_on_buffer(bh); |
| 3101 | if (!buffer_uptodate(bh)) |
| 3102 | ret = -EIO; |
| 3103 | } |
| 3104 | return ret; |
| 3105 | } |
| 3106 | EXPORT_SYMBOL(__bh_read); |
| 3107 | |
| 3108 | /** |
| 3109 | * __bh_read_batch - Submit read for a batch of unlocked buffers |
| 3110 | * @nr: number of entries in the buffer batch
| 3111 | * @bhs: a batch of struct buffer_head
| 3112 | * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
| 3113 | * @force_lock: force to get a lock on the buffer if set, otherwise skip any
| 3114 | * buffer that cannot be locked.
| 3115 | *
| 3116 | * The reads are submitted asynchronously; no status is returned.
| 3117 | */ |
| 3118 | void __bh_read_batch(int nr, struct buffer_head *bhs[], |
| 3119 | blk_opf_t op_flags, bool force_lock) |
| 3120 | { |
| 3121 | int i; |
| 3122 | |
| 3123 | for (i = 0; i < nr; i++) { |
| 3124 | struct buffer_head *bh = bhs[i]; |
| 3125 | |
| 3126 | if (buffer_uptodate(bh)) |
| 3127 | continue; |
| 3128 | |
| 3129 | if (force_lock) |
| 3130 | lock_buffer(bh); |
| 3131 | else |
| 3132 | if (!trylock_buffer(bh)) |
| 3133 | continue; |
| 3134 | |
| 3135 | if (buffer_uptodate(bh)) { |
| 3136 | unlock_buffer(bh); |
| 3137 | continue; |
| 3138 | } |
| 3139 | |
| 3140 | bh->b_end_io = end_buffer_read_sync; |
| 3141 | get_bh(bh); |
| 3142 | submit_bh(REQ_OP_READ | op_flags, bh); |
| 3143 | } |
| 3144 | } |
| 3145 | EXPORT_SYMBOL(__bh_read_batch); |
| 3146 | |
| 3147 | void __init buffer_init(void) |
| 3148 | { |
| 3149 | unsigned long nrpages; |
| 3150 | int ret; |
| 3151 | |
| 3152 | bh_cachep = KMEM_CACHE(buffer_head, |
| 3153 | SLAB_RECLAIM_ACCOUNT|SLAB_PANIC); |
| 3154 | /* |
| 3155 | * Limit the bh occupancy to 10% of ZONE_NORMAL |
| 3156 | */ |
| 3157 | nrpages = (nr_free_buffer_pages() * 10) / 100; |
| 3158 | max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); |
| 3159 | ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
| 3160 | NULL, buffer_exit_cpu_dead);
| 3161 | WARN_ON(ret < 0); |
| 3162 | } |
| 3163 | |