mpage.c source code [Linux/fs/mpage.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* fs/mpage.c
4	*
5	* Copyright (C) 2002, Linus Torvalds.
6	*
7	* Contains functions related to preparing and submitting BIOs which contain
8	* multiple pagecache pages.
9	*
10	* 15May2002 Andrew Morton
11	* Initial version
12	* 27Jun2002 axboe@suse.de
13	* use bio_add_page() to build bio's just the right size
14	*/
15
16	#include <linux/kernel.h>
17	#include <linux/export.h>
18	#include <linux/mm.h>
19	#include <linux/kdev_t.h>
20	#include <linux/gfp.h>
21	#include <linux/bio.h>
22	#include <linux/fs.h>
23	#include <linux/buffer_head.h>
24	#include <linux/blkdev.h>
25	#include <linux/highmem.h>
26	#include <linux/prefetch.h>
27	#include <linux/mpage.h>
28	#include <linux/mm_inline.h>
29	#include <linux/writeback.h>
30	#include <linux/backing-dev.h>
31	#include <linux/pagevec.h>
32	#include "internal.h"
33
34	/*
35	* I/O completion handler for multipage BIOs.
36	*
37	* The mpage code never puts partial pages into a BIO (except for end-of-file).
38	* If a page does not map to a contiguous run of blocks then it simply falls
39	* back to block_read_full_folio().
40	*
41	* Why is this? If a page's completion depends on a number of different BIOs
42	* which can complete in any order (or at the same time) then determining the
43	* status of that page is hard. See end_buffer_async_read() for the details.
44	* There is no point in duplicating all that complexity.
45	*/
46	static void mpage_read_end_io(struct bio *bio)
47	{
48	struct folio_iter fi;
49	int err = blk_status_to_errno(status: bio->bi_status);
50
51	bio_for_each_folio_all(fi, bio)
52	folio_end_read(folio: fi.folio, success: err == `0`);
53
54	bio_put(bio);
55	}
56
57	static void mpage_write_end_io(struct bio *bio)
58	{
59	struct folio_iter fi;
60	int err = blk_status_to_errno(status: bio->bi_status);
61
62	bio_for_each_folio_all(fi, bio) {
63	if (err)
64	mapping_set_error(mapping: fi.folio->mapping, error: err);
65	folio_end_writeback(folio: fi.folio);
66	}
67
68	bio_put(bio);
69	}
70
71	static struct bio mpage_bio_submit_read(struct* bio *bio)
72	{
73	bio->bi_end_io = mpage_read_end_io;
74	guard_bio_eod(bio);
75	submit_bio(bio);
76	return NULL;
77	}
78
79	static struct bio mpage_bio_submit_write(struct* bio *bio)
80	{
81	bio->bi_end_io = mpage_write_end_io;
82	guard_bio_eod(bio);
83	submit_bio(bio);
84	return NULL;
85	}
86
87	/*
88	* support function for mpage_readahead. The fs supplied get_block might
89	* return an up to date buffer. This is used to map that buffer into
90	* the page, which allows read_folio to avoid triggering a duplicate call
91	* to get_block.
92	*
93	* The idea is to avoid adding buffers to pages that don't already have
94	* them. So when the buffer is up to date and the page size == block size,
95	* this marks the page up to date instead of adding new buffers.
96	*/
97	static void map_buffer_to_folio(struct folio folio, struct* buffer_head *bh,
98	int page_block)
99	{
100	struct inode *inode = folio->mapping->host;
101	struct buffer_head page_bh, head;
102	int block = `0`;
103
104	head = folio_buffers(folio);
105	if (!head) {
106	/*
107	* don't make any buffers if there is only one buffer on
108	* the folio and the folio just needs to be set up to date
109	*/
110	if (inode->i_blkbits == folio_shift(folio) &&
111	buffer_uptodate(bh)) {
112	folio_mark_uptodate(folio);
113	return;
114	}
115	head = create_empty_buffers(folio, blocksize: i_blocksize(node: inode), b_state: `0`);
116	}
117
118	page_bh = head;
119	do {
120	if (block == page_block) {
121	page_bh->b_state = bh->b_state;
122	page_bh->b_bdev = bh->b_bdev;
123	page_bh->b_blocknr = bh->b_blocknr;
124	break;
125	}
126	page_bh = page_bh->b_this_page;
127	block++;
128	} while (page_bh != head);
129	}
130
131	struct mpage_readpage_args {
132	struct bio *bio;
133	struct folio *folio;
134	unsigned int nr_pages;
135	bool is_readahead;
136	sector_t last_block_in_bio;
137	struct buffer_head map_bh;
138	unsigned long first_logical_block;
139	get_block_t *get_block;
140	};
141
142	/*
143	* This is the worker routine which does all the work of mapping the disk
144	* blocks and constructs largest possible bios, submits them for IO if the
145	* blocks are not contiguous on the disk.
146	*
147	* We pass a buffer_head back and forth and use its buffer_mapped() flag to
148	* represent the validity of its disk mapping and to decide when to do the next
149	* get_block() call.
150	*/
151	static void do_mpage_readpage(struct mpage_readpage_args *args)
152	{
153	struct folio *folio = args->folio;
154	struct inode *inode = folio->mapping->host;
155	const unsigned blkbits = inode->i_blkbits;
156	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
157	const unsigned blocksize = `1` << blkbits;
158	struct buffer_head *map_bh = &args->map_bh;
159	sector_t block_in_file;
160	sector_t last_block;
161	sector_t last_block_in_file;
162	sector_t first_block;
163	unsigned page_block;
164	unsigned first_hole = blocks_per_folio;
165	struct block_device *bdev = NULL;
166	int length;
167	int fully_mapped = `1`;
168	blk_opf_t opf = REQ_OP_READ;
169	unsigned nblocks;
170	unsigned relative_block;
171	gfp_t gfp = mapping_gfp_constraint(mapping: folio->mapping, GFP_KERNEL);
172
173	if (args->is_readahead) {
174	opf \|= REQ_RAHEAD;
175	gfp \|= __GFP_NORETRY \| __GFP_NOWARN;
176	}
177
178	if (folio_buffers(folio))
179	goto confused;
180
181	block_in_file = folio_pos(folio) >> blkbits;
182	last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits);
183	last_block_in_file = (i_size_read(inode) + blocksize - `1`) >> blkbits;
184	if (last_block > last_block_in_file)
185	last_block = last_block_in_file;
186	page_block = `0`;
187
188	/*
189	* Map blocks using the result from the previous get_blocks call first.
190	*/
191	nblocks = map_bh->b_size >> blkbits;
192	if (buffer_mapped(bh: map_bh) &&
193	block_in_file > args->first_logical_block &&
194	block_in_file < (args->first_logical_block + nblocks)) {
195	unsigned map_offset = block_in_file - args->first_logical_block;
196	unsigned last = nblocks - map_offset;
197
198	first_block = map_bh->b_blocknr + map_offset;
199	for (relative_block = `0`; ; relative_block++) {
200	if (relative_block == last) {
201	clear_buffer_mapped(bh: map_bh);
202	break;
203	}
204	if (page_block == blocks_per_folio)
205	break;
206	page_block++;
207	block_in_file++;
208	}
209	bdev = map_bh->b_bdev;
210	}
211
212	/*
213	* Then do more get_blocks calls until we are done with this folio.
214	*/
215	map_bh->b_folio = folio;
216	while (page_block < blocks_per_folio) {
217	map_bh->b_state = `0`;
218	map_bh->b_size = `0`;
219
220	if (block_in_file < last_block) {
221	map_bh->b_size = (last_block-block_in_file) << blkbits;
222	if (args->get_block(inode, block_in_file, map_bh, `0`))
223	goto confused;
224	args->first_logical_block = block_in_file;
225	}
226
227	if (!buffer_mapped(bh: map_bh)) {
228	fully_mapped = `0`;
229	if (first_hole == blocks_per_folio)
230	first_hole = page_block;
231	page_block++;
232	block_in_file++;
233	continue;
234	}
235
236	/ some filesystems will copy data into the page during*
237	* the get_block call, in which case we don't want to
238	* read it again. map_buffer_to_folio copies the data
239	* we just collected from get_block into the folio's buffers
240	* so read_folio doesn't have to repeat the get_block call
241	*/
242	if (buffer_uptodate(bh: map_bh)) {
243	map_buffer_to_folio(folio, bh: map_bh, page_block);
244	goto confused;
245	}
246
247	if (first_hole != blocks_per_folio)
248	goto confused; / hole -> non-hole /
249
250	/ Contiguous blocks? /
251	if (!page_block)
252	first_block = map_bh->b_blocknr;
253	else if (first_block + page_block != map_bh->b_blocknr)
254	goto confused;
255	nblocks = map_bh->b_size >> blkbits;
256	for (relative_block = `0`; ; relative_block++) {
257	if (relative_block == nblocks) {
258	clear_buffer_mapped(bh: map_bh);
259	break;
260	} else if (page_block == blocks_per_folio)
261	break;
262	page_block++;
263	block_in_file++;
264	}
265	bdev = map_bh->b_bdev;
266	}
267
268	if (first_hole != blocks_per_folio) {
269	folio_zero_segment(folio, start: first_hole << blkbits, xend: folio_size(folio));
270	if (first_hole == `0`) {
271	folio_mark_uptodate(folio);
272	folio_unlock(folio);
273	goto out;
274	}
275	} else if (fully_mapped) {
276	folio_set_mappedtodisk(folio);
277	}
278
279	/*
280	* This folio will go to BIO. Do we need to send this BIO off first?
281	*/
282	if (args->bio && (args->last_block_in_bio != first_block - `1`))
283	args->bio = mpage_bio_submit_read(bio: args->bio);
284
285	alloc_new:
286	if (args->bio == NULL) {
287	args->bio = bio_alloc(bdev, nr_vecs: bio_max_segs(nr_segs: args->nr_pages), opf,
288	gfp_mask: gfp);
289	if (args->bio == NULL)
290	goto confused;
291	args->bio->bi_iter.bi_sector = first_block << (blkbits - `9`);
292	}
293
294	length = first_hole << blkbits;
295	if (!bio_add_folio(bio: args->bio, folio, len: length, off: `0`)) {
296	args->bio = mpage_bio_submit_read(bio: args->bio);
297	goto alloc_new;
298	}
299
300	relative_block = block_in_file - args->first_logical_block;
301	nblocks = map_bh->b_size >> blkbits;
302	if ((buffer_boundary(bh: map_bh) && relative_block == nblocks) \|\|
303	(first_hole != blocks_per_folio))
304	args->bio = mpage_bio_submit_read(bio: args->bio);
305	else
306	args->last_block_in_bio = first_block + blocks_per_folio - `1`;
307	out:
308	return;
309
310	confused:
311	if (args->bio)
312	args->bio = mpage_bio_submit_read(bio: args->bio);
313	if (!folio_test_uptodate(folio))
314	block_read_full_folio(folio, args->get_block);
315	else
316	folio_unlock(folio);
317	goto out;
318	}
319
320	/**
321	* mpage_readahead - start reads against pages
322	* @rac: Describes which pages to read.
323	* @get_block: The filesystem's block mapper function.
324	*
325	* This function walks the pages and the blocks within each page, building and
326	* emitting large BIOs.
327	*
328	* If anything unusual happens, such as:
329	*
330	* - encountering a page which has buffers
331	* - encountering a page which has a non-hole after a hole
332	* - encountering a page with non-contiguous blocks
333	*
334	* then this code just gives up and calls the buffer_head-based read function.
335	* It does handle a page which has holes at the end - that is a common case:
336	* the end-of-file on blocksize < PAGE_SIZE setups.
337	*
338	* BH_Boundary explanation:
339	*
340	* There is a problem. The mpage read code assembles several pages, gets all
341	* their disk mappings, and then submits them all. That's fine, but obtaining
342	* the disk mappings may require I/O. Reads of indirect blocks, for example.
343	*
344	* So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
345	* submitted in the following order:
346	*
347	* 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
348	*
349	* because the indirect block has to be read to get the mappings of blocks
350	* 13,14,15,16. Obviously, this impacts performance.
351	*
352	* So what we do it to allow the filesystem's get_block() function to set
353	* BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block
354	* after this one will require I/O against a block which is probably close to
355	* this one. So you should push what I/O you have currently accumulated.
356	*
357	* This all causes the disk requests to be issued in the correct order.
358	*/
359	void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
360	{
361	struct folio *folio;
362	struct mpage_readpage_args args = {
363	.get_block = get_block,
364	.is_readahead = true,
365	};
366
367	while ((folio = readahead_folio(ractl: rac))) {
368	prefetchw(x: &folio->flags);
369	args.folio = folio;
370	args.nr_pages = readahead_count(rac);
371	do_mpage_readpage(args: &args);
372	/*
373	* If read ahead failed synchronously, it may cause by removed
374	* device, or some filesystem metadata error.
375	*/
376	if (!folio_test_locked(folio) && !folio_test_uptodate(folio))
377	break;
378	}
379	if (args.bio)
380	mpage_bio_submit_read(bio: args.bio);
381	}
382	EXPORT_SYMBOL(mpage_readahead);
383
384	/*
385	* This isn't called much at all
386	*/
387	int mpage_read_folio(struct folio *folio, get_block_t get_block)
388	{
389	struct mpage_readpage_args args = {
390	.folio = folio,
391	.nr_pages = folio_nr_pages(folio),
392	.get_block = get_block,
393	};
394
395	do_mpage_readpage(args: &args);
396	if (args.bio)
397	mpage_bio_submit_read(bio: args.bio);
398	return `0`;
399	}
400	EXPORT_SYMBOL(mpage_read_folio);
401
402	/*
403	* Writing is not so simple.
404	*
405	* If the page has buffers then they will be used for obtaining the disk
406	* mapping. We only support pages which are fully mapped-and-dirty, with a
407	* special case for pages which are unmapped at the end: end-of-file.
408	*
409	* If the page has no buffers (preferred) then the page is mapped here.
410	*
411	* If all blocks are found to be contiguous then the page can go into the
412	* BIO. Otherwise fall back to the mapping's writepage().
413	*
414	* FIXME: This code wants an estimate of how many pages are still to be
415	* written, so it can intelligently allocate a suitably-sized BIO. For now,
416	* just allocate full-size (16-page) BIOs.
417	*/
418
419	struct mpage_data {
420	struct bio *bio;
421	sector_t last_block_in_bio;
422	get_block_t *get_block;
423	};
424
425	/*
426	* We have our BIO, so we can now mark the buffers clean. Make
427	* sure to only clean buffers which we know we'll be writing.
428	*/
429	static void clean_buffers(struct folio folio, unsigned* first_unmapped)
430	{
431	unsigned buffer_counter = `0`;
432	struct buffer_head bh, head = folio_buffers(folio);
433
434	if (!head)
435	return;
436	bh = head;
437
438	do {
439	if (buffer_counter++ == first_unmapped)
440	break;
441	clear_buffer_dirty(bh);
442	bh = bh->b_this_page;
443	} while (bh != head);
444
445	/*
446	* we cannot drop the bh if the page is not uptodate or a concurrent
447	* read_folio would fail to serialize with the bh and it would read from
448	* disk before we reach the platter.
449	*/
450	if (buffer_heads_over_limit && folio_test_uptodate(folio))
451	try_to_free_buffers(folio);
452	}
453
454	static int mpage_write_folio(struct writeback_control wbc, struct* folio *folio,
455	struct mpage_data *mpd)
456	{
457	struct bio *bio = mpd->bio;
458	struct address_space *mapping = folio->mapping;
459	struct inode *inode = mapping->host;
460	const unsigned blkbits = inode->i_blkbits;
461	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
462	sector_t last_block;
463	sector_t block_in_file;
464	sector_t first_block;
465	unsigned page_block;
466	unsigned first_unmapped = blocks_per_folio;
467	struct block_device *bdev = NULL;
468	int boundary = `0`;
469	sector_t boundary_block = `0`;
470	struct block_device *boundary_bdev = NULL;
471	size_t length;
472	struct buffer_head map_bh;
473	loff_t i_size = i_size_read(inode);
474	int ret = `0`;
475	struct buffer_head *head = folio_buffers(folio);
476
477	if (head) {
478	struct buffer_head *bh = head;
479
480	/ If they're all mapped and dirty, do it /
481	page_block = `0`;
482	do {
483	BUG_ON(buffer_locked(bh));
484	if (!buffer_mapped(bh)) {
485	/*
486	* unmapped dirty buffers are created by
487	* block_dirty_folio -> mmapped data
488	*/
489	if (buffer_dirty(bh))
490	goto confused;
491	if (first_unmapped == blocks_per_folio)
492	first_unmapped = page_block;
493	continue;
494	}
495
496	if (first_unmapped != blocks_per_folio)
497	goto confused; / hole -> non-hole /
498
499	if (!buffer_dirty(bh) \|\| !buffer_uptodate(bh))
500	goto confused;
501	if (page_block) {
502	if (bh->b_blocknr != first_block + page_block)
503	goto confused;
504	} else {
505	first_block = bh->b_blocknr;
506	}
507	page_block++;
508	boundary = buffer_boundary(bh);
509	if (boundary) {
510	boundary_block = bh->b_blocknr;
511	boundary_bdev = bh->b_bdev;
512	}
513	bdev = bh->b_bdev;
514	} while ((bh = bh->b_this_page) != head);
515
516	if (first_unmapped)
517	goto page_is_mapped;
518
519	/*
520	* Page has buffers, but they are all unmapped. The page was
521	* created by pagein or read over a hole which was handled by
522	* block_read_full_folio(). If this address_space is also
523	* using mpage_readahead then this can rarely happen.
524	*/
525	goto confused;
526	}
527
528	/*
529	* The page has no buffers: map it to disk
530	*/
531	BUG_ON(!folio_test_uptodate(folio));
532	block_in_file = folio_pos(folio) >> blkbits;
533	/*
534	* Whole page beyond EOF? Skip allocating blocks to avoid leaking
535	* space.
536	*/
537	if (block_in_file >= (i_size + (`1` << blkbits) - `1`) >> blkbits)
538	goto page_is_mapped;
539	last_block = (i_size - `1`) >> blkbits;
540	map_bh.b_folio = folio;
541	for (page_block = `0`; page_block < blocks_per_folio; ) {
542
543	map_bh.b_state = `0`;
544	map_bh.b_size = `1` << blkbits;
545	if (mpd->get_block(inode, block_in_file, &map_bh, `1`))
546	goto confused;
547	if (!buffer_mapped(bh: &map_bh))
548	goto confused;
549	if (buffer_new(bh: &map_bh))
550	clean_bdev_bh_alias(bh: &map_bh);
551	if (buffer_boundary(bh: &map_bh)) {
552	boundary_block = map_bh.b_blocknr;
553	boundary_bdev = map_bh.b_bdev;
554	}
555	if (page_block) {
556	if (map_bh.b_blocknr != first_block + page_block)
557	goto confused;
558	} else {
559	first_block = map_bh.b_blocknr;
560	}
561	page_block++;
562	boundary = buffer_boundary(bh: &map_bh);
563	bdev = map_bh.b_bdev;
564	if (block_in_file == last_block)
565	break;
566	block_in_file++;
567	}
568	BUG_ON(page_block == `0`);
569
570	first_unmapped = page_block;
571
572	page_is_mapped:
573	/ Don't bother writing beyond EOF, truncate will discard the folio /
574	if (folio_pos(folio) >= i_size)
575	goto confused;
576	length = folio_size(folio);
577	if (folio_pos(folio) + length > i_size) {
578	/*
579	* The page straddles i_size. It must be zeroed out on each
580	* and every writepage invocation because it may be mmapped.
581	* "A file is mapped in multiples of the page size. For a file
582	* that is not a multiple of the page size, the remaining memory
583	* is zeroed when mapped, and writes to that region are not
584	* written out to the file."
585	*/
586	length = i_size - folio_pos(folio);
587	folio_zero_segment(folio, start: length, xend: folio_size(folio));
588	}
589
590	/*
591	* This page will go to BIO. Do we need to send this BIO off first?
592	*/
593	if (bio && mpd->last_block_in_bio != first_block - `1`)
594	bio = mpage_bio_submit_write(bio);
595
596	alloc_new:
597	if (bio == NULL) {
598	bio = bio_alloc(bdev, BIO_MAX_VECS,
599	opf: REQ_OP_WRITE \| wbc_to_write_flags(wbc),
600	GFP_NOFS);
601	bio->bi_iter.bi_sector = first_block << (blkbits - `9`);
602	wbc_init_bio(wbc, bio);
603	bio->bi_write_hint = inode->i_write_hint;
604	}
605
606	/*
607	* Must try to add the page before marking the buffer clean or
608	* the confused fail path above (OOM) will be very confused when
609	* it finds all bh marked clean (i.e. it will not write anything)
610	*/
611	wbc_account_cgroup_owner(wbc, folio, bytes: folio_size(folio));
612	length = first_unmapped << blkbits;
613	if (!bio_add_folio(bio, folio, len: length, off: `0`)) {
614	bio = mpage_bio_submit_write(bio);
615	goto alloc_new;
616	}
617
618	clean_buffers(folio, first_unmapped);
619
620	BUG_ON(folio_test_writeback(folio));
621	folio_start_writeback(folio);
622	folio_unlock(folio);
623	if (boundary \|\| (first_unmapped != blocks_per_folio)) {
624	bio = mpage_bio_submit_write(bio);
625	if (boundary_block) {
626	write_boundary_block(bdev: boundary_bdev,
627	bblock: boundary_block, blocksize: `1` << blkbits);
628	}
629	} else {
630	mpd->last_block_in_bio = first_block + blocks_per_folio - `1`;
631	}
632	goto out;
633
634	confused:
635	if (bio)
636	bio = mpage_bio_submit_write(bio);
637
638	/*
639	* The caller has a ref on the inode, so *mapping is stable
640	*/
641	ret = block_write_full_folio(folio, wbc, get_block: mpd->get_block);
642	mapping_set_error(mapping, error: ret);
643	out:
644	mpd->bio = bio;
645	return ret;
646	}
647
648	/**
649	* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
650	* @mapping: address space structure to write
651	* @wbc: subtract the number of written pages from *@wbc->nr_to_write
652	* @get_block: the filesystem's block mapper function.
653	*
654	* This is a library function, which implements the writepages()
655	* address_space_operation.
656	*/
657	int
658	mpage_writepages(struct address_space *mapping,
659	struct writeback_control *wbc, get_block_t get_block)
660	{
661	struct mpage_data mpd = {
662	.get_block = get_block,
663	};
664	struct folio *folio = NULL;
665	struct blk_plug plug;
666	int error;
667
668	blk_start_plug(&plug);
669	while ((folio = writeback_iter(mapping, wbc, folio, error: &error)))
670	error = mpage_write_folio(wbc, folio, mpd: &mpd);
671	if (mpd.bio)
672	mpage_bio_submit_write(bio: mpd.bio);
673	blk_finish_plug(&plug);
674	return error;
675	}
676	EXPORT_SYMBOL(mpage_writepages);
677

Browse the source code of Linux/fs/mpage.c