// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level buffered write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include "internal.h"

static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
	if (netfs_group)
		folio_attach_private(folio, netfs_get_group(netfs_group));
}

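/*
 * Attach the given write grouping to a folio, replacing a "copy to cache"
 * marker if that's all that's there; with no grouping, just clear such a
 * marker.  A folio already carrying other private data (a different group or
 * a streaming-write record) is left untouched.
 */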
static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
	void *priv = folio_get_private(folio);

	if (unlikely(priv != netfs_group)) {
		if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
			folio_attach_private(folio, netfs_get_group(netfs_group));
		else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
			folio_detach_private(folio);
	}
}

/*
 * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
 * as possible to hold as much of the remaining length as possible in one go.
 */
static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
						loff_t pos, size_t part)
{
	pgoff_t index = pos / PAGE_SIZE;
	fgf_t fgp_flags = FGP_WRITEBEGIN;

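	/* If large folios are supported, ask for one big enough to cover the
	 * offset into the first page plus as much of the remaining length as
	 * possible.
	 */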
	if (mapping_large_folio_support(mapping))
		fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);

	return __filemap_get_folio(mapping, index, fgp_flags,
				   mapping_gfp_mask(mapping));
}

/*
 * Update i_size and estimate the update to i_blocks to reflect the additional
 * data written into the pagecache until we can find out from the server what
 * the values actually are.
 */
void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
			 loff_t pos, size_t copied)
{
	loff_t i_size, end = pos + copied;
	blkcnt_t add;
	size_t gap;

	if (end <= i_size_read(inode))
		return;

	if (ctx->ops->update_i_size) {
		ctx->ops->update_i_size(inode, end);
		return;
	}

	spin_lock(&inode->i_lock);

	i_size = i_size_read(inode);
	if (end > i_size) {
		i_size_write(inode, end);
#if IS_ENABLED(CONFIG_FSCACHE)
		fscache_update_cookie(ctx->cache, NULL, &end);
#endif

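		/* Estimate how many 512-byte blocks the copy added beyond the
		 * part of the final block already covered by the old i_size,
		 * capping i_blocks at the count implied by the new size.
		 */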
		gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
		if (copied > gap) {
			add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);

			inode->i_blocks = min_t(blkcnt_t,
						DIV_ROUND_UP(end, SECTOR_SIZE),
						inode->i_blocks + add);
		}
	}
	spin_unlock(&inode->i_lock);
}

/**
 * netfs_perform_write - Copy data into the pagecache.
 * @iocb: The operation parameters
 * @iter: The source buffer
 * @netfs_group: Grouping for dirty folios (e.g. ceph snaps).
 *
 * Copy data into pagecache folios attached to the inode specified by @iocb.
 * The caller must hold appropriate inode locks.
 *
 * Dirty folios that aren't completely up to date are tagged with a
 * netfs_folio struct to indicate the range modified.  Dirty folios may also
 * be tagged with a netfs-specific grouping such that data from an old group
 * gets flushed before a new one is started.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
			    struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct netfs_inode *ctx = netfs_inode(inode);
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.for_sync	= true,
		.nr_to_write	= LONG_MAX,
		.range_start	= iocb->ki_pos,
		.range_end	= iocb->ki_pos + iter->count,
	};
	struct netfs_io_request *wreq = NULL;
	struct folio *folio = NULL, *writethrough = NULL;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
	ssize_t written = 0, ret, ret2;
	loff_t pos = iocb->ki_pos;
	size_t max_chunk = mapping_max_folio_size(mapping);
	bool maybe_trouble = false;

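	/* For O_SYNC/O_DSYNC writes, use write-through mode: flush any
	 * preexisting dirty data in the target range first, then queue each
	 * folio for writeback as soon as it has been modified.
	 */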
	if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))) {
		wbc_attach_fdatawrite_inode(&wbc, mapping->host);

		ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
		if (ret < 0) {
			wbc_detach_inode(&wbc);
			goto out;
		}

		wreq = netfs_begin_writethrough(iocb, iter->count);
		if (IS_ERR(wreq)) {
			wbc_detach_inode(&wbc);
			ret = PTR_ERR(wreq);
			wreq = NULL;
			goto out;
		}
		if (!is_sync_kiocb(iocb))
			wreq->iocb = iocb;
		netfs_stat(&netfs_n_wh_writethrough);
	} else {
		netfs_stat(&netfs_n_wh_buffered_write);
	}
| 152 | do { | 
|---|
| 153 | struct netfs_folio *finfo; | 
|---|
| 154 | struct netfs_group *group; | 
|---|
| 155 | unsigned long long fpos; | 
|---|
| 156 | size_t flen; | 
|---|
| 157 | size_t offset;	/* Offset into pagecache folio */ | 
|---|
| 158 | size_t part;	/* Bytes to write to folio */ | 
|---|
| 159 | size_t copied;	/* Bytes copied from user */ | 
|---|
| 160 |  | 
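		/* Limit each pass to at most one maximum-size folio's worth
		 * of data, aligned to the maximum folio size.
		 */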
		offset = pos & (max_chunk - 1);
		part = min(max_chunk - offset, iov_iter_count(iter));

		/* Bring in the user pages that we will copy from _first_ lest
		 * we hit a nasty deadlock on copying from the same page as
		 * we're writing to, without it being marked uptodate.
		 *
		 * Not only is this an optimisation, but it is also required to
		 * check that the address is actually valid, when atomic
		 * usercopies are used below.
		 *
		 * We rely on the page being held onto long enough by the LRU
		 * that we can grab it below if this causes it to be read.
		 */
		ret = -EFAULT;
		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
			break;

		folio = netfs_grab_folio_for_write(mapping, pos, part);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		flen = folio_size(folio);
		fpos = folio_pos(folio);
		offset = pos - fpos;
		part = min_t(size_t, flen - offset, part);

		/* Wait for writeback to complete.  The writeback engine owns
		 * the info in folio->private and may change it until it
		 * removes the WB mark.
		 */
		if (folio_get_private(folio) &&
		    folio_wait_writeback_killable(folio)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		if (signal_pending(current)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		/* Decide how we should modify a folio.  We might be attempting
		 * to do write-streaming, in which case we don't want to do a
		 * local RMW cycle if we can avoid it.  If we're doing local
		 * caching or content crypto, we award that priority over
		 * avoiding RMW.  If the file is open readably, then we also
		 * assume that we may want to read what we wrote.
		 */
		finfo = netfs_folio_info(folio);
		group = netfs_folio_group(folio);

		if (unlikely(group != netfs_group) &&
		    group != NETFS_FOLIO_COPY_TO_CACHE)
			goto flush_content;

		if (folio_test_uptodate(folio)) {
			if (mapping_writably_mapped(mapping))
				flush_dcache_folio(folio);
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			netfs_set_group(folio, netfs_group);
			trace_netfs_folio(folio, netfs_folio_is_uptodate);
			goto copied;
		}

		/* If the page is above the zero-point then we assume that the
		 * server would just return a block of zeros or a short read if
		 * we try to read it.
		 */
		if (fpos >= ctx->zero_point) {
			folio_zero_segment(folio, 0, offset);
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			folio_zero_segment(folio, offset + copied, flen);
			__netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			trace_netfs_folio(folio, netfs_modify_and_clear);
			goto copied;
		}

		/* See if we can write a whole folio in one go. */
		if (!maybe_trouble && offset == 0 && part >= flen) {
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
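			/* A short copy means the folio can't be completely
			 * filled and so can't be marked uptodate.  Back out,
			 * drop the folio and retry without the whole-folio
			 * shortcut.
			 */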
			if (unlikely(copied < part)) {
				maybe_trouble = true;
				iov_iter_revert(iter, copied);
				copied = 0;
				folio_unlock(folio);
				goto retry;
			}
			__netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			trace_netfs_folio(folio, netfs_whole_folio_modify);
			goto copied;
		}

		/* We don't want to do a streaming write on a file that loses
		 * caching service temporarily because the backing store got
		 * culled, and we don't really want to do a streaming write on
		 * a file that's open for reading, as ->read_folio() then has
		 * to be able to flush it.
		 */
		if ((file->f_mode & FMODE_READ) ||
		    netfs_is_cache_enabled(ctx)) {
			if (finfo) {
				netfs_stat(&netfs_n_wh_wstream_conflict);
				goto flush_content;
			}
			ret = netfs_prefetch_for_write(file, folio, offset, part);
			if (ret < 0) {
				_debug("prefetch = %zd", ret);
				goto error_folio_unlock;
			}
			/* Note that copy-to-cache may have been set. */

			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			netfs_set_group(folio, netfs_group);
			trace_netfs_folio(folio, netfs_just_prefetch);
			goto copied;
		}

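		/* Neither cacheable nor open for reading: do a streaming
		 * write, recording just the span actually modified in a
		 * netfs_folio record attached to folio->private rather than
		 * reading in the rest of the folio.
		 */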
		if (!finfo) {
			ret = -EIO;
			if (WARN_ON(folio_get_private(folio)))
				goto error_folio_unlock;
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			if (offset == 0 && copied == flen) {
				__netfs_set_group(folio, netfs_group);
				folio_mark_uptodate(folio);
				trace_netfs_folio(folio, netfs_streaming_filled_page);
				goto copied;
			}

			finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
			if (!finfo) {
				iov_iter_revert(iter, copied);
				ret = -ENOMEM;
				goto error_folio_unlock;
			}
			finfo->netfs_group = netfs_get_group(netfs_group);
			finfo->dirty_offset = offset;
			finfo->dirty_len = copied;
			folio_attach_private(folio, (void *)((unsigned long)finfo |
							     NETFS_FOLIO_INFO));
			trace_netfs_folio(folio, netfs_streaming_write);
			goto copied;
		}

		/* We can continue a streaming write only if it continues on
		 * from the previous.  If it overlaps, we must flush lest we
		 * suffer a partial copy and disjoint dirty regions.
		 */
		if (offset == finfo->dirty_offset + finfo->dirty_len) {
			copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
			if (unlikely(copied == 0))
				goto copy_failed;
			finfo->dirty_len += copied;
			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				folio_mark_uptodate(folio);
				kfree(finfo);
				trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
			} else {
				trace_netfs_folio(folio, netfs_streaming_write_cont);
			}
			goto copied;
		}

		/* Incompatible write; flush the folio and try again. */
	flush_content:
		trace_netfs_folio(folio, netfs_flush_content);
		folio_unlock(folio);
		folio_put(folio);
		ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
		if (ret < 0)
			goto out;
		continue;

	copied:
		flush_dcache_folio(folio);

		/* Update the inode size if we moved the EOF marker */
		netfs_update_i_size(ctx, inode, pos, copied);
		pos += copied;
		written += copied;

		if (likely(!wreq)) {
			folio_mark_dirty(folio);
			folio_unlock(folio);
		} else {
			netfs_advance_writethrough(wreq, &wbc, folio, copied,
						   offset + copied == flen,
						   &writethrough);
			/* Folio unlocked */
		}
	retry:
		folio_put(folio);
		folio = NULL;

		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
		if (unlikely(ret < 0))
			break;

		cond_resched();
	} while (iov_iter_count(iter));

out:
	if (likely(written)) {
		/* Set indication that ctime and mtime got updated in case
		 * close is deferred.
		 */
		set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
		if (unlikely(ctx->ops->post_modify))
			ctx->ops->post_modify(inode);
	}

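	/* If this was a write-through write, close out the request.  A return
	 * of -EIOCBQUEUED indicates that the write will complete
	 * asynchronously and the iocb will be signalled later.
	 */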
	if (unlikely(wreq)) {
		ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
		wbc_detach_inode(&wbc);
		if (ret2 == -EIOCBQUEUED)
			return ret2;
		if (ret == 0 && ret2 < 0)
			ret = ret2;
	}

	iocb->ki_pos += written;
	_leave(" = %zd [%zd]", written, ret);
	return written ? written : ret;

copy_failed:
	ret = -EFAULT;
error_folio_unlock:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_perform_write);

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 * @netfs_group: Grouping for dirty folios (e.g. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file.  It does all basic checks, removes SUID from the file, updates
 * modification times and calls the proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it.  This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
					 struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	ssize_t ret;

	trace_netfs_write_iter(iocb, from);

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	return netfs_perform_write(iocb, from, netfs_group);
}
EXPORT_SYMBOL(netfs_buffered_write_iter_locked);

/**
 * netfs_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Perform a write to a file, writing into the pagecache if possible and doing
 * an unbuffered write instead if not.
 *
 * Return:
 * * Negative error code if no data has been written at all or if
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_write_iter(iocb, from);

	ret = netfs_start_io_write(inode);
	if (ret < 0)
		return ret;

	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
	netfs_end_io_write(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(netfs_file_write_iter);

/*
 * Notification that a previously read-only page is about to become writable.
 * The caller indicates the precise page that needs to be written to, but
 * we only track group on a per-folio basis, so we block more often than
 * we might otherwise.
 */
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
	struct netfs_group *group;
	struct folio *folio = page_folio(vmf->page);
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = file_inode(file);
	struct netfs_inode *ictx = netfs_inode(inode);
	vm_fault_t ret = VM_FAULT_NOPAGE;
	int err;

	_enter("%lx", folio->index);

	sb_start_pagefault(inode->i_sb);

	if (folio_lock_killable(folio) < 0)
		goto out;
	if (folio->mapping != mapping)
		goto unlock;
	if (folio_wait_writeback_killable(folio) < 0)
		goto unlock;

	/* Can we see a streaming write here? */
	if (WARN_ON(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto unlock;
	}

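	/* If the folio belongs to a different write group, it has to be
	 * flushed before this write can proceed; kick off writeback and have
	 * the fault retried.
	 */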
	group = netfs_folio_group(folio);
	if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
		folio_unlock(folio);
		err = filemap_fdatawrite_range(mapping,
					       folio_pos(folio),
					       folio_pos(folio) + folio_size(folio));
		switch (err) {
		case 0:
			ret = VM_FAULT_RETRY;
			goto out;
		case -ENOMEM:
			ret = VM_FAULT_OOM;
			goto out;
		default:
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
	}

	if (folio_test_dirty(folio))
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
	else
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
	netfs_set_group(folio, netfs_group);
	file_update_time(file);
	set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags);
	if (ictx->ops->post_modify)
		ictx->ops->post_modify(inode);
	ret = VM_FAULT_LOCKED;
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
unlock:
	folio_unlock(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_page_mkwrite);