transaction.c source code [Linux/fs/jbd2/transaction.c]

1	// SPDX-License-Identifier: GPL-2.0+
2	/*
3	* linux/fs/jbd2/transaction.c
4	*
5	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6	*
7	* Copyright 1998 Red Hat corp --- All Rights Reserved
8	*
9	* Generic filesystem transaction handling code; part of the ext2fs
10	* journaling system.
11	*
12	* This file manages transactions (compound commits managed by the
13	* journaling code) and handles (individual atomic operations by the
14	* filesystem).
15	*/
16
17	#include <linux/time.h>
18	#include <linux/fs.h>
19	#include <linux/jbd2.h>
20	#include <linux/errno.h>
21	#include <linux/slab.h>
22	#include <linux/timer.h>
23	#include <linux/mm.h>
24	#include <linux/highmem.h>
25	#include <linux/hrtimer.h>
26	#include <linux/backing-dev.h>
27	#include <linux/bug.h>
28	#include <linux/module.h>
29	#include <linux/sched/mm.h>
30
31	#include <trace/events/jbd2.h>
32
33	static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
34	static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
35
36	static struct kmem_cache *transaction_cache;
37	int __init jbd2_journal_init_transaction_cache(void)
38	{
39	J_ASSERT(!transaction_cache);
40	transaction_cache = kmem_cache_create("jbd2_transaction_s",
41	sizeof(transaction_t),
42	`0`,
43	SLAB_HWCACHE_ALIGN\|SLAB_TEMPORARY,
44	NULL);
45	if (!transaction_cache) {
46	pr_emerg("JBD2: failed to create transaction cache\n");
47	return -ENOMEM;
48	}
49	return `0`;
50	}
51
52	void jbd2_journal_destroy_transaction_cache(void)
53	{
54	kmem_cache_destroy(s: transaction_cache);
55	transaction_cache = NULL;
56	}
57
58	void jbd2_journal_free_transaction(transaction_t *transaction)
59	{
60	if (unlikely(ZERO_OR_NULL_PTR(transaction)))
61	return;
62	kmem_cache_free(s: transaction_cache, objp: transaction);
63	}
64
65	/*
66	* jbd2_get_transaction: obtain a new transaction_t object.
67	*
68	* Simply initialise a new transaction. Initialize it in
69	* RUNNING state and add it to the current journal (which should not
70	* have an existing running transaction: we only make a new transaction
71	* once we have started to commit the old one).
72	*
73	* Preconditions:
74	* The journal MUST be locked. We don't perform atomic mallocs on the
75	* new transaction and we can't block without protecting against other
76	* processes trying to touch the journal while it is in transition.
77	*
78	*/
79
80	static void jbd2_get_transaction(journal_t *journal,
81	transaction_t *transaction)
82	{
83	transaction->t_journal = journal;
84	transaction->t_state = T_RUNNING;
85	transaction->t_start_time = ktime_get();
86	transaction->t_tid = journal->j_transaction_sequence++;
87	transaction->t_expires = jiffies + journal->j_commit_interval;
88	atomic_set(v: &transaction->t_updates, i: `0`);
89	atomic_set(v: &transaction->t_outstanding_credits,
90	i: journal->j_transaction_overhead_buffers +
91	atomic_read(v: &journal->j_reserved_credits));
92	atomic_set(v: &transaction->t_outstanding_revokes, i: `0`);
93	atomic_set(v: &transaction->t_handle_count, i: `0`);
94	INIT_LIST_HEAD(list: &transaction->t_inode_list);
95
96	/ Set up the commit timer for the new transaction. /
97	journal->j_commit_timer.expires = round_jiffies_up(j: transaction->t_expires);
98	add_timer(timer: &journal->j_commit_timer);
99
100	J_ASSERT(journal->j_running_transaction == NULL);
101	journal->j_running_transaction = transaction;
102	transaction->t_max_wait = `0`;
103	transaction->t_start = jiffies;
104	transaction->t_requested = `0`;
105	}
106
107	/*
108	* Handle management.
109	*
110	* A handle_t is an object which represents a single atomic update to a
111	* filesystem, and which tracks all of the modifications which form part
112	* of that one update.
113	*/
114
115	/*
116	* t_max_wait is carefully updated here with use of atomic compare exchange.
117	* Note that there could be multiplre threads trying to do this simultaneously
118	* hence using cmpxchg to avoid any use of locks in this case.
119	*/
120	static inline void update_t_max_wait(transaction_t *transaction,
121	unsigned long ts)
122	{
123	unsigned long oldts, newts;
124
125	if (time_after(transaction->t_start, ts)) {
126	newts = jbd2_time_diff(start: ts, end: transaction->t_start);
127	oldts = READ_ONCE(transaction->t_max_wait);
128	while (oldts < newts)
129	oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
130	}
131	}
132
133	/*
134	* Wait until running transaction passes to T_FLUSH state and new transaction
135	* can thus be started. Also starts the commit if needed. The function expects
136	* running transaction to exist and releases j_state_lock.
137	*/
138	static void wait_transaction_locked(journal_t *journal)
139	__releases(journal->j_state_lock)
140	{
141	DEFINE_WAIT(wait);
142	int need_to_start;
143	tid_t tid = journal->j_running_transaction->t_tid;
144
145	prepare_to_wait_exclusive(wq_head: &journal->j_wait_transaction_locked, wq_entry: &wait,
146	TASK_UNINTERRUPTIBLE);
147	need_to_start = !tid_geq(x: journal->j_commit_request, y: tid);
148	read_unlock(&journal->j_state_lock);
149	if (need_to_start)
150	jbd2_log_start_commit(journal, tid);
151	jbd2_might_wait_for_commit(journal);
152	schedule();
153	finish_wait(wq_head: &journal->j_wait_transaction_locked, wq_entry: &wait);
154	}
155
156	/*
157	* Wait until running transaction transitions from T_SWITCH to T_FLUSH
158	* state and new transaction can thus be started. The function releases
159	* j_state_lock.
160	*/
161	static void wait_transaction_switching(journal_t *journal)
162	__releases(journal->j_state_lock)
163	{
164	DEFINE_WAIT(wait);
165
166	if (WARN_ON(!journal->j_running_transaction \|\|
167	journal->j_running_transaction->t_state != T_SWITCH)) {
168	read_unlock(&journal->j_state_lock);
169	return;
170	}
171	prepare_to_wait_exclusive(wq_head: &journal->j_wait_transaction_locked, wq_entry: &wait,
172	TASK_UNINTERRUPTIBLE);
173	read_unlock(&journal->j_state_lock);
174	/*
175	* We don't call jbd2_might_wait_for_commit() here as there's no
176	* waiting for outstanding handles happening anymore in T_SWITCH state
177	* and handling of reserved handles actually relies on that for
178	* correctness.
179	*/
180	schedule();
181	finish_wait(wq_head: &journal->j_wait_transaction_locked, wq_entry: &wait);
182	}
183
184	static void sub_reserved_credits(journal_t journal, int* blocks)
185	{
186	atomic_sub(i: blocks, v: &journal->j_reserved_credits);
187	wake_up(&journal->j_wait_reserved);
188	}
189
190	/ Maximum number of blocks for user transaction payload /
191	static int jbd2_max_user_trans_buffers(journal_t *journal)
192	{
193	return journal->j_max_transaction_buffers -
194	journal->j_transaction_overhead_buffers;
195	}
196
197	/*
198	* Wait until we can add credits for handle to the running transaction. Called
199	* with j_state_lock held for reading. Returns 0 if handle joined the running
200	* transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
201	* caller must retry.
202	*
203	* Note: because j_state_lock may be dropped depending on the return
204	* value, we need to fake out sparse so ti doesn't complain about a
205	* locking imbalance. Callers of add_transaction_credits will need to
206	* make a similar accomodation.
207	*/
208	static int add_transaction_credits(journal_t journal, int* blocks,
209	int rsv_blocks)
210	__must_hold(&journal->j_state_lock)
211	{
212	transaction_t *t = journal->j_running_transaction;
213	int needed;
214	int total = blocks + rsv_blocks;
215
216	/*
217	* If the current transaction is locked down for commit, wait
218	* for the lock to be released.
219	*/
220	if (t->t_state != T_RUNNING) {
221	WARN_ON_ONCE(t->t_state >= T_FLUSH);
222	wait_transaction_locked(journal);
223	__acquire(&journal->j_state_lock); / fake out sparse /
224	return `1`;
225	}
226
227	/*
228	* If there is not enough space left in the log to write all
229	* potential buffers requested by this operation, we need to
230	* stall pending a log checkpoint to free some more log space.
231	*/
232	needed = atomic_add_return(i: total, v: &t->t_outstanding_credits);
233	if (needed > journal->j_max_transaction_buffers) {
234	/*
235	* If the current transaction is already too large,
236	* then start to commit it: we can then go back and
237	* attach this handle to a new transaction.
238	*/
239	atomic_sub(i: total, v: &t->t_outstanding_credits);
240
241	/*
242	* Is the number of reserved credits in the current transaction too
243	* big to fit this handle? Wait until reserved credits are freed.
244	*/
245	if (atomic_read(v: &journal->j_reserved_credits) + total >
246	jbd2_max_user_trans_buffers(journal)) {
247	read_unlock(&journal->j_state_lock);
248	jbd2_might_wait_for_commit(journal);
249	wait_event(journal->j_wait_reserved,
250	atomic_read(&journal->j_reserved_credits) + total <=
251	jbd2_max_user_trans_buffers(journal));
252	__acquire(&journal->j_state_lock); / fake out sparse /
253	return `1`;
254	}
255
256	wait_transaction_locked(journal);
257	__acquire(&journal->j_state_lock); / fake out sparse /
258	return `1`;
259	}
260
261	/*
262	* The commit code assumes that it can get enough log space
263	* without forcing a checkpoint. This is critical for
264	* correctness: a checkpoint of a buffer which is also
265	* associated with a committing transaction creates a deadlock,
266	* so commit simply cannot force through checkpoints.
267	*
268	* We must therefore ensure the necessary space in the journal
269	* before starting to dirty potentially checkpointed buffers
270	* in the new transaction.
271	*/
272	if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
273	atomic_sub(i: total, v: &t->t_outstanding_credits);
274	read_unlock(&journal->j_state_lock);
275	jbd2_might_wait_for_commit(journal);
276	write_lock(&journal->j_state_lock);
277	if (jbd2_log_space_left(journal) <
278	journal->j_max_transaction_buffers)
279	__jbd2_log_wait_for_space(journal);
280	write_unlock(&journal->j_state_lock);
281	__acquire(&journal->j_state_lock); / fake out sparse /
282	return `1`;
283	}
284
285	/ No reservation? We are done... /
286	if (!rsv_blocks)
287	return `0`;
288
289	needed = atomic_add_return(i: rsv_blocks, v: &journal->j_reserved_credits);
290	/ We allow at most half of a transaction to be reserved /
291	if (needed > jbd2_max_user_trans_buffers(journal) / `2`) {
292	sub_reserved_credits(journal, blocks: rsv_blocks);
293	atomic_sub(i: total, v: &t->t_outstanding_credits);
294	read_unlock(&journal->j_state_lock);
295	jbd2_might_wait_for_commit(journal);
296	wait_event(journal->j_wait_reserved,
297	atomic_read(&journal->j_reserved_credits) + rsv_blocks
298	<= jbd2_max_user_trans_buffers(journal) / `2`);
299	__acquire(&journal->j_state_lock); / fake out sparse /
300	return `1`;
301	}
302	return `0`;
303	}
304
305	/*
306	* start_this_handle: Given a handle, deal with any locking or stalling
307	* needed to make sure that there is enough journal space for the handle
308	* to begin. Attach the handle to a transaction and set up the
309	* transaction's buffer credits.
310	*/
311
312	static int start_this_handle(journal_t journal, handle_t handle,
313	gfp_t gfp_mask)
314	{
315	transaction_t transaction, new_transaction = NULL;
316	int blocks = handle->h_total_credits;
317	int rsv_blocks = `0`;
318	unsigned long ts = jiffies;
319
320	if (handle->h_rsv_handle)
321	rsv_blocks = handle->h_rsv_handle->h_total_credits;
322
323	/*
324	* Limit the number of reserved credits to 1/2 of maximum transaction
325	* size and limit the number of total credits to not exceed maximum
326	* transaction size per operation.
327	*/
328	if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / `2` \|\|
329	rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
330	printk(KERN_ERR "JBD2: %s wants too many credits "
331	"credits:%d rsv_credits:%d max:%d\n",
332	current->comm, blocks, rsv_blocks,
333	jbd2_max_user_trans_buffers(journal));
334	WARN_ON(`1`);
335	return -ENOSPC;
336	}
337
338	alloc_transaction:
339	/*
340	* This check is racy but it is just an optimization of allocating new
341	* transaction early if there are high chances we'll need it. If we
342	* guess wrong, we'll retry or free unused transaction.
343	*/
344	if (!data_race(journal->j_running_transaction)) {
345	/*
346	* If __GFP_FS is not present, then we may be being called from
347	* inside the fs writeback layer, so we MUST NOT fail.
348	*/
349	if ((gfp_mask & __GFP_FS) == `0`)
350	gfp_mask \|= __GFP_NOFAIL;
351	new_transaction = kmem_cache_zalloc(transaction_cache,
352	gfp_mask);
353	if (!new_transaction)
354	return -ENOMEM;
355	}
356
357	jbd2_debug(`3`, "New handle %p going live.\n", handle);
358
359	/*
360	* We need to hold j_state_lock until t_updates has been incremented,
361	* for proper journal barrier handling
362	*/
363	repeat:
364	read_lock(&journal->j_state_lock);
365	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
366	if (is_journal_aborted(journal) \|\|
367	(journal->j_errno != `0` && !(journal->j_flags & JBD2_ACK_ERR))) {
368	read_unlock(&journal->j_state_lock);
369	jbd2_journal_free_transaction(transaction: new_transaction);
370	return -EROFS;
371	}
372
373	/*
374	* Wait on the journal's transaction barrier if necessary. Specifically
375	* we allow reserved handles to proceed because otherwise commit could
376	* deadlock on page writeback not being able to complete.
377	*/
378	if (!handle->h_reserved && journal->j_barrier_count) {
379	read_unlock(&journal->j_state_lock);
380	wait_event(journal->j_wait_transaction_locked,
381	journal->j_barrier_count == `0`);
382	goto repeat;
383	}
384
385	if (!journal->j_running_transaction) {
386	read_unlock(&journal->j_state_lock);
387	if (!new_transaction)
388	goto alloc_transaction;
389	write_lock(&journal->j_state_lock);
390	if (!journal->j_running_transaction &&
391	(handle->h_reserved \|\| !journal->j_barrier_count)) {
392	jbd2_get_transaction(journal, transaction: new_transaction);
393	new_transaction = NULL;
394	}
395	write_unlock(&journal->j_state_lock);
396	goto repeat;
397	}
398
399	transaction = journal->j_running_transaction;
400
401	if (!handle->h_reserved) {
402	/ We may have dropped j_state_lock - restart in that case /
403	if (add_transaction_credits(journal, blocks, rsv_blocks)) {
404	/*
405	* add_transaction_credits releases
406	* j_state_lock on a non-zero return
407	*/
408	__release(&journal->j_state_lock);
409	goto repeat;
410	}
411	} else {
412	/*
413	* We have handle reserved so we are allowed to join T_LOCKED
414	* transaction and we don't have to check for transaction size
415	* and journal space. But we still have to wait while running
416	* transaction is being switched to a committing one as it
417	* won't wait for any handles anymore.
418	*/
419	if (transaction->t_state == T_SWITCH) {
420	wait_transaction_switching(journal);
421	goto repeat;
422	}
423	sub_reserved_credits(journal, blocks);
424	handle->h_reserved = `0`;
425	}
426
427	/ OK, account for the buffers that this operation expects to*
428	* use and add the handle to the running transaction.
429	*/
430	update_t_max_wait(transaction, ts);
431	handle->h_transaction = transaction;
432	handle->h_requested_credits = blocks;
433	handle->h_revoke_credits_requested = handle->h_revoke_credits;
434	handle->h_start_jiffies = jiffies;
435	atomic_inc(v: &transaction->t_updates);
436	atomic_inc(v: &transaction->t_handle_count);
437	jbd2_debug(`4`, "Handle %p given %d credits (total %d, free %lu)\n",
438	handle, blocks,
439	atomic_read(&transaction->t_outstanding_credits),
440	jbd2_log_space_left(journal));
441	read_unlock(&journal->j_state_lock);
442	current->journal_info = handle;
443
444	rwsem_acquire_read(&journal->j_trans_commit_map, `0`, `0`, _THIS_IP_);
445	jbd2_journal_free_transaction(transaction: new_transaction);
446	/*
447	* Ensure that no allocations done while the transaction is open are
448	* going to recurse back to the fs layer.
449	*/
450	handle->saved_alloc_context = memalloc_nofs_save();
451	return `0`;
452	}
453
454	/ Allocate a new handle. This should probably be in a slab... /
455	static handle_t new_handle(int* nblocks)
456	{
457	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
458	if (!handle)
459	return NULL;
460	handle->h_total_credits = nblocks;
461	handle->h_ref = `1`;
462
463	return handle;
464	}
465
466	handle_t jbd2__journal_start(journal_t journal, int nblocks, int rsv_blocks,
467	int revoke_records, gfp_t gfp_mask,
468	unsigned int type, unsigned int line_no)
469	{
470	handle_t *handle = journal_current_handle();
471	int err;
472
473	if (!journal)
474	return ERR_PTR(error: -EROFS);
475
476	if (handle) {
477	J_ASSERT(handle->h_transaction->t_journal == journal);
478	handle->h_ref++;
479	return handle;
480	}
481
482	nblocks += DIV_ROUND_UP(revoke_records,
483	journal->j_revoke_records_per_block);
484	handle = new_handle(nblocks);
485	if (!handle)
486	return ERR_PTR(error: -ENOMEM);
487	if (rsv_blocks) {
488	handle_t *rsv_handle;
489
490	rsv_handle = new_handle(nblocks: rsv_blocks);
491	if (!rsv_handle) {
492	jbd2_free_handle(handle);
493	return ERR_PTR(error: -ENOMEM);
494	}
495	rsv_handle->h_reserved = `1`;
496	rsv_handle->h_journal = journal;
497	handle->h_rsv_handle = rsv_handle;
498	}
499	handle->h_revoke_credits = revoke_records;
500
501	err = start_this_handle(journal, handle, gfp_mask);
502	if (err < `0`) {
503	if (handle->h_rsv_handle)
504	jbd2_free_handle(handle: handle->h_rsv_handle);
505	jbd2_free_handle(handle);
506	return ERR_PTR(error: err);
507	}
508	handle->h_type = type;
509	handle->h_line_no = line_no;
510	trace_jbd2_handle_start(dev: journal->j_fs_dev->bd_dev,
511	tid: handle->h_transaction->t_tid, type,
512	line_no, requested_blocks: nblocks);
513
514	return handle;
515	}
516	EXPORT_SYMBOL(jbd2__journal_start);
517
518
519	/**
520	* jbd2_journal_start() - Obtain a new handle.
521	* @journal: Journal to start transaction on.
522	* @nblocks: number of block buffer we might modify
523	*
524	* We make sure that the transaction can guarantee at least nblocks of
525	* modified buffers in the log. We block until the log can guarantee
526	* that much space. Additionally, if rsv_blocks > 0, we also create another
527	* handle with rsv_blocks reserved blocks in the journal. This handle is
528	* stored in h_rsv_handle. It is not attached to any particular transaction
529	* and thus doesn't block transaction commit. If the caller uses this reserved
530	* handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
531	* on the parent handle will dispose the reserved one. Reserved handle has to
532	* be converted to a normal handle using jbd2_journal_start_reserved() before
533	* it can be used.
534	*
535	* Return a pointer to a newly allocated handle, or an ERR_PTR() value
536	* on failure.
537	*/
538	handle_t jbd2_journal_start(journal_t journal, int nblocks)
539	{
540	return jbd2__journal_start(journal, nblocks, `0`, `0`, GFP_NOFS, `0`, `0`);
541	}
542	EXPORT_SYMBOL(jbd2_journal_start);
543
544	static void __jbd2_journal_unreserve_handle(handle_t handle, transaction_t t)
545	{
546	journal_t *journal = handle->h_journal;
547
548	WARN_ON(!handle->h_reserved);
549	sub_reserved_credits(journal, blocks: handle->h_total_credits);
550	if (t)
551	atomic_sub(i: handle->h_total_credits, v: &t->t_outstanding_credits);
552	}
553
554	void jbd2_journal_free_reserved(handle_t *handle)
555	{
556	journal_t *journal = handle->h_journal;
557
558	/ Get j_state_lock to pin running transaction if it exists /
559	read_lock(&journal->j_state_lock);
560	__jbd2_journal_unreserve_handle(handle, t: journal->j_running_transaction);
561	read_unlock(&journal->j_state_lock);
562	jbd2_free_handle(handle);
563	}
564	EXPORT_SYMBOL(jbd2_journal_free_reserved);
565
566	/**
567	* jbd2_journal_start_reserved() - start reserved handle
568	* @handle: handle to start
569	* @type: for handle statistics
570	* @line_no: for handle statistics
571	*
572	* Start handle that has been previously reserved with jbd2_journal_reserve().
573	* This attaches @handle to the running transaction (or creates one if there's
574	* not transaction running). Unlike jbd2_journal_start() this function cannot
575	* block on journal commit, checkpointing, or similar stuff. It can block on
576	* memory allocation or frozen journal though.
577	*
578	* Return 0 on success, non-zero on error - handle is freed in that case.
579	*/
580	int jbd2_journal_start_reserved(handle_t handle, unsigned* int type,
581	unsigned int line_no)
582	{
583	journal_t *journal = handle->h_journal;
584	int ret = -EIO;
585
586	if (WARN_ON(!handle->h_reserved)) {
587	/ Someone passed in normal handle? Just stop it. /
588	jbd2_journal_stop(handle);
589	return ret;
590	}
591	/*
592	* Usefulness of mixing of reserved and unreserved handles is
593	* questionable. So far nobody seems to need it so just error out.
594	*/
595	if (WARN_ON(current->journal_info)) {
596	jbd2_journal_free_reserved(handle);
597	return ret;
598	}
599
600	handle->h_journal = NULL;
601	/*
602	* GFP_NOFS is here because callers are likely from writeback or
603	* similarly constrained call sites
604	*/
605	ret = start_this_handle(journal, handle, GFP_NOFS);
606	if (ret < `0`) {
607	handle->h_journal = journal;
608	jbd2_journal_free_reserved(handle);
609	return ret;
610	}
611	handle->h_type = type;
612	handle->h_line_no = line_no;
613	trace_jbd2_handle_start(dev: journal->j_fs_dev->bd_dev,
614	tid: handle->h_transaction->t_tid, type,
615	line_no, requested_blocks: handle->h_total_credits);
616	return `0`;
617	}
618	EXPORT_SYMBOL(jbd2_journal_start_reserved);
619
620	/**
621	* jbd2_journal_extend() - extend buffer credits.
622	* @handle: handle to 'extend'
623	* @nblocks: nr blocks to try to extend by.
624	* @revoke_records: number of revoke records to try to extend by.
625	*
626	* Some transactions, such as large extends and truncates, can be done
627	* atomically all at once or in several stages. The operation requests
628	* a credit for a number of buffer modifications in advance, but can
629	* extend its credit if it needs more.
630	*
631	* jbd2_journal_extend tries to give the running handle more buffer credits.
632	* It does not guarantee that allocation - this is a best-effort only.
633	* The calling process MUST be able to deal cleanly with a failure to
634	* extend here.
635	*
636	* Return 0 on success, non-zero on failure.
637	*
638	* return code < 0 implies an error
639	* return code > 0 implies normal transaction-full status.
640	*/
641	int jbd2_journal_extend(handle_t handle, int* nblocks, int revoke_records)
642	{
643	transaction_t *transaction = handle->h_transaction;
644	journal_t *journal;
645	int result;
646	int wanted;
647
648	if (is_handle_aborted(handle))
649	return -EROFS;
650	journal = transaction->t_journal;
651
652	result = `1`;
653
654	read_lock(&journal->j_state_lock);
655
656	/ Don't extend a locked-down transaction! /
657	if (transaction->t_state != T_RUNNING) {
658	jbd2_debug(`3`, "denied handle %p %d blocks: "
659	"transaction not running\n", handle, nblocks);
660	goto error_out;
661	}
662
663	nblocks += DIV_ROUND_UP(
664	handle->h_revoke_credits_requested + revoke_records,
665	journal->j_revoke_records_per_block) -
666	DIV_ROUND_UP(
667	handle->h_revoke_credits_requested,
668	journal->j_revoke_records_per_block);
669	wanted = atomic_add_return(i: nblocks,
670	v: &transaction->t_outstanding_credits);
671
672	if (wanted > journal->j_max_transaction_buffers) {
673	jbd2_debug(`3`, "denied handle %p %d blocks: "
674	"transaction too large\n", handle, nblocks);
675	atomic_sub(i: nblocks, v: &transaction->t_outstanding_credits);
676	goto error_out;
677	}
678
679	trace_jbd2_handle_extend(dev: journal->j_fs_dev->bd_dev,
680	tid: transaction->t_tid,
681	type: handle->h_type, line_no: handle->h_line_no,
682	buffer_credits: handle->h_total_credits,
683	requested_blocks: nblocks);
684
685	handle->h_total_credits += nblocks;
686	handle->h_requested_credits += nblocks;
687	handle->h_revoke_credits += revoke_records;
688	handle->h_revoke_credits_requested += revoke_records;
689	result = `0`;
690
691	jbd2_debug(`3`, "extended handle %p by %d\n", handle, nblocks);
692	error_out:
693	read_unlock(&journal->j_state_lock);
694	return result;
695	}
696
697	static void stop_this_handle(handle_t *handle)
698	{
699	transaction_t *transaction = handle->h_transaction;
700	journal_t *journal = transaction->t_journal;
701	int revokes;
702
703	J_ASSERT(journal_current_handle() == handle);
704	J_ASSERT(atomic_read(&transaction->t_updates) > `0`);
705	current->journal_info = NULL;
706	/*
707	* Subtract necessary revoke descriptor blocks from handle credits. We
708	* take care to account only for revoke descriptor blocks the
709	* transaction will really need as large sequences of transactions with
710	* small numbers of revokes are relatively common.
711	*/
712	revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
713	if (revokes) {
714	int t_revokes, revoke_descriptors;
715	int rr_per_blk = journal->j_revoke_records_per_block;
716
717	WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
718	> handle->h_total_credits);
719	t_revokes = atomic_add_return(i: revokes,
720	v: &transaction->t_outstanding_revokes);
721	revoke_descriptors =
722	DIV_ROUND_UP(t_revokes, rr_per_blk) -
723	DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
724	handle->h_total_credits -= revoke_descriptors;
725	}
726	atomic_sub(i: handle->h_total_credits,
727	v: &transaction->t_outstanding_credits);
728	if (handle->h_rsv_handle)
729	__jbd2_journal_unreserve_handle(handle: handle->h_rsv_handle,
730	t: transaction);
731	if (atomic_dec_and_test(v: &transaction->t_updates))
732	wake_up(&journal->j_wait_updates);
733
734	rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
735	/*
736	* Scope of the GFP_NOFS context is over here and so we can restore the
737	* original alloc context.
738	*/
739	memalloc_nofs_restore(flags: handle->saved_alloc_context);
740	}
741
742	/**
743	* jbd2__journal_restart() - restart a handle .
744	* @handle: handle to restart
745	* @nblocks: nr credits requested
746	* @revoke_records: number of revoke record credits requested
747	* @gfp_mask: memory allocation flags (for start_this_handle)
748	*
749	* Restart a handle for a multi-transaction filesystem
750	* operation.
751	*
752	* If the jbd2_journal_extend() call above fails to grant new buffer credits
753	* to a running handle, a call to jbd2_journal_restart will commit the
754	* handle's transaction so far and reattach the handle to a new
755	* transaction capable of guaranteeing the requested number of
756	* credits. We preserve reserved handle if there's any attached to the
757	* passed in handle.
758	*/
759	int jbd2__journal_restart(handle_t handle, int* nblocks, int revoke_records,
760	gfp_t gfp_mask)
761	{
762	transaction_t *transaction = handle->h_transaction;
763	journal_t *journal;
764	tid_t tid;
765	int need_to_start;
766	int ret;
767
768	/ If we've had an abort of any type, don't even think about*
769	* actually doing the restart! */
770	if (is_handle_aborted(handle))
771	return `0`;
772	journal = transaction->t_journal;
773	tid = transaction->t_tid;
774
775	/*
776	* First unlink the handle from its current transaction, and start the
777	* commit on that.
778	*/
779	jbd2_debug(`2`, "restarting handle %p\n", handle);
780	stop_this_handle(handle);
781	handle->h_transaction = NULL;
782
783	/*
784	* TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
785	* get rid of pointless j_state_lock traffic like this.
786	*/
787	read_lock(&journal->j_state_lock);
788	need_to_start = !tid_geq(x: journal->j_commit_request, y: tid);
789	read_unlock(&journal->j_state_lock);
790	if (need_to_start)
791	jbd2_log_start_commit(journal, tid);
792	handle->h_total_credits = nblocks +
793	DIV_ROUND_UP(revoke_records,
794	journal->j_revoke_records_per_block);
795	handle->h_revoke_credits = revoke_records;
796	ret = start_this_handle(journal, handle, gfp_mask);
797	trace_jbd2_handle_restart(dev: journal->j_fs_dev->bd_dev,
798	tid: ret ? `0` : handle->h_transaction->t_tid,
799	type: handle->h_type, line_no: handle->h_line_no,
800	requested_blocks: handle->h_total_credits);
801	return ret;
802	}
803	EXPORT_SYMBOL(jbd2__journal_restart);
804
805
806	int jbd2_journal_restart(handle_t handle, int* nblocks)
807	{
808	return jbd2__journal_restart(handle, nblocks, `0`, GFP_NOFS);
809	}
810	EXPORT_SYMBOL(jbd2_journal_restart);
811
812	/*
813	* Waits for any outstanding t_updates to finish.
814	* This is called with write j_state_lock held.
815	*/
816	void jbd2_journal_wait_updates(journal_t *journal)
817	{
818	DEFINE_WAIT(wait);
819
820	while (`1`) {
821	/*
822	* Note that the running transaction can get freed under us if
823	* this transaction is getting committed in
824	* jbd2_journal_commit_transaction() ->
825	* jbd2_journal_free_transaction(). This can only happen when we
826	* release j_state_lock -> schedule() -> acquire j_state_lock.
827	* Hence we should everytime retrieve new j_running_transaction
828	* value (after j_state_lock release acquire cycle), else it may
829	* lead to use-after-free of old freed transaction.
830	*/
831	transaction_t *transaction = journal->j_running_transaction;
832
833	if (!transaction)
834	break;
835
836	prepare_to_wait(wq_head: &journal->j_wait_updates, wq_entry: &wait,
837	TASK_UNINTERRUPTIBLE);
838	if (!atomic_read(v: &transaction->t_updates)) {
839	finish_wait(wq_head: &journal->j_wait_updates, wq_entry: &wait);
840	break;
841	}
842	write_unlock(&journal->j_state_lock);
843	schedule();
844	finish_wait(wq_head: &journal->j_wait_updates, wq_entry: &wait);
845	write_lock(&journal->j_state_lock);
846	}
847	}
848
849	/**
850	* jbd2_journal_lock_updates () - establish a transaction barrier.
851	* @journal: Journal to establish a barrier on.
852	*
853	* This locks out any further updates from being started, and blocks
854	* until all existing updates have completed, returning only once the
855	* journal is in a quiescent state with no updates running.
856	*
857	* The journal lock should not be held on entry.
858	*/
859	void jbd2_journal_lock_updates(journal_t *journal)
860	{
861	jbd2_might_wait_for_commit(journal);
862
863	write_lock(&journal->j_state_lock);
864	++journal->j_barrier_count;
865
866	/ Wait until there are no reserved handles /
867	if (atomic_read(v: &journal->j_reserved_credits)) {
868	write_unlock(&journal->j_state_lock);
869	wait_event(journal->j_wait_reserved,
870	atomic_read(&journal->j_reserved_credits) == `0`);
871	write_lock(&journal->j_state_lock);
872	}
873
874	/ Wait until there are no running t_updates /
875	jbd2_journal_wait_updates(journal);
876
877	write_unlock(&journal->j_state_lock);
878
879	/*
880	* We have now established a barrier against other normal updates, but
881	* we also need to barrier against other jbd2_journal_lock_updates() calls
882	* to make sure that we serialise special journal-locked operations
883	* too.
884	*/
885	mutex_lock(lock: &journal->j_barrier);
886	}
887
888	/**
889	* jbd2_journal_unlock_updates () - release barrier
890	* @journal: Journal to release the barrier on.
891	*
892	* Release a transaction barrier obtained with jbd2_journal_lock_updates().
893	*
894	* Should be called without the journal lock held.
895	*/
896	void jbd2_journal_unlock_updates (journal_t *journal)
897	{
898	J_ASSERT(journal->j_barrier_count != `0`);
899
900	mutex_unlock(lock: &journal->j_barrier);
901	write_lock(&journal->j_state_lock);
902	--journal->j_barrier_count;
903	write_unlock(&journal->j_state_lock);
904	wake_up_all(&journal->j_wait_transaction_locked);
905	}
906
907	static void warn_dirty_buffer(struct buffer_head *bh)
908	{
909	printk(KERN_WARNING
910	"JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
911	"There's a risk of filesystem corruption in case of system "
912	"crash.\n",
913	bh->b_bdev, (unsigned long long)bh->b_blocknr);
914	}
915
916	/ Call t_frozen trigger and copy buffer data into jh->b_frozen_data. /
917	static void jbd2_freeze_jh_data(struct journal_head *jh)
918	{
919	char *source;
920	struct buffer_head *bh = jh2bh(jh);
921
922	J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
923	source = kmap_local_folio(folio: bh->b_folio, offset: bh_offset(bh));
924	/ Fire data frozen trigger just before we copy the data /
925	jbd2_buffer_frozen_trigger(jh, mapped_data: source, triggers: jh->b_triggers);
926	memcpy(to: jh->b_frozen_data, from: source, len: bh->b_size);
927	kunmap_local(source);
928
929	/*
930	* Now that the frozen data is saved off, we need to store any matching
931	* triggers.
932	*/
933	jh->b_frozen_triggers = jh->b_triggers;
934	}
935
936	/*
937	* If the buffer is already part of the current transaction, then there
938	* is nothing we need to do. If it is already part of a prior
939	* transaction which we are still committing to disk, then we need to
940	* make sure that we do not overwrite the old copy: we do copy-out to
941	* preserve the copy going to disk. We also account the buffer against
942	* the handle's metadata buffer credits (unless the buffer is already
943	* part of the transaction, that is).
944	*
945	*/
946	static int
947	do_get_write_access(handle_t handle, struct* journal_head *jh,
948	int force_copy)
949	{
950	struct buffer_head *bh;
951	transaction_t *transaction = handle->h_transaction;
952	journal_t *journal;
953	int error;
954	char *frozen_buffer = NULL;
955	unsigned long start_lock, time_lock;
956
957	journal = transaction->t_journal;
958
959	jbd2_debug(`5`, "journal_head %p, force_copy %d\n", jh, force_copy);
960
961	JBUFFER_TRACE(jh, "entry");
962	repeat:
963	bh = jh2bh(jh);
964
965	/ @@@ Need to check for errors here at some point. /
966
967	start_lock = jiffies;
968	lock_buffer(bh);
969	spin_lock(lock: &jh->b_state_lock);
970
971	/ If it takes too long to lock the buffer, trace it /
972	time_lock = jbd2_time_diff(start: start_lock, end: jiffies);
973	if (time_lock > HZ/`10`)
974	trace_jbd2_lock_buffer_stall(dev: bh->b_bdev->bd_dev,
975	stall_ms: jiffies_to_msecs(j: time_lock));
976
977	/ We now hold the buffer lock so it is safe to query the buffer*
978	* state. Is the buffer dirty?
979	*
980	* If so, there are two possibilities. The buffer may be
981	* non-journaled, and undergoing a quite legitimate writeback.
982	* Otherwise, it is journaled, and we don't expect dirty buffers
983	* in that state (the buffers should be marked JBD_Dirty
984	* instead.) So either the IO is being done under our own
985	* control and this is a bug, or it's a third party IO such as
986	* dump(8) (which may leave the buffer scheduled for read ---
987	* ie. locked but not dirty) or tune2fs (which may actually have
988	* the buffer dirtied, ugh.) */
989
990	if (buffer_dirty(bh) && jh->b_transaction) {
991	warn_dirty_buffer(bh);
992	/*
993	* We need to clean the dirty flag and we must do it under the
994	* buffer lock to be sure we don't race with running write-out.
995	*/
996	JBUFFER_TRACE(jh, "Journalling dirty buffer");
997	clear_buffer_dirty(bh);
998	/*
999	* The buffer is going to be added to BJ_Reserved list now and
1000	* nothing guarantees jbd2_journal_dirty_metadata() will be
1001	* ever called for it. So we need to set jbddirty bit here to
1002	* make sure the buffer is dirtied and written out when the
1003	* journaling machinery is done with it.
1004	*/
1005	set_buffer_jbddirty(bh);
1006	}
1007
1008	error = -EROFS;
1009	if (is_handle_aborted(handle)) {
1010	spin_unlock(lock: &jh->b_state_lock);
1011	unlock_buffer(bh);
1012	goto out;
1013	}
1014	error = `0`;
1015
1016	/*
1017	* The buffer is already part of this transaction if b_transaction or
1018	* b_next_transaction points to it
1019	*/
1020	if (jh->b_transaction == transaction \|\|
1021	jh->b_next_transaction == transaction) {
1022	unlock_buffer(bh);
1023	goto done;
1024	}
1025
1026	/*
1027	* this is the first time this transaction is touching this buffer,
1028	* reset the modified flag
1029	*/
1030	jh->b_modified = `0`;
1031
1032	/*
1033	* If the buffer is not journaled right now, we need to make sure it
1034	* doesn't get written to disk before the caller actually commits the
1035	* new data
1036	*/
1037	if (!jh->b_transaction) {
1038	JBUFFER_TRACE(jh, "no transaction");
1039	J_ASSERT_JH(jh, !jh->b_next_transaction);
1040	JBUFFER_TRACE(jh, "file as BJ_Reserved");
1041	/*
1042	* Make sure all stores to jh (b_modified, b_frozen_data) are
1043	* visible before attaching it to the running transaction.
1044	* Paired with barrier in jbd2_write_access_granted()
1045	*/
1046	smp_wmb();
1047	spin_lock(lock: &journal->j_list_lock);
1048	if (test_clear_buffer_dirty(bh)) {
1049	/*
1050	* Execute buffer dirty clearing and jh->b_transaction
1051	* assignment under journal->j_list_lock locked to
1052	* prevent bh being removed from checkpoint list if
1053	* the buffer is in an intermediate state (not dirty
1054	* and jh->b_transaction is NULL).
1055	*/
1056	JBUFFER_TRACE(jh, "Journalling dirty buffer");
1057	set_buffer_jbddirty(bh);
1058	}
1059	__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1060	spin_unlock(lock: &journal->j_list_lock);
1061	unlock_buffer(bh);
1062	goto done;
1063	}
1064	unlock_buffer(bh);
1065
1066	/*
1067	* If there is already a copy-out version of this buffer, then we don't
1068	* need to make another one
1069	*/
1070	if (jh->b_frozen_data) {
1071	JBUFFER_TRACE(jh, "has frozen data");
1072	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1073	goto attach_next;
1074	}
1075
1076	JBUFFER_TRACE(jh, "owned by older transaction");
1077	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1078	J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
1079
1080	/*
1081	* There is one case we have to be very careful about. If the
1082	* committing transaction is currently writing this buffer out to disk
1083	* and has NOT made a copy-out, then we cannot modify the buffer
1084	* contents at all right now. The essence of copy-out is that it is
1085	* the extra copy, not the primary copy, which gets journaled. If the
1086	* primary copy is already going to disk then we cannot do copy-out
1087	* here.
1088	*/
1089	if (buffer_shadow(bh)) {
1090	JBUFFER_TRACE(jh, "on shadow: sleep");
1091	spin_unlock(lock: &jh->b_state_lock);
1092	wait_on_bit_io(word: &bh->b_state, bit: BH_Shadow, TASK_UNINTERRUPTIBLE);
1093	goto repeat;
1094	}
1095
1096	/*
1097	* Only do the copy if the currently-owning transaction still needs it.
1098	* If buffer isn't on BJ_Metadata list, the committing transaction is
1099	* past that stage (here we use the fact that BH_Shadow is set under
1100	* bh_state lock together with refiling to BJ_Shadow list and at this
1101	* point we know the buffer doesn't have BH_Shadow set).
1102	*
1103	* Subtle point, though: if this is a get_undo_access, then we will be
1104	* relying on the frozen_data to contain the new value of the
1105	* committed_data record after the transaction, so we HAVE to force the
1106	* frozen_data copy in that case.
1107	*/
1108	if (jh->b_jlist == BJ_Metadata \|\| force_copy) {
1109	JBUFFER_TRACE(jh, "generate frozen data");
1110	if (!frozen_buffer) {
1111	JBUFFER_TRACE(jh, "allocate memory for buffer");
1112	spin_unlock(lock: &jh->b_state_lock);
1113	frozen_buffer = jbd2_alloc(size: jh2bh(jh)->b_size,
1114	GFP_NOFS \| __GFP_NOFAIL);
1115	goto repeat;
1116	}
1117	jh->b_frozen_data = frozen_buffer;
1118	frozen_buffer = NULL;
1119	jbd2_freeze_jh_data(jh);
1120	}
1121	attach_next:
1122	/*
1123	* Make sure all stores to jh (b_modified, b_frozen_data) are visible
1124	* before attaching it to the running transaction. Paired with barrier
1125	* in jbd2_write_access_granted()
1126	*/
1127	smp_wmb();
1128	jh->b_next_transaction = transaction;
1129
1130	done:
1131	spin_unlock(lock: &jh->b_state_lock);
1132
1133	/*
1134	* If we are about to journal a buffer, then any revoke pending on it is
1135	* no longer valid
1136	*/
1137	jbd2_journal_cancel_revoke(handle, jh);
1138
1139	out:
1140	if (unlikely(frozen_buffer)) / It's usually NULL /
1141	jbd2_free(ptr: frozen_buffer, size: bh->b_size);
1142
1143	JBUFFER_TRACE(jh, "exit");
1144	return error;
1145	}
1146
1147	/ Fast check whether buffer is already attached to the required transaction /
1148	static bool jbd2_write_access_granted(handle_t handle, struct* buffer_head *bh,
1149	bool undo)
1150	{
1151	struct journal_head *jh;
1152	bool ret = false;
1153
1154	/ Dirty buffers require special handling... /
1155	if (buffer_dirty(bh))
1156	return false;
1157
1158	/*
1159	* RCU protects us from dereferencing freed pages. So the checks we do
1160	* are guaranteed not to oops. However the jh slab object can get freed
1161	* & reallocated while we work with it. So we have to be careful. When
1162	* we see jh attached to the running transaction, we know it must stay
1163	* so until the transaction is committed. Thus jh won't be freed and
1164	* will be attached to the same bh while we run. However it can
1165	* happen jh gets freed, reallocated, and attached to the transaction
1166	* just after we get pointer to it from bh. So we have to be careful
1167	* and recheck jh still belongs to our bh before we return success.
1168	*/
1169	rcu_read_lock();
1170	if (!buffer_jbd(bh))
1171	goto out;
1172	/ This should be bh2jh() but that doesn't work with inline functions /
1173	jh = READ_ONCE(bh->b_private);
1174	if (!jh)
1175	goto out;
1176	/ For undo access buffer must have data copied /
1177	if (undo && !jh->b_committed_data)
1178	goto out;
1179	if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
1180	READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
1181	goto out;
1182	/*
1183	* There are two reasons for the barrier here:
1184	* 1) Make sure to fetch b_bh after we did previous checks so that we
1185	* detect when jh went through free, realloc, attach to transaction
1186	* while we were checking. Paired with implicit barrier in that path.
1187	* 2) So that access to bh done after jbd2_write_access_granted()
1188	* doesn't get reordered and see inconsistent state of concurrent
1189	* do_get_write_access().
1190	*/
1191	smp_mb();
1192	if (unlikely(jh->b_bh != bh))
1193	goto out;
1194	ret = true;
1195	out:
1196	rcu_read_unlock();
1197	return ret;
1198	}
1199
1200	/**
1201	* jbd2_journal_get_write_access() - notify intent to modify a buffer
1202	* for metadata (not data) update.
1203	* @handle: transaction to add buffer modifications to
1204	* @bh: bh to be used for metadata writes
1205	*
1206	* Returns: error code or 0 on success.
1207	*
1208	* In full data journalling mode the buffer may be of type BJ_AsyncData,
1209	* because we're ``write()ing`` a buffer which is also part of a shared mapping.
1210	*/
1211
1212	int jbd2_journal_get_write_access(handle_t handle, struct* buffer_head *bh)
1213	{
1214	struct journal_head *jh;
1215	journal_t *journal;
1216	int rc;
1217
1218	if (is_handle_aborted(handle))
1219	return -EROFS;
1220
1221	journal = handle->h_transaction->t_journal;
1222	if (jbd2_check_fs_dev_write_error(journal)) {
1223	/*
1224	* If the fs dev has writeback errors, it may have failed
1225	* to async write out metadata buffers in the background.
1226	* In this case, we could read old data from disk and write
1227	* it out again, which may lead to on-disk filesystem
1228	* inconsistency. Aborting journal can avoid it happen.
1229	*/
1230	jbd2_journal_abort(journal, -EIO);
1231	return -EIO;
1232	}
1233
1234	if (jbd2_write_access_granted(handle, bh, undo: false))
1235	return `0`;
1236
1237	jh = jbd2_journal_add_journal_head(bh);
1238	/ We do not want to get caught playing with fields which the*
1239	* log thread also manipulates. Make sure that the buffer
1240	* completes any outstanding IO before proceeding. */
1241	rc = do_get_write_access(handle, jh, force_copy: `0`);
1242	jbd2_journal_put_journal_head(jh);
1243	return rc;
1244	}
1245
1246
1247	/*
1248	* When the user wants to journal a newly created buffer_head
1249	* (ie. getblk() returned a new buffer and we are going to populate it
1250	* manually rather than reading off disk), then we need to keep the
1251	* buffer_head locked until it has been completely filled with new
1252	* data. In this case, we should be able to make the assertion that
1253	* the bh is not already part of an existing transaction.
1254	*
1255	* The buffer should already be locked by the caller by this point.
1256	* There is no lock ranking violation: it was a newly created,
1257	* unlocked buffer beforehand. */
1258
1259	/**
1260	* jbd2_journal_get_create_access () - notify intent to use newly created bh
1261	* @handle: transaction to new buffer to
1262	* @bh: new buffer.
1263	*
1264	* Call this if you create a new bh.
1265	*/
1266	int jbd2_journal_get_create_access(handle_t handle, struct* buffer_head *bh)
1267	{
1268	transaction_t *transaction = handle->h_transaction;
1269	journal_t *journal;
1270	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
1271	int err;
1272
1273	jbd2_debug(`5`, "journal_head %p\n", jh);
1274	err = -EROFS;
1275	if (is_handle_aborted(handle))
1276	goto out;
1277	journal = transaction->t_journal;
1278	err = `0`;
1279
1280	JBUFFER_TRACE(jh, "entry");
1281	/*
1282	* The buffer may already belong to this transaction due to pre-zeroing
1283	* in the filesystem's new_block code. It may also be on the previous,
1284	* committing transaction's lists, but it HAS to be in Forget state in
1285	* that case: the transaction must have deleted the buffer for it to be
1286	* reused here.
1287	*/
1288	spin_lock(lock: &jh->b_state_lock);
1289	J_ASSERT_JH(jh, (jh->b_transaction == transaction \|\|
1290	jh->b_transaction == NULL \|\|
1291	(jh->b_transaction == journal->j_committing_transaction &&
1292	jh->b_jlist == BJ_Forget)));
1293
1294	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1295	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
1296
1297	if (jh->b_transaction == NULL) {
1298	/*
1299	* Previous jbd2_journal_forget() could have left the buffer
1300	* with jbddirty bit set because it was being committed. When
1301	* the commit finished, we've filed the buffer for
1302	* checkpointing and marked it dirty. Now we are reallocating
1303	* the buffer so the transaction freeing it must have
1304	* committed and so it's safe to clear the dirty bit.
1305	*/
1306	clear_buffer_dirty(bh: jh2bh(jh));
1307	/ first access by this transaction /
1308	jh->b_modified = `0`;
1309
1310	JBUFFER_TRACE(jh, "file as BJ_Reserved");
1311	spin_lock(lock: &journal->j_list_lock);
1312	__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1313	spin_unlock(lock: &journal->j_list_lock);
1314	} else if (jh->b_transaction == journal->j_committing_transaction) {
1315	/ first access by this transaction /
1316	jh->b_modified = `0`;
1317
1318	JBUFFER_TRACE(jh, "set next transaction");
1319	spin_lock(lock: &journal->j_list_lock);
1320	jh->b_next_transaction = transaction;
1321	spin_unlock(lock: &journal->j_list_lock);
1322	}
1323	spin_unlock(lock: &jh->b_state_lock);
1324
1325	/*
1326	* akpm: I added this. ext3_alloc_branch can pick up new indirect
1327	* blocks which contain freed but then revoked metadata. We need
1328	* to cancel the revoke in case we end up freeing it yet again
1329	* and the reallocating as data - this would cause a second revoke,
1330	* which hits an assertion error.
1331	*/
1332	JBUFFER_TRACE(jh, "cancelling revoke");
1333	jbd2_journal_cancel_revoke(handle, jh);
1334	out:
1335	jbd2_journal_put_journal_head(jh);
1336	return err;
1337	}
1338
1339	/**
1340	* jbd2_journal_get_undo_access() - Notify intent to modify metadata with
1341	* non-rewindable consequences
1342	* @handle: transaction
1343	* @bh: buffer to undo
1344	*
1345	* Sometimes there is a need to distinguish between metadata which has
1346	* been committed to disk and that which has not. The ext3fs code uses
1347	* this for freeing and allocating space, we have to make sure that we
1348	* do not reuse freed space until the deallocation has been committed,
1349	* since if we overwrote that space we would make the delete
1350	* un-rewindable in case of a crash.
1351	*
1352	* To deal with that, jbd2_journal_get_undo_access requests write access to a
1353	* buffer for parts of non-rewindable operations such as delete
1354	* operations on the bitmaps. The journaling code must keep a copy of
1355	* the buffer's contents prior to the undo_access call until such time
1356	* as we know that the buffer has definitely been committed to disk.
1357	*
1358	* We never need to know which transaction the committed data is part
1359	* of, buffers touched here are guaranteed to be dirtied later and so
1360	* will be committed to a new transaction in due course, at which point
1361	* we can discard the old committed data pointer.
1362	*
1363	* Returns error number or 0 on success.
1364	*/
1365	int jbd2_journal_get_undo_access(handle_t handle, struct* buffer_head *bh)
1366	{
1367	int err;
1368	struct journal_head *jh;
1369	char *committed_data = NULL;
1370
1371	if (is_handle_aborted(handle))
1372	return -EROFS;
1373
1374	if (jbd2_write_access_granted(handle, bh, undo: true))
1375	return `0`;
1376
1377	jh = jbd2_journal_add_journal_head(bh);
1378	JBUFFER_TRACE(jh, "entry");
1379
1380	/*
1381	* Do this first --- it can drop the journal lock, so we want to
1382	* make sure that obtaining the committed_data is done
1383	* atomically wrt. completion of any outstanding commits.
1384	*/
1385	err = do_get_write_access(handle, jh, force_copy: `1`);
1386	if (err)
1387	goto out;
1388
1389	repeat:
1390	if (!jh->b_committed_data)
1391	committed_data = jbd2_alloc(size: jh2bh(jh)->b_size,
1392	GFP_NOFS\|__GFP_NOFAIL);
1393
1394	spin_lock(lock: &jh->b_state_lock);
1395	if (!jh->b_committed_data) {
1396	/ Copy out the current buffer contents into the*
1397	* preserved, committed copy. */
1398	JBUFFER_TRACE(jh, "generate b_committed data");
1399	if (!committed_data) {
1400	spin_unlock(lock: &jh->b_state_lock);
1401	goto repeat;
1402	}
1403
1404	jh->b_committed_data = committed_data;
1405	committed_data = NULL;
1406	memcpy(to: jh->b_committed_data, from: bh->b_data, len: bh->b_size);
1407	}
1408	spin_unlock(lock: &jh->b_state_lock);
1409	out:
1410	jbd2_journal_put_journal_head(jh);
1411	if (unlikely(committed_data))
1412	jbd2_free(ptr: committed_data, size: bh->b_size);
1413	return err;
1414	}
1415
1416	/**
1417	* jbd2_journal_set_triggers() - Add triggers for commit writeout
1418	* @bh: buffer to trigger on
1419	* @type: struct jbd2_buffer_trigger_type containing the trigger(s).
1420	*
1421	* Set any triggers on this journal_head. This is always safe, because
1422	* triggers for a committing buffer will be saved off, and triggers for
1423	* a running transaction will match the buffer in that transaction.
1424	*
1425	* Call with NULL to clear the triggers.
1426	*/
1427	void jbd2_journal_set_triggers(struct buffer_head *bh,
1428	struct jbd2_buffer_trigger_type *type)
1429	{
1430	struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
1431
1432	if (WARN_ON_ONCE(!jh))
1433	return;
1434	jh->b_triggers = type;
1435	jbd2_journal_put_journal_head(jh);
1436	}
1437
1438	void jbd2_buffer_frozen_trigger(struct journal_head jh, void* *mapped_data,
1439	struct jbd2_buffer_trigger_type *triggers)
1440	{
1441	struct buffer_head *bh = jh2bh(jh);
1442
1443	if (!triggers \|\| !triggers->t_frozen)
1444	return;
1445
1446	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
1447	}
1448
1449	void jbd2_buffer_abort_trigger(struct journal_head *jh,
1450	struct jbd2_buffer_trigger_type *triggers)
1451	{
1452	if (!triggers \|\| !triggers->t_abort)
1453	return;
1454
1455	triggers->t_abort(triggers, jh2bh(jh));
1456	}
1457
1458	/**
1459	* jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1460	* @handle: transaction to add buffer to.
1461	* @bh: buffer to mark
1462	*
1463	* mark dirty metadata which needs to be journaled as part of the current
1464	* transaction.
1465	*
1466	* The buffer must have previously had jbd2_journal_get_write_access()
1467	* called so that it has a valid journal_head attached to the buffer
1468	* head.
1469	*
1470	* The buffer is placed on the transaction's metadata list and is marked
1471	* as belonging to the transaction.
1472	*
1473	* Returns error number or 0 on success.
1474	*
1475	* Special care needs to be taken if the buffer already belongs to the
1476	* current committing transaction (in which case we should have frozen
1477	* data present for that commit). In that case, we don't relink the
1478	* buffer: that only gets done when the old transaction finally
1479	* completes its commit.
1480	*/
1481	int jbd2_journal_dirty_metadata(handle_t handle, struct* buffer_head *bh)
1482	{
1483	transaction_t *transaction = handle->h_transaction;
1484	journal_t *journal;
1485	struct journal_head *jh;
1486	int ret = `0`;
1487
1488	if (!buffer_jbd(bh))
1489	return -EUCLEAN;
1490
1491	/*
1492	* We don't grab jh reference here since the buffer must be part
1493	* of the running transaction.
1494	*/
1495	jh = bh2jh(bh);
1496	jbd2_debug(`5`, "journal_head %p\n", jh);
1497	JBUFFER_TRACE(jh, "entry");
1498
1499	/*
1500	* This and the following assertions are unreliable since we may see jh
1501	* in inconsistent state unless we grab bh_state lock. But this is
1502	* crucial to catch bugs so let's do a reliable check until the
1503	* lockless handling is fully proven.
1504	*/
1505	if (data_race(jh->b_transaction != transaction &&
1506	jh->b_next_transaction != transaction)) {
1507	spin_lock(lock: &jh->b_state_lock);
1508	J_ASSERT_JH(jh, jh->b_transaction == transaction \|\|
1509	jh->b_next_transaction == transaction);
1510	spin_unlock(lock: &jh->b_state_lock);
1511	}
1512	if (data_race(jh->b_modified == `1`)) {
1513	/ If it's in our transaction it must be in BJ_Metadata list. /
1514	if (data_race(jh->b_transaction == transaction &&
1515	jh->b_jlist != BJ_Metadata)) {
1516	spin_lock(lock: &jh->b_state_lock);
1517	if (jh->b_transaction == transaction &&
1518	jh->b_jlist != BJ_Metadata)
1519	pr_err("JBD2: assertion failure: h_type=%u "
1520	"h_line_no=%u block_no=%llu jlist=%u\n",
1521	handle->h_type, handle->h_line_no,
1522	(unsigned long long) bh->b_blocknr,
1523	jh->b_jlist);
1524	J_ASSERT_JH(jh, jh->b_transaction != transaction \|\|
1525	jh->b_jlist == BJ_Metadata);
1526	spin_unlock(lock: &jh->b_state_lock);
1527	}
1528	goto out;
1529	}
1530
1531	spin_lock(lock: &jh->b_state_lock);
1532
1533	if (is_handle_aborted(handle)) {
1534	/*
1535	* Check journal aborting with @jh->b_state_lock locked,
1536	* since 'jh->b_transaction' could be replaced with
1537	* 'jh->b_next_transaction' during old transaction
1538	* committing if journal aborted, which may fail
1539	* assertion on 'jh->b_frozen_data == NULL'.
1540	*/
1541	ret = -EROFS;
1542	goto out_unlock_bh;
1543	}
1544
1545	journal = transaction->t_journal;
1546
1547	if (jh->b_modified == `0`) {
1548	/*
1549	* This buffer's got modified and becoming part
1550	* of the transaction. This needs to be done
1551	* once a transaction -bzzz
1552	*/
1553	if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= `0`)) {
1554	ret = -ENOSPC;
1555	goto out_unlock_bh;
1556	}
1557	jh->b_modified = `1`;
1558	handle->h_total_credits--;
1559	}
1560
1561	/*
1562	* fastpath, to avoid expensive locking. If this buffer is already
1563	* on the running transaction's metadata list there is nothing to do.
1564	* Nobody can take it off again because there is a handle open.
1565	* I _think_ we're OK here with SMP barriers - a mistaken decision will
1566	* result in this test being false, so we go in and take the locks.
1567	*/
1568	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1569	JBUFFER_TRACE(jh, "fastpath");
1570	if (unlikely(jh->b_transaction !=
1571	journal->j_running_transaction)) {
1572	printk(KERN_ERR "JBD2: %s: "
1573	"jh->b_transaction (%llu, %p, %u) != "
1574	"journal->j_running_transaction (%p, %u)\n",
1575	journal->j_devname,
1576	(unsigned long long) bh->b_blocknr,
1577	jh->b_transaction,
1578	jh->b_transaction ? jh->b_transaction->t_tid : `0`,
1579	journal->j_running_transaction,
1580	journal->j_running_transaction ?
1581	journal->j_running_transaction->t_tid : `0`);
1582	ret = -EINVAL;
1583	}
1584	goto out_unlock_bh;
1585	}
1586
1587	set_buffer_jbddirty(bh);
1588
1589	/*
1590	* Metadata already on the current transaction list doesn't
1591	* need to be filed. Metadata on another transaction's list must
1592	* be committing, and will be refiled once the commit completes:
1593	* leave it alone for now.
1594	*/
1595	if (jh->b_transaction != transaction) {
1596	JBUFFER_TRACE(jh, "already on other transaction");
1597	if (unlikely(((jh->b_transaction !=
1598	journal->j_committing_transaction)) \|\|
1599	(jh->b_next_transaction != transaction))) {
1600	printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1601	"bad jh for block %llu: "
1602	"transaction (%p, %u), "
1603	"jh->b_transaction (%p, %u), "
1604	"jh->b_next_transaction (%p, %u), jlist %u\n",
1605	journal->j_devname,
1606	(unsigned long long) bh->b_blocknr,
1607	transaction, transaction->t_tid,
1608	jh->b_transaction,
1609	jh->b_transaction ?
1610	jh->b_transaction->t_tid : `0`,
1611	jh->b_next_transaction,
1612	jh->b_next_transaction ?
1613	jh->b_next_transaction->t_tid : `0`,
1614	jh->b_jlist);
1615	WARN_ON(`1`);
1616	ret = -EINVAL;
1617	}
1618	/ And this case is illegal: we can't reuse another*
1619	* transaction's data buffer, ever. */
1620	goto out_unlock_bh;
1621	}
1622
1623	/ That test should have eliminated the following case: /
1624	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1625
1626	JBUFFER_TRACE(jh, "file as BJ_Metadata");
1627	spin_lock(lock: &journal->j_list_lock);
1628	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
1629	spin_unlock(lock: &journal->j_list_lock);
1630	out_unlock_bh:
1631	spin_unlock(lock: &jh->b_state_lock);
1632	out:
1633	JBUFFER_TRACE(jh, "exit");
1634	return ret;
1635	}
1636
1637	/**
1638	* jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1639	* @handle: transaction handle
1640	* @bh: bh to 'forget'
1641	*
1642	* We can only do the bforget if there are no commits pending against the
1643	* buffer. If the buffer is dirty in the current running transaction we
1644	* can safely unlink it.
1645	*
1646	* bh may not be a journalled buffer at all - it may be a non-JBD
1647	* buffer which came off the hashtable. Check for this.
1648	*
1649	* Decrements bh->b_count by one.
1650	*
1651	* Allow this call even if the handle has aborted --- it may be part of
1652	* the caller's cleanup after an abort.
1653	*/
1654	int jbd2_journal_forget(handle_t handle, struct* buffer_head *bh)
1655	{
1656	transaction_t *transaction = handle->h_transaction;
1657	journal_t *journal;
1658	struct journal_head *jh;
1659	int drop_reserve = `0`;
1660	int err = `0`;
1661	int was_modified = `0`;
1662
1663	if (is_handle_aborted(handle))
1664	return -EROFS;
1665	journal = transaction->t_journal;
1666
1667	BUFFER_TRACE(bh, "entry");
1668
1669	jh = jbd2_journal_grab_journal_head(bh);
1670	if (!jh) {
1671	__bforget(bh);
1672	return `0`;
1673	}
1674
1675	spin_lock(lock: &jh->b_state_lock);
1676
1677	/ Critical error: attempting to delete a bitmap buffer, maybe?*
1678	* Don't do any jbd operations, and return an error. */
1679	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1680	"inconsistent data on disk")) {
1681	err = -EIO;
1682	goto drop;
1683	}
1684
1685	/ keep track of whether or not this transaction modified us /
1686	was_modified = jh->b_modified;
1687
1688	/*
1689	* The buffer's going from the transaction, we must drop
1690	* all references -bzzz
1691	*/
1692	jh->b_modified = `0`;
1693
1694	if (jh->b_transaction == transaction) {
1695	J_ASSERT_JH(jh, !jh->b_frozen_data);
1696
1697	/ If we are forgetting a buffer which is already part*
1698	* of this transaction, then we can just drop it from
1699	* the transaction immediately. */
1700	clear_buffer_dirty(bh);
1701	clear_buffer_jbddirty(bh);
1702
1703	JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1704
1705	/*
1706	* we only want to drop a reference if this transaction
1707	* modified the buffer
1708	*/
1709	if (was_modified)
1710	drop_reserve = `1`;
1711
1712	/*
1713	* We are no longer going to journal this buffer.
1714	* However, the commit of this transaction is still
1715	* important to the buffer: the delete that we are now
1716	* processing might obsolete an old log entry, so by
1717	* committing, we can satisfy the buffer's checkpoint.
1718	*
1719	* So, if we have a checkpoint on the buffer, we should
1720	* now refile the buffer on our BJ_Forget list so that
1721	* we know to remove the checkpoint after we commit.
1722	*/
1723
1724	spin_lock(lock: &journal->j_list_lock);
1725	if (jh->b_cp_transaction) {
1726	__jbd2_journal_temp_unlink_buffer(jh);
1727	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1728	} else {
1729	__jbd2_journal_unfile_buffer(jh);
1730	jbd2_journal_put_journal_head(jh);
1731	}
1732	spin_unlock(lock: &journal->j_list_lock);
1733	} else if (jh->b_transaction) {
1734	J_ASSERT_JH(jh, (jh->b_transaction ==
1735	journal->j_committing_transaction));
1736	/ However, if the buffer is still owned by a prior*
1737	* (committing) transaction, we can't drop it yet... */
1738	JBUFFER_TRACE(jh, "belongs to older transaction");
1739	/ ... but we CAN drop it from the new transaction through*
1740	* marking the buffer as freed and set j_next_transaction to
1741	* the new transaction, so that not only the commit code
1742	* knows it should clear dirty bits when it is done with the
1743	* buffer, but also the buffer can be checkpointed only
1744	* after the new transaction commits. */
1745
1746	set_buffer_freed(bh);
1747
1748	if (!jh->b_next_transaction) {
1749	spin_lock(lock: &journal->j_list_lock);
1750	jh->b_next_transaction = transaction;
1751	spin_unlock(lock: &journal->j_list_lock);
1752	} else {
1753	J_ASSERT(jh->b_next_transaction == transaction);
1754
1755	/*
1756	* only drop a reference if this transaction modified
1757	* the buffer
1758	*/
1759	if (was_modified)
1760	drop_reserve = `1`;
1761	}
1762	} else {
1763	/*
1764	* Finally, if the buffer is not belongs to any
1765	* transaction, we can just drop it now if it has no
1766	* checkpoint.
1767	*/
1768	spin_lock(lock: &journal->j_list_lock);
1769	if (!jh->b_cp_transaction) {
1770	JBUFFER_TRACE(jh, "belongs to none transaction");
1771	spin_unlock(lock: &journal->j_list_lock);
1772	goto drop;
1773	}
1774
1775	/*
1776	* Otherwise, if the buffer has been written to disk,
1777	* it is safe to remove the checkpoint and drop it.
1778	*/
1779	if (jbd2_journal_try_remove_checkpoint(jh) >= `0`) {
1780	spin_unlock(lock: &journal->j_list_lock);
1781	goto drop;
1782	}
1783
1784	/*
1785	* The buffer is still not written to disk, we should
1786	* attach this buffer to current transaction so that the
1787	* buffer can be checkpointed only after the current
1788	* transaction commits.
1789	*/
1790	clear_buffer_dirty(bh);
1791	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1792	spin_unlock(lock: &journal->j_list_lock);
1793	}
1794	drop:
1795	__brelse(bh);
1796	spin_unlock(lock: &jh->b_state_lock);
1797	jbd2_journal_put_journal_head(jh);
1798	if (drop_reserve) {
1799	/ no need to reserve log space for this block -bzzz /
1800	handle->h_total_credits++;
1801	}
1802	return err;
1803	}
1804
1805	/**
1806	* jbd2_journal_stop() - complete a transaction
1807	* @handle: transaction to complete.
1808	*
1809	* All done for a particular handle.
1810	*
1811	* There is not much action needed here. We just return any remaining
1812	* buffer credits to the transaction and remove the handle. The only
1813	* complication is that we need to start a commit operation if the
1814	* filesystem is marked for synchronous update.
1815	*
1816	* jbd2_journal_stop itself will not usually return an error, but it may
1817	* do so in unusual circumstances. In particular, expect it to
1818	* return -EIO if a jbd2_journal_abort has been executed since the
1819	* transaction began.
1820	*/
1821	int jbd2_journal_stop(handle_t *handle)
1822	{
1823	transaction_t *transaction = handle->h_transaction;
1824	journal_t *journal;
1825	int err = `0`, wait_for_commit = `0`;
1826	tid_t tid;
1827	pid_t pid;
1828
1829	if (--handle->h_ref > `0`) {
1830	jbd2_debug(`4`, "h_ref %d -> %d\n", handle->h_ref + `1`,
1831	handle->h_ref);
1832	if (is_handle_aborted(handle))
1833	return -EIO;
1834	return `0`;
1835	}
1836	if (!transaction) {
1837	/*
1838	* Handle is already detached from the transaction so there is
1839	* nothing to do other than free the handle.
1840	*/
1841	memalloc_nofs_restore(flags: handle->saved_alloc_context);
1842	goto free_and_exit;
1843	}
1844	journal = transaction->t_journal;
1845	tid = transaction->t_tid;
1846
1847	if (is_handle_aborted(handle))
1848	err = -EIO;
1849
1850	jbd2_debug(`4`, "Handle %p going down\n", handle);
1851	trace_jbd2_handle_stats(dev: journal->j_fs_dev->bd_dev,
1852	tid, type: handle->h_type, line_no: handle->h_line_no,
1853	interval: jiffies - handle->h_start_jiffies,
1854	sync: handle->h_sync, requested_blocks: handle->h_requested_credits,
1855	dirtied_blocks: (handle->h_requested_credits -
1856	handle->h_total_credits));
1857
1858	/*
1859	* Implement synchronous transaction batching. If the handle
1860	* was synchronous, don't force a commit immediately. Let's
1861	* yield and let another thread piggyback onto this
1862	* transaction. Keep doing that while new threads continue to
1863	* arrive. It doesn't cost much - we're about to run a commit
1864	* and sleep on IO anyway. Speeds up many-threaded, many-dir
1865	* operations by 30x or more...
1866	*
1867	* We try and optimize the sleep time against what the
1868	* underlying disk can do, instead of having a static sleep
1869	* time. This is useful for the case where our storage is so
1870	* fast that it is more optimal to go ahead and force a flush
1871	* and wait for the transaction to be committed than it is to
1872	* wait for an arbitrary amount of time for new writers to
1873	* join the transaction. We achieve this by measuring how
1874	* long it takes to commit a transaction, and compare it with
1875	* how long this transaction has been running, and if run time
1876	* < commit time then we sleep for the delta and commit. This
1877	* greatly helps super fast disks that would see slowdowns as
1878	* more threads started doing fsyncs.
1879	*
1880	* But don't do this if this process was the most recent one
1881	* to perform a synchronous write. We do this to detect the
1882	* case where a single process is doing a stream of sync
1883	* writes. No point in waiting for joiners in that case.
1884	*
1885	* Setting max_batch_time to 0 disables this completely.
1886	*/
1887	pid = current->pid;
1888	if (handle->h_sync && journal->j_last_sync_writer != pid &&
1889	journal->j_max_batch_time) {
1890	u64 commit_time, trans_time;
1891
1892	journal->j_last_sync_writer = pid;
1893
1894	read_lock(&journal->j_state_lock);
1895	commit_time = journal->j_average_commit_time;
1896	read_unlock(&journal->j_state_lock);
1897
1898	trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1899	transaction->t_start_time));
1900
1901	commit_time = max_t(u64, commit_time,
1902	`1000`*journal->j_min_batch_time);
1903	commit_time = min_t(u64, commit_time,
1904	`1000`*journal->j_max_batch_time);
1905
1906	if (trans_time < commit_time) {
1907	ktime_t expires = ktime_add_ns(ktime_get(),
1908	commit_time);
1909	set_current_state(TASK_UNINTERRUPTIBLE);
1910	schedule_hrtimeout(expires: &expires, mode: HRTIMER_MODE_ABS);
1911	}
1912	}
1913
1914	if (handle->h_sync)
1915	transaction->t_synchronous_commit = `1`;
1916
1917	/*
1918	* If the handle is marked SYNC, we need to set another commit
1919	* going! We also want to force a commit if the transaction is too
1920	* old now.
1921	*/
1922	if (handle->h_sync \|\|
1923	time_after_eq(jiffies, transaction->t_expires)) {
1924	/ Do this even for aborted journals: an abort still*
1925	* completes the commit thread, it just doesn't write
1926	* anything to disk. */
1927
1928	jbd2_debug(`2`, "transaction too old, requesting commit for "
1929	"handle %p\n", handle);
1930	/ This is non-blocking /
1931	jbd2_log_start_commit(journal, tid);
1932
1933	/*
1934	* Special case: JBD2_SYNC synchronous updates require us
1935	* to wait for the commit to complete.
1936	*/
1937	if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1938	wait_for_commit = `1`;
1939	}
1940
1941	/*
1942	* Once stop_this_handle() drops t_updates, the transaction could start
1943	* committing on us and eventually disappear. So we must not
1944	* dereference transaction pointer again after calling
1945	* stop_this_handle().
1946	*/
1947	stop_this_handle(handle);
1948
1949	if (wait_for_commit)
1950	err = jbd2_log_wait_commit(journal, tid);
1951
1952	free_and_exit:
1953	if (handle->h_rsv_handle)
1954	jbd2_free_handle(handle: handle->h_rsv_handle);
1955	jbd2_free_handle(handle);
1956	return err;
1957	}
1958
1959	/*
1960	*
1961	* List management code snippets: various functions for manipulating the
1962	* transaction buffer lists.
1963	*
1964	*/
1965
1966	/*
1967	* Append a buffer to a transaction list, given the transaction's list head
1968	* pointer.
1969	*
1970	* j_list_lock is held.
1971	*
1972	* jh->b_state_lock is held.
1973	*/
1974
1975	static inline void
1976	__blist_add_buffer(struct journal_head list, struct** journal_head *jh)
1977	{
1978	if (!*list) {
1979	jh->b_tnext = jh->b_tprev = jh;
1980	*list = jh;
1981	} else {
1982	/ Insert at the tail of the list to preserve order /
1983	struct journal_head first = list, *last = first->b_tprev;
1984	jh->b_tprev = last;
1985	jh->b_tnext = first;
1986	last->b_tnext = first->b_tprev = jh;
1987	}
1988	}
1989
1990	/*
1991	* Remove a buffer from a transaction list, given the transaction's list
1992	* head pointer.
1993	*
1994	* Called with j_list_lock held, and the journal may not be locked.
1995	*
1996	* jh->b_state_lock is held.
1997	*/
1998
1999	static inline void
2000	__blist_del_buffer(struct journal_head list, struct** journal_head *jh)
2001	{
2002	if (*list == jh) {
2003	*list = jh->b_tnext;
2004	if (*list == jh)
2005	*list = NULL;
2006	}
2007	jh->b_tprev->b_tnext = jh->b_tnext;
2008	jh->b_tnext->b_tprev = jh->b_tprev;
2009	}
2010
2011	/*
2012	* Remove a buffer from the appropriate transaction list.
2013	*
2014	* Note that this function can change the value of
2015	* bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
2016	* t_reserved_list. If the caller is holding onto a copy of one of these
2017	* pointers, it could go bad. Generally the caller needs to re-read the
2018	* pointer from the transaction_t.
2019	*
2020	* Called under j_list_lock.
2021	*/
2022	static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
2023	{
2024	struct journal_head **list = NULL;
2025	transaction_t *transaction;
2026	struct buffer_head *bh = jh2bh(jh);
2027
2028	lockdep_assert_held(&jh->b_state_lock);
2029	transaction = jh->b_transaction;
2030	if (transaction)
2031	assert_spin_locked(&transaction->t_journal->j_list_lock);
2032
2033	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2034	if (jh->b_jlist != BJ_None)
2035	J_ASSERT_JH(jh, transaction != NULL);
2036
2037	switch (jh->b_jlist) {
2038	case BJ_None:
2039	return;
2040	case BJ_Metadata:
2041	transaction->t_nr_buffers--;
2042	J_ASSERT_JH(jh, transaction->t_nr_buffers >= `0`);
2043	list = &transaction->t_buffers;
2044	break;
2045	case BJ_Forget:
2046	list = &transaction->t_forget;
2047	break;
2048	case BJ_Shadow:
2049	list = &transaction->t_shadow_list;
2050	break;
2051	case BJ_Reserved:
2052	list = &transaction->t_reserved_list;
2053	break;
2054	}
2055
2056	__blist_del_buffer(list, jh);
2057	jh->b_jlist = BJ_None;
2058	if (transaction && is_journal_aborted(journal: transaction->t_journal))
2059	clear_buffer_jbddirty(bh);
2060	else if (test_clear_buffer_jbddirty(bh))
2061	mark_buffer_dirty(bh); / Expose it to the VM /
2062	}
2063
2064	/*
2065	* Remove buffer from all transactions. The caller is responsible for dropping
2066	* the jh reference that belonged to the transaction.
2067	*
2068	* Called with bh_state lock and j_list_lock
2069	*/
2070	static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
2071	{
2072	J_ASSERT_JH(jh, jh->b_transaction != NULL);
2073	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
2074
2075	__jbd2_journal_temp_unlink_buffer(jh);
2076	jh->b_transaction = NULL;
2077	}
2078
2079	/**
2080	* jbd2_journal_try_to_free_buffers() - try to free page buffers.
2081	* @journal: journal for operation
2082	* @folio: Folio to detach data from.
2083	*
2084	* For all the buffers on this page,
2085	* if they are fully written out ordered data, move them onto BUF_CLEAN
2086	* so try_to_free_buffers() can reap them.
2087	*
2088	* This function returns non-zero if we wish try_to_free_buffers()
2089	* to be called. We do this if the page is releasable by try_to_free_buffers().
2090	* We also do it if the page has locked or dirty buffers and the caller wants
2091	* us to perform sync or async writeout.
2092	*
2093	* This complicates JBD locking somewhat. We aren't protected by the
2094	* BKL here. We wish to remove the buffer from its committing or
2095	* running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
2096	*
2097	* This may change the value of transaction_t->t_datalist, so anyone
2098	* who looks at t_datalist needs to lock against this function.
2099	*
2100	* Even worse, someone may be doing a jbd2_journal_dirty_data on this
2101	* buffer. So we need to lock against that. jbd2_journal_dirty_data()
2102	* will come out of the lock with the buffer dirty, which makes it
2103	* ineligible for release here.
2104	*
2105	* Who else is affected by this? hmm... Really the only contender
2106	* is do_get_write_access() - it could be looking at the buffer while
2107	* journal_try_to_free_buffer() is changing its state. But that
2108	* cannot happen because we never reallocate freed data as metadata
2109	* while the data is part of a transaction. Yes?
2110	*
2111	* Return false on failure, true on success
2112	*/
2113	bool jbd2_journal_try_to_free_buffers(journal_t journal, struct* folio *folio)
2114	{
2115	struct buffer_head *head;
2116	struct buffer_head *bh;
2117	bool ret = false;
2118
2119	J_ASSERT(folio_test_locked(folio));
2120
2121	head = folio_buffers(folio);
2122	bh = head;
2123	do {
2124	struct journal_head *jh;
2125
2126	/*
2127	* We take our own ref against the journal_head here to avoid
2128	* having to add tons of locking around each instance of
2129	* jbd2_journal_put_journal_head().
2130	*/
2131	jh = jbd2_journal_grab_journal_head(bh);
2132	if (!jh)
2133	continue;
2134
2135	spin_lock(lock: &jh->b_state_lock);
2136	if (!jh->b_transaction && !jh->b_next_transaction) {
2137	spin_lock(lock: &journal->j_list_lock);
2138	/ Remove written-back checkpointed metadata buffer /
2139	if (jh->b_cp_transaction != NULL)
2140	jbd2_journal_try_remove_checkpoint(jh);
2141	spin_unlock(lock: &journal->j_list_lock);
2142	}
2143	spin_unlock(lock: &jh->b_state_lock);
2144	jbd2_journal_put_journal_head(jh);
2145	if (buffer_jbd(bh))
2146	goto busy;
2147	} while ((bh = bh->b_this_page) != head);
2148
2149	ret = try_to_free_buffers(folio);
2150	busy:
2151	return ret;
2152	}
2153
2154	/*
2155	* This buffer is no longer needed. If it is on an older transaction's
2156	* checkpoint list we need to record it on this transaction's forget list
2157	* to pin this buffer (and hence its checkpointing transaction) down until
2158	* this transaction commits. If the buffer isn't on a checkpoint list, we
2159	* release it.
2160	* Returns non-zero if JBD no longer has an interest in the buffer.
2161	*
2162	* Called under j_list_lock.
2163	*
2164	* Called under jh->b_state_lock.
2165	*/
2166	static int __dispose_buffer(struct journal_head jh, transaction_t transaction)
2167	{
2168	int may_free = `1`;
2169	struct buffer_head *bh = jh2bh(jh);
2170
2171	if (jh->b_cp_transaction) {
2172	JBUFFER_TRACE(jh, "on running+cp transaction");
2173	__jbd2_journal_temp_unlink_buffer(jh);
2174	/*
2175	* We don't want to write the buffer anymore, clear the
2176	* bit so that we don't confuse checks in
2177	* __jbd2_journal_file_buffer
2178	*/
2179	clear_buffer_dirty(bh);
2180	__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
2181	may_free = `0`;
2182	} else {
2183	JBUFFER_TRACE(jh, "on running transaction");
2184	__jbd2_journal_unfile_buffer(jh);
2185	jbd2_journal_put_journal_head(jh);
2186	}
2187	return may_free;
2188	}
2189
2190	/*
2191	* jbd2_journal_invalidate_folio
2192	*
2193	* This code is tricky. It has a number of cases to deal with.
2194	*
2195	* There are two invariants which this code relies on:
2196	*
2197	* i_size must be updated on disk before we start calling invalidate_folio
2198	* on the data.
2199	*
2200	* This is done in ext3 by defining an ext3_setattr method which
2201	* updates i_size before truncate gets going. By maintaining this
2202	* invariant, we can be sure that it is safe to throw away any buffers
2203	* attached to the current transaction: once the transaction commits,
2204	* we know that the data will not be needed.
2205	*
2206	* Note however that we can not throw away data belonging to the
2207	* previous, committing transaction!
2208	*
2209	* Any disk blocks which are part of the previous, committing
2210	* transaction (and which therefore cannot be discarded immediately) are
2211	* not going to be reused in the new running transaction
2212	*
2213	* The bitmap committed_data images guarantee this: any block which is
2214	* allocated in one transaction and removed in the next will be marked
2215	* as in-use in the committed_data bitmap, so cannot be reused until
2216	* the next transaction to delete the block commits. This means that
2217	* leaving committing buffers dirty is quite safe: the disk blocks
2218	* cannot be reallocated to a different file and so buffer aliasing is
2219	* not possible.
2220	*
2221	*
2222	* The above applies mainly to ordered data mode. In writeback mode we
2223	* don't make guarantees about the order in which data hits disk --- in
2224	* particular we don't guarantee that new dirty data is flushed before
2225	* transaction commit --- so it is always safe just to discard data
2226	* immediately in that mode. --sct
2227	*/
2228
2229	/*
2230	* The journal_unmap_buffer helper function returns zero if the buffer
2231	* concerned remains pinned as an anonymous buffer belonging to an older
2232	* transaction.
2233	*
2234	* We're outside-transaction here. Either or both of j_running_transaction
2235	* and j_committing_transaction may be NULL.
2236	*/
2237	static int journal_unmap_buffer(journal_t journal, struct* buffer_head *bh,
2238	int partial_page)
2239	{
2240	transaction_t *transaction;
2241	struct journal_head *jh;
2242	int may_free = `1`;
2243
2244	BUFFER_TRACE(bh, "entry");
2245
2246	/*
2247	* It is safe to proceed here without the j_list_lock because the
2248	* buffers cannot be stolen by try_to_free_buffers as long as we are
2249	* holding the page lock. --sct
2250	*/
2251
2252	jh = jbd2_journal_grab_journal_head(bh);
2253	if (!jh)
2254	goto zap_buffer_unlocked;
2255
2256	/ OK, we have data buffer in journaled mode /
2257	write_lock(&journal->j_state_lock);
2258	spin_lock(lock: &jh->b_state_lock);
2259	spin_lock(lock: &journal->j_list_lock);
2260
2261	/*
2262	* We cannot remove the buffer from checkpoint lists until the
2263	* transaction adding inode to orphan list (let's call it T)
2264	* is committed. Otherwise if the transaction changing the
2265	* buffer would be cleaned from the journal before T is
2266	* committed, a crash will cause that the correct contents of
2267	* the buffer will be lost. On the other hand we have to
2268	* clear the buffer dirty bit at latest at the moment when the
2269	* transaction marking the buffer as freed in the filesystem
2270	* structures is committed because from that moment on the
2271	* block can be reallocated and used by a different page.
2272	* Since the block hasn't been freed yet but the inode has
2273	* already been added to orphan list, it is safe for us to add
2274	* the buffer to BJ_Forget list of the newest transaction.
2275	*
2276	* Also we have to clear buffer_mapped flag of a truncated buffer
2277	* because the buffer_head may be attached to the page straddling
2278	* i_size (can happen only when blocksize < pagesize) and thus the
2279	* buffer_head can be reused when the file is extended again. So we end
2280	* up keeping around invalidated buffers attached to transactions'
2281	* BJ_Forget list just to stop checkpointing code from cleaning up
2282	* the transaction this buffer was modified in.
2283	*/
2284	transaction = jh->b_transaction;
2285	if (transaction == NULL) {
2286	/ First case: not on any transaction. If it*
2287	* has no checkpoint link, then we can zap it:
2288	* it's a writeback-mode buffer so we don't care
2289	* if it hits disk safely. */
2290	if (!jh->b_cp_transaction) {
2291	JBUFFER_TRACE(jh, "not on any transaction: zap");
2292	goto zap_buffer;
2293	}
2294
2295	if (!buffer_dirty(bh)) {
2296	/ bdflush has written it. We can drop it now /
2297	__jbd2_journal_remove_checkpoint(jh);
2298	goto zap_buffer;
2299	}
2300
2301	/ OK, it must be in the journal but still not*
2302	* written fully to disk: it's metadata or
2303	* journaled data... */
2304
2305	if (journal->j_running_transaction) {
2306	/ ... and once the current transaction has*
2307	* committed, the buffer won't be needed any
2308	* longer. */
2309	JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
2310	may_free = __dispose_buffer(jh,
2311	transaction: journal->j_running_transaction);
2312	goto zap_buffer;
2313	} else {
2314	/ There is no currently-running transaction. So the*
2315	* orphan record which we wrote for this file must have
2316	* passed into commit. We must attach this buffer to
2317	* the committing transaction, if it exists. */
2318	if (journal->j_committing_transaction) {
2319	JBUFFER_TRACE(jh, "give to committing trans");
2320	may_free = __dispose_buffer(jh,
2321	transaction: journal->j_committing_transaction);
2322	goto zap_buffer;
2323	} else {
2324	/ The orphan record's transaction has*
2325	* committed. We can cleanse this buffer */
2326	clear_buffer_jbddirty(bh);
2327	__jbd2_journal_remove_checkpoint(jh);
2328	goto zap_buffer;
2329	}
2330	}
2331	} else if (transaction == journal->j_committing_transaction) {
2332	JBUFFER_TRACE(jh, "on committing transaction");
2333	/*
2334	* The buffer is committing, we simply cannot touch
2335	* it. If the page is straddling i_size we have to wait
2336	* for commit and try again.
2337	*/
2338	if (partial_page) {
2339	spin_unlock(lock: &journal->j_list_lock);
2340	spin_unlock(lock: &jh->b_state_lock);
2341	write_unlock(&journal->j_state_lock);
2342	jbd2_journal_put_journal_head(jh);
2343	/ Already zapped buffer? Nothing to do... /
2344	if (!bh->b_bdev)
2345	return `0`;
2346	return -EBUSY;
2347	}
2348	/*
2349	* OK, buffer won't be reachable after truncate. We just clear
2350	* b_modified to not confuse transaction credit accounting, and
2351	* set j_next_transaction to the running transaction (if there
2352	* is one) and mark buffer as freed so that commit code knows
2353	* it should clear dirty bits when it is done with the buffer.
2354	*/
2355	set_buffer_freed(bh);
2356	if (journal->j_running_transaction && buffer_jbddirty(bh))
2357	jh->b_next_transaction = journal->j_running_transaction;
2358	jh->b_modified = `0`;
2359	spin_unlock(lock: &journal->j_list_lock);
2360	spin_unlock(lock: &jh->b_state_lock);
2361	write_unlock(&journal->j_state_lock);
2362	jbd2_journal_put_journal_head(jh);
2363	return `0`;
2364	} else {
2365	/ Good, the buffer belongs to the running transaction.*
2366	* We are writing our own transaction's data, not any
2367	* previous one's, so it is safe to throw it away
2368	* (remember that we expect the filesystem to have set
2369	* i_size already for this truncate so recovery will not
2370	* expose the disk blocks we are discarding here.) */
2371	J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
2372	JBUFFER_TRACE(jh, "on running transaction");
2373	may_free = __dispose_buffer(jh, transaction);
2374	}
2375
2376	zap_buffer:
2377	/*
2378	* This is tricky. Although the buffer is truncated, it may be reused
2379	* if blocksize < pagesize and it is attached to the page straddling
2380	* EOF. Since the buffer might have been added to BJ_Forget list of the
2381	* running transaction, journal_get_write_access() won't clear
2382	* b_modified and credit accounting gets confused. So clear b_modified
2383	* here.
2384	*/
2385	jh->b_modified = `0`;
2386	spin_unlock(lock: &journal->j_list_lock);
2387	spin_unlock(lock: &jh->b_state_lock);
2388	write_unlock(&journal->j_state_lock);
2389	jbd2_journal_put_journal_head(jh);
2390	zap_buffer_unlocked:
2391	clear_buffer_dirty(bh);
2392	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
2393	clear_buffer_mapped(bh);
2394	clear_buffer_req(bh);
2395	clear_buffer_new(bh);
2396	clear_buffer_delay(bh);
2397	clear_buffer_unwritten(bh);
2398	bh->b_bdev = NULL;
2399	return may_free;
2400	}
2401
2402	/**
2403	* jbd2_journal_invalidate_folio()
2404	* @journal: journal to use for flush...
2405	* @folio: folio to flush
2406	* @offset: start of the range to invalidate
2407	* @length: length of the range to invalidate
2408	*
2409	* Reap page buffers containing data after in the specified range in page.
2410	* Can return -EBUSY if buffers are part of the committing transaction and
2411	* the page is straddling i_size. Caller then has to wait for current commit
2412	* and try again.
2413	*/
2414	int jbd2_journal_invalidate_folio(journal_t journal, struct* folio *folio,
2415	size_t offset, size_t length)
2416	{
2417	struct buffer_head head, bh, *next;
2418	unsigned int stop = offset + length;
2419	unsigned int curr_off = `0`;
2420	int partial_page = (offset \|\| length < folio_size(folio));
2421	int may_free = `1`;
2422	int ret = `0`;
2423
2424	if (!folio_test_locked(folio))
2425	BUG();
2426	head = folio_buffers(folio);
2427	if (!head)
2428	return `0`;
2429
2430	BUG_ON(stop > folio_size(folio) \|\| stop < length);
2431
2432	/ We will potentially be playing with lists other than just the*
2433	* data lists (especially for journaled data mode), so be
2434	* cautious in our locking. */
2435
2436	bh = head;
2437	do {
2438	unsigned int next_off = curr_off + bh->b_size;
2439	next = bh->b_this_page;
2440
2441	if (next_off > stop)
2442	return `0`;
2443
2444	if (offset <= curr_off) {
2445	/ This block is wholly outside the truncation point /
2446	lock_buffer(bh);
2447	ret = journal_unmap_buffer(journal, bh, partial_page);
2448	unlock_buffer(bh);
2449	if (ret < `0`)
2450	return ret;
2451	may_free &= ret;
2452	}
2453	curr_off = next_off;
2454	bh = next;
2455
2456	} while (bh != head);
2457
2458	if (!partial_page) {
2459	if (may_free && try_to_free_buffers(folio))
2460	J_ASSERT(!folio_buffers(folio));
2461	}
2462	return `0`;
2463	}
2464
2465	/*
2466	* File a buffer on the given transaction list.
2467	*/
2468	void __jbd2_journal_file_buffer(struct journal_head *jh,
2469	transaction_t transaction, int* jlist)
2470	{
2471	struct journal_head **list = NULL;
2472	int was_dirty = `0`;
2473	struct buffer_head *bh = jh2bh(jh);
2474
2475	lockdep_assert_held(&jh->b_state_lock);
2476	assert_spin_locked(&transaction->t_journal->j_list_lock);
2477
2478	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
2479	J_ASSERT_JH(jh, jh->b_transaction == transaction \|\|
2480	jh->b_transaction == NULL);
2481
2482	if (jh->b_transaction && jh->b_jlist == jlist)
2483	return;
2484
2485	if (jlist == BJ_Metadata \|\| jlist == BJ_Reserved \|\|
2486	jlist == BJ_Shadow \|\| jlist == BJ_Forget) {
2487	/*
2488	* For metadata buffers, we track dirty bit in buffer_jbddirty
2489	* instead of buffer_dirty. We should not see a dirty bit set
2490	* here because we clear it in do_get_write_access but e.g.
2491	* tune2fs can modify the sb and set the dirty bit at any time
2492	* so we try to gracefully handle that.
2493	*/
2494	if (buffer_dirty(bh))
2495	warn_dirty_buffer(bh);
2496	if (test_clear_buffer_dirty(bh) \|\|
2497	test_clear_buffer_jbddirty(bh))
2498	was_dirty = `1`;
2499	}
2500
2501	if (jh->b_transaction)
2502	__jbd2_journal_temp_unlink_buffer(jh);
2503	else
2504	jbd2_journal_grab_journal_head(bh);
2505	jh->b_transaction = transaction;
2506
2507	switch (jlist) {
2508	case BJ_None:
2509	J_ASSERT_JH(jh, !jh->b_committed_data);
2510	J_ASSERT_JH(jh, !jh->b_frozen_data);
2511	return;
2512	case BJ_Metadata:
2513	transaction->t_nr_buffers++;
2514	list = &transaction->t_buffers;
2515	break;
2516	case BJ_Forget:
2517	list = &transaction->t_forget;
2518	break;
2519	case BJ_Shadow:
2520	list = &transaction->t_shadow_list;
2521	break;
2522	case BJ_Reserved:
2523	list = &transaction->t_reserved_list;
2524	break;
2525	}
2526
2527	__blist_add_buffer(list, jh);
2528	jh->b_jlist = jlist;
2529
2530	if (was_dirty)
2531	set_buffer_jbddirty(bh);
2532	}
2533
2534	void jbd2_journal_file_buffer(struct journal_head *jh,
2535	transaction_t transaction, int* jlist)
2536	{
2537	spin_lock(lock: &jh->b_state_lock);
2538	spin_lock(lock: &transaction->t_journal->j_list_lock);
2539	__jbd2_journal_file_buffer(jh, transaction, jlist);
2540	spin_unlock(lock: &transaction->t_journal->j_list_lock);
2541	spin_unlock(lock: &jh->b_state_lock);
2542	}
2543
2544	/*
2545	* Remove a buffer from its current buffer list in preparation for
2546	* dropping it from its current transaction entirely. If the buffer has
2547	* already started to be used by a subsequent transaction, refile the
2548	* buffer on that transaction's metadata list.
2549	*
2550	* Called under j_list_lock
2551	* Called under jh->b_state_lock
2552	*
2553	* When this function returns true, there's no next transaction to refile to
2554	* and the caller has to drop jh reference through
2555	* jbd2_journal_put_journal_head().
2556	*/
2557	bool __jbd2_journal_refile_buffer(struct journal_head *jh)
2558	{
2559	int was_dirty, jlist;
2560	struct buffer_head *bh = jh2bh(jh);
2561
2562	lockdep_assert_held(&jh->b_state_lock);
2563	if (jh->b_transaction)
2564	assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2565
2566	/ If the buffer is now unused, just drop it. /
2567	if (jh->b_next_transaction == NULL) {
2568	__jbd2_journal_unfile_buffer(jh);
2569	return true;
2570	}
2571
2572	/*
2573	* It has been modified by a later transaction: add it to the new
2574	* transaction's metadata list.
2575	*/
2576
2577	was_dirty = test_clear_buffer_jbddirty(bh);
2578	__jbd2_journal_temp_unlink_buffer(jh);
2579
2580	/*
2581	* b_transaction must be set, otherwise the new b_transaction won't
2582	* be holding jh reference
2583	*/
2584	J_ASSERT_JH(jh, jh->b_transaction != NULL);
2585
2586	/*
2587	* We set b_transaction here because b_next_transaction will inherit
2588	* our jh reference and thus __jbd2_journal_file_buffer() must not
2589	* take a new one.
2590	*/
2591	WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
2592	WRITE_ONCE(jh->b_next_transaction, NULL);
2593	if (buffer_freed(bh))
2594	jlist = BJ_Forget;
2595	else if (jh->b_modified)
2596	jlist = BJ_Metadata;
2597	else
2598	jlist = BJ_Reserved;
2599	__jbd2_journal_file_buffer(jh, transaction: jh->b_transaction, jlist);
2600	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2601
2602	if (was_dirty)
2603	set_buffer_jbddirty(bh);
2604	return false;
2605	}
2606
2607	/*
2608	* __jbd2_journal_refile_buffer() with necessary locking added. We take our
2609	* bh reference so that we can safely unlock bh.
2610	*
2611	* The jh and bh may be freed by this call.
2612	*/
2613	void jbd2_journal_refile_buffer(journal_t journal, struct* journal_head *jh)
2614	{
2615	bool drop;
2616
2617	spin_lock(lock: &jh->b_state_lock);
2618	spin_lock(lock: &journal->j_list_lock);
2619	drop = __jbd2_journal_refile_buffer(jh);
2620	spin_unlock(lock: &jh->b_state_lock);
2621	spin_unlock(lock: &journal->j_list_lock);
2622	if (drop)
2623	jbd2_journal_put_journal_head(jh);
2624	}
2625
2626	/*
2627	* File inode in the inode list of the handle's transaction
2628	*/
2629	static int jbd2_journal_file_inode(handle_t handle, struct* jbd2_inode *jinode,
2630	unsigned long flags, loff_t start_byte, loff_t end_byte)
2631	{
2632	transaction_t *transaction = handle->h_transaction;
2633	journal_t *journal;
2634
2635	if (is_handle_aborted(handle))
2636	return -EROFS;
2637	journal = transaction->t_journal;
2638
2639	jbd2_debug(`4`, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2640	transaction->t_tid);
2641
2642	spin_lock(lock: &journal->j_list_lock);
2643	jinode->i_flags \|= flags;
2644
2645	if (jinode->i_dirty_end) {
2646	jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
2647	jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
2648	} else {
2649	jinode->i_dirty_start = start_byte;
2650	jinode->i_dirty_end = end_byte;
2651	}
2652
2653	/ Is inode already attached where we need it? /
2654	if (jinode->i_transaction == transaction \|\|
2655	jinode->i_next_transaction == transaction)
2656	goto done;
2657
2658	/*
2659	* We only ever set this variable to 1 so the test is safe. Since
2660	* t_need_data_flush is likely to be set, we do the test to save some
2661	* cacheline bouncing
2662	*/
2663	if (!transaction->t_need_data_flush)
2664	transaction->t_need_data_flush = `1`;
2665	/ On some different transaction's list - should be*
2666	* the committing one */
2667	if (jinode->i_transaction) {
2668	J_ASSERT(jinode->i_next_transaction == NULL);
2669	J_ASSERT(jinode->i_transaction ==
2670	journal->j_committing_transaction);
2671	jinode->i_next_transaction = transaction;
2672	goto done;
2673	}
2674	/ Not on any transaction list... /
2675	J_ASSERT(!jinode->i_next_transaction);
2676	jinode->i_transaction = transaction;
2677	list_add(new: &jinode->i_list, head: &transaction->t_inode_list);
2678	done:
2679	spin_unlock(lock: &journal->j_list_lock);
2680
2681	return `0`;
2682	}
2683
2684	int jbd2_journal_inode_ranged_write(handle_t *handle,
2685	struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
2686	{
2687	return jbd2_journal_file_inode(handle, jinode,
2688	JI_WRITE_DATA \| JI_WAIT_DATA, start_byte,
2689	end_byte: start_byte + length - `1`);
2690	}
2691
2692	int jbd2_journal_inode_ranged_wait(handle_t handle, struct* jbd2_inode *jinode,
2693	loff_t start_byte, loff_t length)
2694	{
2695	return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
2696	start_byte, end_byte: start_byte + length - `1`);
2697	}
2698
2699	/*
2700	* File truncate and transaction commit interact with each other in a
2701	* non-trivial way. If a transaction writing data block A is
2702	* committing, we cannot discard the data by truncate until we have
2703	* written them. Otherwise if we crashed after the transaction with
2704	* write has committed but before the transaction with truncate has
2705	* committed, we could see stale data in block A. This function is a
2706	* helper to solve this problem. It starts writeout of the truncated
2707	* part in case it is in the committing transaction.
2708	*
2709	* Filesystem code must call this function when inode is journaled in
2710	* ordered mode before truncation happens and after the inode has been
2711	* placed on orphan list with the new inode size. The second condition
2712	* avoids the race that someone writes new data and we start
2713	* committing the transaction after this function has been called but
2714	* before a transaction for truncate is started (and furthermore it
2715	* allows us to optimize the case where the addition to orphan list
2716	* happens in the same transaction as write --- we don't have to write
2717	* any data in such case).
2718	*/
2719	int jbd2_journal_begin_ordered_truncate(journal_t *journal,
2720	struct jbd2_inode *jinode,
2721	loff_t new_size)
2722	{
2723	transaction_t inode_trans, commit_trans;
2724	int ret = `0`;
2725
2726	/ This is a quick check to avoid locking if not necessary /
2727	if (!jinode->i_transaction)
2728	goto out;
2729	/ Locks are here just to force reading of recent values, it is*
2730	* enough that the transaction was not committing before we started
2731	* a transaction adding the inode to orphan list */
2732	read_lock(&journal->j_state_lock);
2733	commit_trans = journal->j_committing_transaction;
2734	read_unlock(&journal->j_state_lock);
2735	spin_lock(lock: &journal->j_list_lock);
2736	inode_trans = jinode->i_transaction;
2737	spin_unlock(lock: &journal->j_list_lock);
2738	if (inode_trans == commit_trans) {
2739	ret = filemap_fdatawrite_range(mapping: jinode->i_vfs_inode->i_mapping,
2740	start: new_size, LLONG_MAX);
2741	if (ret)
2742	jbd2_journal_abort(journal, ret);
2743	}
2744	out:
2745	return ret;
2746	}
2747

/* Browse the source code of Linux/fs/jbd2/transaction.c */