// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <trace/events/writeback.h>
#define CREATE_TRACE_POINTS
#include <trace/events/timestamp.h>

#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget(), inode->i_io_list
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

static unsigned int i_hash_mask __ro_after_init;
static unsigned int i_hash_shift __ro_after_init;
static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);

static struct kmem_cache *inode_cachep __ro_after_init;

static long get_nr_inodes(void)
{
	int i;
	long sum = 0;
	for_each_possible_cpu(i)
		sum += per_cpu(nr_inodes, i);
	return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
	int i;
	long sum = 0;
	for_each_possible_cpu(i)
		sum += per_cpu(nr_unused, i);
	return sum < 0 ? 0 : sum;
}

long get_nr_dirty_inodes(void)
{
	/* not actually dirty inodes, but a wild approximation */
	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
	return nr_dirty > 0 ? nr_dirty : 0;
}

#ifdef CONFIG_DEBUG_FS
static DEFINE_PER_CPU(long, mg_ctime_updates);
static DEFINE_PER_CPU(long, mg_fine_stamps);
static DEFINE_PER_CPU(long, mg_ctime_swaps);

static unsigned long get_mg_ctime_updates(void)
{
	unsigned long sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += data_race(per_cpu(mg_ctime_updates, i));
	return sum;
}

static unsigned long get_mg_fine_stamps(void)
{
	unsigned long sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += data_race(per_cpu(mg_fine_stamps, i));
	return sum;
}

static unsigned long get_mg_ctime_swaps(void)
{
	unsigned long sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += data_race(per_cpu(mg_ctime_swaps, i));
	return sum;
}

#define mgtime_counter_inc(__var) this_cpu_inc(__var)

static int mgts_show(struct seq_file *s, void *p)
{
	unsigned long ctime_updates = get_mg_ctime_updates();
	unsigned long ctime_swaps = get_mg_ctime_swaps();
	unsigned long fine_stamps = get_mg_fine_stamps();
	unsigned long floor_swaps = timekeeping_get_mg_floor_swaps();

	seq_printf(s, "%lu %lu %lu %lu\n",
		   ctime_updates, ctime_swaps, fine_stamps, floor_swaps);
	return 0;
}

DEFINE_SHOW_ATTRIBUTE(mgts);

static int __init mg_debugfs_init(void)
{
	debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops);
	return 0;
}
late_initcall(mg_debugfs_init);

#else /* ! CONFIG_DEBUG_FS */

#define mgtime_counter_inc(__var) do { } while (0)

#endif /* CONFIG_DEBUG_FS */

/*
 * Handle nr_inode sysctl
 */
#ifdef CONFIG_SYSCTL
/*
 * Statistics gathering..
 */
static struct inodes_stat_t inodes_stat;

static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer,
			  size_t *lenp, loff_t *ppos)
{
	inodes_stat.nr_inodes = get_nr_inodes();
	inodes_stat.nr_unused = get_nr_inodes_unused();
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

static const struct ctl_table inodes_sysctls[] = {
	{
		.procname	= "inode-nr",
		.data		= &inodes_stat,
		.maxlen		= 2*sizeof(long),
		.mode		= 0444,
		.proc_handler	= proc_nr_inodes,
	},
	{
		.procname	= "inode-state",
		.data		= &inodes_stat,
		.maxlen		= 7*sizeof(long),
		.mode		= 0444,
		.proc_handler	= proc_nr_inodes,
	},
};

static int __init init_fs_inode_sysctls(void)
{
	register_sysctl_init("fs", inodes_sysctls);
	return 0;
}
early_initcall(init_fs_inode_sysctls);
#endif

static int no_open(struct inode *inode, struct file *file)
{
	return -ENXIO;
}

/**
 * inode_init_always_gfp - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 * @gfp: allocation flags
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 * If there are additional allocations required @gfp is used.
 */
int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
	static const struct inode_operations empty_iops;
	static const struct file_operations no_open_fops = {.open = no_open};
	struct address_space *const mapping = &inode->i_data;

	inode->i_sb = sb;
	inode->i_blkbits = sb->s_blocksize_bits;
	inode->i_flags = 0;
	inode->i_state = 0;
	atomic64_set(&inode->i_sequence, 0);
	atomic_set(&inode->i_count, 1);
	inode->i_op = &empty_iops;
	inode->i_fop = &no_open_fops;
	inode->i_ino = 0;
	inode->__i_nlink = 1;
	inode->i_opflags = 0;
	if (sb->s_xattr)
		inode->i_opflags |= IOP_XATTR;
	if (sb->s_type->fs_flags & FS_MGTIME)
		inode->i_opflags |= IOP_MGTIME;
	i_uid_write(inode, 0);
	i_gid_write(inode, 0);
	atomic_set(&inode->i_writecount, 0);
	inode->i_size = 0;
	inode->i_write_hint = WRITE_LIFE_NOT_SET;
	inode->i_blocks = 0;
	inode->i_bytes = 0;
	inode->i_generation = 0;
	inode->i_pipe = NULL;
	inode->i_cdev = NULL;
	inode->i_link = NULL;
	inode->i_dir_seq = 0;
	inode->i_rdev = 0;
	inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
#endif

	spin_lock_init(&inode->i_lock);
	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

	init_rwsem(&inode->i_rwsem);
	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

	atomic_set(&inode->i_dio_count, 0);

	mapping->a_ops = &empty_aops;
	mapping->host = inode;
	mapping->flags = 0;
	mapping->wb_err = 0;
	atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
	atomic_set(&mapping->nr_thps, 0);
#endif
	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
	mapping->i_private_data = NULL;
	mapping->writeback_index = 0;
	init_rwsem(&mapping->invalidate_lock);
	lockdep_set_class_and_name(&mapping->invalidate_lock,
				   &sb->s_type->invalidate_lock_key,
				   "mapping.invalidate_lock");
	if (sb->s_iflags & SB_I_STABLE_WRITES)
		mapping_set_stable_writes(mapping);
	inode->i_private = NULL;
	inode->i_mapping = mapping;
	INIT_HLIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
	inode->i_fsnotify_mask = 0;
#endif
	inode->i_flctx = NULL;

	if (unlikely(security_inode_alloc(inode, gfp)))
		return -ENOMEM;

	this_cpu_inc(nr_inodes);

	return 0;
}
EXPORT_SYMBOL(inode_init_always_gfp);
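
/*
 * Note: a filesystem that recycles inode objects on its own (instead of
 * going through alloc_inode() below) is expected to run
 * inode_init_always() on the reused inode itself before exposing it
 * again, since none of these fields survive a previous life.
 */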

void free_inode_nonrcu(struct inode *inode)
{
	kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

static void i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	if (inode->free_inode)
		inode->free_inode(inode);
	else
		free_inode_nonrcu(inode);
}

/**
 * alloc_inode - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for the given superblock.
 * The inode is not chained onto the superblock's s_inodes list.
 * This means:
 * - the fs cannot be unmounted
 * - quotas, fsnotify and writeback cannot work
 */
struct inode *alloc_inode(struct super_block *sb)
{
	const struct super_operations *ops = sb->s_op;
	struct inode *inode;

	if (ops->alloc_inode)
		inode = ops->alloc_inode(sb);
	else
		inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);

	if (!inode)
		return NULL;

	if (unlikely(inode_init_always(sb, inode))) {
		if (ops->destroy_inode) {
			ops->destroy_inode(inode);
			if (!ops->free_inode)
				return NULL;
		}
		inode->free_inode = ops->free_inode;
		i_callback(&inode->i_rcu);
		return NULL;
	}

	return inode;
}

void __destroy_inode(struct inode *inode)
{
	BUG_ON(inode_has_buffers(inode));
	inode_detach_wb(inode);
	security_inode_free(inode);
	fsnotify_inode_delete(inode);
	locks_free_lock_context(inode);
	if (!inode->i_nlink) {
		WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
		atomic_long_dec(&inode->i_sb->s_remove_count);
	}

#ifdef CONFIG_FS_POSIX_ACL
	if (inode->i_acl && !is_uncached_acl(inode->i_acl))
		posix_acl_release(inode->i_acl);
	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
		posix_acl_release(inode->i_default_acl);
#endif
	this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

static void destroy_inode(struct inode *inode)
{
	const struct super_operations *ops = inode->i_sb->s_op;

	BUG_ON(!list_empty(&inode->i_lru));
	__destroy_inode(inode);
	if (ops->destroy_inode) {
		ops->destroy_inode(inode);
		if (!ops->free_inode)
			return;
	}
	inode->free_inode = ops->free_inode;
	call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink. In cases
 * where we are attempting to track writes to the
 * filesystem, a decrement to zero means an imminent
 * write when the file is truncated and actually unlinked
 * on the filesystem.
 */
void drop_nlink(struct inode *inode)
{
	WARN_ON(inode->i_nlink == 0);
	inode->__i_nlink--;
	if (!inode->i_nlink)
		atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink. See
 * drop_nlink() for why we care about i_nlink hitting zero.
 */
void clear_nlink(struct inode *inode)
{
	if (inode->i_nlink) {
		inode->__i_nlink = 0;
		atomic_long_inc(&inode->i_sb->s_remove_count);
	}
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
	if (!nlink) {
		clear_nlink(inode);
	} else {
		/* Yes, some filesystems do change nlink from zero to one */
		if (inode->i_nlink == 0)
			atomic_long_dec(&inode->i_sb->s_remove_count);

		inode->__i_nlink = nlink;
	}
}
EXPORT_SYMBOL(set_nlink);

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink. Currently,
 * it is only here for parity with dec_nlink().
 */
void inc_nlink(struct inode *inode)
{
	if (unlikely(inode->i_nlink == 0)) {
		WARN_ON(!(inode->i_state & I_LINKABLE));
		atomic_long_dec(&inode->i_sb->s_remove_count);
	}

	inode->__i_nlink++;
}
EXPORT_SYMBOL(inc_nlink);
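
/*
 * Illustrative sketch (not part of this file): a simple ->link() method
 * drives these helpers roughly the way simple_link() in fs/libfs.c does:
 *
 *	inc_nlink(inode);
 *	ihold(inode);
 *	dget(dentry);
 *	d_instantiate(dentry, inode);
 */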

static void __address_space_init_once(struct address_space *mapping)
{
	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
	init_rwsem(&mapping->i_mmap_rwsem);
	INIT_LIST_HEAD(&mapping->i_private_list);
	spin_lock_init(&mapping->i_private_lock);
	mapping->i_mmap = RB_ROOT_CACHED;
}

void address_space_init_once(struct address_space *mapping)
{
	memset(mapping, 0, sizeof(*mapping));
	__address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so we let the slab constructor take care of them.
 */
void inode_init_once(struct inode *inode)
{
	memset(inode, 0, sizeof(*inode));
	INIT_HLIST_NODE(&inode->i_hash);
	INIT_LIST_HEAD(&inode->i_devices);
	INIT_LIST_HEAD(&inode->i_io_list);
	INIT_LIST_HEAD(&inode->i_wb_list);
	INIT_LIST_HEAD(&inode->i_lru);
	INIT_LIST_HEAD(&inode->i_sb_list);
	__address_space_init_once(&inode->i_data);
	i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

static void init_once(void *foo)
{
	struct inode *inode = (struct inode *) foo;

	inode_init_once(inode);
}

/*
 * get additional reference to inode; caller must already hold one.
 */
void ihold(struct inode *inode)
{
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
EXPORT_SYMBOL(ihold);

static void __inode_add_lru(struct inode *inode, bool rotate)
{
	if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
		return;
	if (icount_read(inode))
		return;
	if (!(inode->i_sb->s_flags & SB_ACTIVE))
		return;
	if (!mapping_shrinkable(&inode->i_data))
		return;

	if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
		this_cpu_inc(nr_unused);
	else if (rotate)
		inode->i_state |= I_REFERENCED;
}

struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
					    struct inode *inode, u32 bit)
{
	void *bit_address;

	bit_address = inode_state_wait_address(inode, bit);
	init_wait_var_entry(wqe, bit_address, 0);
	return __var_waitqueue(bit_address);
}
EXPORT_SYMBOL(inode_bit_waitqueue);
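
/*
 * Callers pair inode_bit_waitqueue() with the usual
 * prepare_to_wait_event()/schedule()/finish_wait() loop, dropping
 * inode->i_lock around schedule(); see inode_wait_for_lru_isolating()
 * below for the canonical example.
 */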

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * Needs inode->i_lock held.
 */
void inode_add_lru(struct inode *inode)
{
	__inode_add_lru(inode, false);
}

static void inode_lru_list_del(struct inode *inode)
{
	if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
		this_cpu_dec(nr_unused);
}

static void inode_pin_lru_isolating(struct inode *inode)
{
	lockdep_assert_held(&inode->i_lock);
	WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
	inode->i_state |= I_LRU_ISOLATING;
}

static void inode_unpin_lru_isolating(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
	inode->i_state &= ~I_LRU_ISOLATING;
	/* Called with inode->i_lock which ensures memory ordering. */
	inode_wake_up_bit(inode, __I_LRU_ISOLATING);
	spin_unlock(&inode->i_lock);
}

static void inode_wait_for_lru_isolating(struct inode *inode)
{
	struct wait_bit_queue_entry wqe;
	struct wait_queue_head *wq_head;

	lockdep_assert_held(&inode->i_lock);
	if (!(inode->i_state & I_LRU_ISOLATING))
		return;

	wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
	for (;;) {
		prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
		/*
		 * Checking I_LRU_ISOLATING with inode->i_lock guarantees
		 * memory ordering.
		 */
		if (!(inode->i_state & I_LRU_ISOLATING))
			break;
		spin_unlock(&inode->i_lock);
		schedule();
		spin_lock(&inode->i_lock);
	}
	finish_wait(wq_head, &wqe.wq_entry);
	WARN_ON(inode->i_state & I_LRU_ISOLATING);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	spin_lock(&sb->s_inode_list_lock);
	list_add(&inode->i_sb_list, &sb->s_inodes);
	spin_unlock(&sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

static inline void inode_sb_list_del(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (!list_empty(&inode->i_sb_list)) {
		spin_lock(&sb->s_inode_list_lock);
		list_del_init(&inode->i_sb_list);
		spin_unlock(&sb->s_inode_list_lock);
	}
}

static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
	unsigned long tmp;

	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
			L1_CACHE_BYTES;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
	return tmp & i_hash_mask;
}

/**
 * __insert_inode_hash - hash an inode
 * @inode: unhashed inode
 * @hashval: unsigned long value used to locate this object in the
 *	inode_hashtable.
 *
 * Add an inode to the inode hash for this superblock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);

	spin_lock(&inode_hash_lock);
	spin_lock(&inode->i_lock);
	hlist_add_head_rcu(&inode->i_hash, b);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);
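
/*
 * Most filesystems use the insert_inode_hash() wrapper from <linux/fs.h>,
 * which simply passes inode->i_ino as @hashval to __insert_inode_hash().
 */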

/**
 * __remove_inode_hash - remove an inode from the hash
 * @inode: inode to unhash
 *
 * Remove an inode from the superblock.
 */
void __remove_inode_hash(struct inode *inode)
{
	spin_lock(&inode_hash_lock);
	spin_lock(&inode->i_lock);
	hlist_del_init_rcu(&inode->i_hash);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

void dump_mapping(const struct address_space *mapping)
{
	struct inode *host;
	const struct address_space_operations *a_ops;
	struct hlist_node *dentry_first;
	struct dentry *dentry_ptr;
	struct dentry dentry;
	char fname[64] = {};
	unsigned long ino;

	/*
	 * If mapping is an invalid pointer, we don't want to crash
	 * accessing it, so probe everything depending on it carefully.
	 */
	if (get_kernel_nofault(host, &mapping->host) ||
	    get_kernel_nofault(a_ops, &mapping->a_ops)) {
		pr_warn("invalid mapping:%px\n", mapping);
		return;
	}

	if (!host) {
		pr_warn("aops:%ps\n", a_ops);
		return;
	}

	if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
	    get_kernel_nofault(ino, &host->i_ino)) {
		pr_warn("aops:%ps invalid inode:%px\n", a_ops, host);
		return;
	}

	if (!dentry_first) {
		pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
		return;
	}

	dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
	if (get_kernel_nofault(dentry, dentry_ptr) ||
	    !dentry.d_parent || !dentry.d_name.name) {
		pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
				a_ops, ino, dentry_ptr);
		return;
	}

	if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0)
		strscpy(fname, "<invalid>");
	/*
	 * Even if strncpy_from_kernel_nofault() succeeded,
	 * the fname could be unreliable
	 */
	pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n",
		a_ops, ino, fname);
}

void clear_inode(struct inode *inode)
{
	/*
	 * We have to cycle the i_pages lock here because reclaim can be in the
	 * process of removing the last page (in __filemap_remove_folio())
	 * and we must not free the mapping under it.
	 */
	xa_lock_irq(&inode->i_data.i_pages);
	BUG_ON(inode->i_data.nrpages);
	/*
	 * Almost always, mapping_empty(&inode->i_data) here; but there are
	 * two known and long-standing ways in which nodes may get left behind
	 * (when deep radix-tree node allocation failed partway; or when THP
	 * collapse_file() failed). Until those two known cases are cleaned up,
	 * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
	 * nor even WARN_ON(!mapping_empty).
	 */
	xa_unlock_irq(&inode->i_data.i_pages);
	BUG_ON(!list_empty(&inode->i_data.i_private_list));
	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(inode->i_state & I_CLEAR);
	BUG_ON(!list_empty(&inode->i_wb_list));
	/* don't need i_lock here, no concurrent mods to i_state */
	inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);
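
/*
 * Note: a filesystem that provides its own ->evict_inode() is expected
 * to call truncate_inode_pages_final() and then clear_inode() from that
 * method; evict() below does so only when no ->evict_inode() is set.
 */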

/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
	const struct super_operations *op = inode->i_sb->s_op;

	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(!list_empty(&inode->i_lru));

	if (!list_empty(&inode->i_io_list))
		inode_io_list_del(inode);

	inode_sb_list_del(inode);

	spin_lock(&inode->i_lock);
	inode_wait_for_lru_isolating(inode);

	/*
	 * Wait for flusher thread to be done with the inode so that filesystem
	 * does not start destroying it while writeback is still running. Since
	 * the inode has I_FREEING set, flusher thread won't start new work on
	 * the inode. We just have to wait for running writeback to finish.
	 */
	inode_wait_for_writeback(inode);
	spin_unlock(&inode->i_lock);

	if (op->evict_inode) {
		op->evict_inode(inode);
	} else {
		truncate_inode_pages_final(&inode->i_data);
		clear_inode(inode);
	}
	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
		cd_forget(inode);

	remove_inode_hash(inode);

	/*
	 * Wake up waiters in __wait_on_freeing_inode().
	 *
	 * It is an invariant that any thread we need to wake up is already
	 * accounted for before remove_inode_hash() acquires ->i_lock -- both
	 * sides take the lock and sleep is aborted if the inode is found
	 * unhashed. Thus either the sleeper wins and goes off CPU, or removal
	 * wins and the sleeper aborts after testing with the lock.
	 *
	 * This also means we don't need any fences for the call below.
	 */
	inode_wake_up_bit(inode, __I_NEW);
	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));

	destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * Dispose-list gets a local list with local inodes in it, so it doesn't
 * need to worry about list corruption and SMP locks.
 */
static void dispose_list(struct list_head *head)
{
	while (!list_empty(head)) {
		struct inode *inode;

		inode = list_first_entry(head, struct inode, i_lru);
		list_del_init(&inode->i_lru);

		evict(inode);
		cond_resched();
	}
}

/**
 * evict_inodes - evict all evictable inodes for a superblock
 * @sb: superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained. This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 */
void evict_inodes(struct super_block *sb)
{
	struct inode *inode;
	LIST_HEAD(dispose);

again:
	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		if (icount_read(inode))
			continue;

		spin_lock(&inode->i_lock);
		if (icount_read(inode)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			continue;
		}

		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		list_add(&inode->i_lru, &dispose);

		/*
		 * We can have a ton of inodes to evict at unmount time given
		 * enough memory, check to see if we need to go to sleep for a
		 * bit so we don't livelock.
		 */
		if (need_resched()) {
			spin_unlock(&sb->s_inode_list_lock);
			cond_resched();
			dispose_list(&dispose);
			goto again;
		}
	}
	spin_unlock(&sb->s_inode_list_lock);

	dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
		struct list_lru_one *lru, void *arg)
{
	struct list_head *freeable = arg;
	struct inode *inode = container_of(item, struct inode, i_lru);

	/*
	 * We are inverting the lru lock/inode->i_lock here, so use a
	 * trylock. If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&inode->i_lock))
		return LRU_SKIP;

	/*
	 * Inodes can get referenced, redirtied, or repopulated while
	 * they're already on the LRU, and this can make them
	 * unreclaimable for a while. Remove them lazily here; iput,
	 * sync, or the last page cache deletion will requeue them.
	 */
	if (icount_read(inode) ||
	    (inode->i_state & ~I_REFERENCED) ||
	    !mapping_shrinkable(&inode->i_data)) {
		list_lru_isolate(lru, &inode->i_lru);
		spin_unlock(&inode->i_lock);
		this_cpu_dec(nr_unused);
		return LRU_REMOVED;
	}

	/* Recently referenced inodes get one more pass */
	if (inode->i_state & I_REFERENCED) {
		inode->i_state &= ~I_REFERENCED;
		spin_unlock(&inode->i_lock);
		return LRU_ROTATE;
	}

	/*
	 * On highmem systems, mapping_shrinkable() permits dropping
	 * page cache in order to free up struct inodes: lowmem might
	 * be under pressure before the cache inside the highmem zone.
	 */
	if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
		inode_pin_lru_isolating(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&lru->lock);
		if (remove_inode_buffers(inode)) {
			unsigned long reap;
			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
			if (current_is_kswapd())
				__count_vm_events(KSWAPD_INODESTEAL, reap);
			else
				__count_vm_events(PGINODESTEAL, reap);
			mm_account_reclaimed_pages(reap);
		}
		inode_unpin_lru_isolating(inode);
		return LRU_RETRY;
	}

	WARN_ON(inode->i_state & I_NEW);
	inode->i_state |= I_FREEING;
	list_lru_isolate_move(lru, &inode->i_lru, freeable);
	spin_unlock(&inode->i_lock);

	this_cpu_dec(nr_unused);
	return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 * then are freed outside inode_lock by dispose_list().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
	LIST_HEAD(freeable);
	long freed;

	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
				     inode_lru_isolate, &freeable);
	dispose_list(&freeable);
	return freed;
}

static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked);
/*
 * Called with the inode lock held.
 */
static struct inode *find_inode(struct super_block *sb,
				struct hlist_head *head,
				int (*test)(struct inode *, void *),
				void *data, bool is_inode_hash_locked)
{
	struct inode *inode = NULL;

	if (is_inode_hash_locked)
		lockdep_assert_held(&inode_hash_lock);
	else
		lockdep_assert_not_held(&inode_hash_lock);

	rcu_read_lock();
repeat:
	hlist_for_each_entry_rcu(inode, head, i_hash) {
		if (inode->i_sb != sb)
			continue;
		if (!test(inode, data))
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			__wait_on_freeing_inode(inode, is_inode_hash_locked);
			goto repeat;
		}
		if (unlikely(inode->i_state & I_CREATING)) {
			spin_unlock(&inode->i_lock);
			rcu_read_unlock();
			return ERR_PTR(-ESTALE);
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		rcu_read_unlock();
		return inode;
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
static struct inode *find_inode_fast(struct super_block *sb,
				struct hlist_head *head, unsigned long ino,
				bool is_inode_hash_locked)
{
	struct inode *inode = NULL;

	if (is_inode_hash_locked)
		lockdep_assert_held(&inode_hash_lock);
	else
		lockdep_assert_not_held(&inode_hash_lock);

	rcu_read_lock();
repeat:
	hlist_for_each_entry_rcu(inode, head, i_hash) {
		if (inode->i_ino != ino)
			continue;
		if (inode->i_sb != sb)
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			__wait_on_freeing_inode(inode, is_inode_hash_locked);
			goto repeat;
		}
		if (unlikely(inode->i_state & I_CREATING)) {
			spin_unlock(&inode->i_lock);
			rcu_read_unlock();
			return ERR_PTR(-ESTALE);
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		rcu_read_unlock();
		return inode;
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

unsigned int get_next_ino(void)
{
	unsigned int *p = &get_cpu_var(last_ino);
	unsigned int res = *p;

#ifdef CONFIG_SMP
	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
		static atomic_t shared_last_ino;
		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

		res = next - LAST_INO_BATCH;
	}
#endif

	res++;
	/* get_next_ino should not provide a 0 inode number */
	if (unlikely(!res))
		res++;
	*p = res;
	put_cpu_var(last_ino);
	return res;
}
EXPORT_SYMBOL(get_next_ino);
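
/*
 * Typical use in a pseudo-filesystem (sketch): pair get_next_ino() with
 * new_inode() when there is no on-disk inode number:
 *
 *	inode = new_inode(sb);
 *	if (inode)
 *		inode->i_ino = get_next_ino();
 */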

/**
 * new_inode - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock. The default gfp_mask
 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
 * If HIGHMEM pages are unsuitable or it is known that pages allocated
 * for the page cache are not reclaimable or migratable,
 * mapping_set_gfp_mask() must be called with suitable flags on the
 * newly created inode's mapping
 *
 */
struct inode *new_inode(struct super_block *sb)
{
	struct inode *inode;

	inode = alloc_inode(sb);
	if (inode)
		inode_sb_list_add(inode);
	return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
	if (S_ISDIR(inode->i_mode)) {
		struct file_system_type *type = inode->i_sb->s_type;

		/* Set new key only if filesystem hasn't already changed it */
		if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
			/*
			 * ensure nobody is actually holding i_rwsem
			 */
			init_rwsem(&inode->i_rwsem);
			lockdep_set_class(&inode->i_rwsem,
					  &type->i_mutex_dir_key);
		}
	}
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode: new inode to unlock
 *
 * Called when the inode is fully initialised to clear the new state of the
 * inode and wake up anyone waiting for the inode to finish initialisation.
 */
void unlock_new_inode(struct inode *inode)
{
	lockdep_annotate_inode_mutex_key(inode);
	spin_lock(&inode->i_lock);
	WARN_ON(!(inode->i_state & I_NEW));
	inode->i_state &= ~I_NEW & ~I_CREATING;
	/*
	 * Pairs with the barrier in prepare_to_wait_event() to make sure
	 * ___wait_var_event() either sees the bit cleared or
	 * waitqueue_active() check in wake_up_var() sees the waiter.
	 */
	smp_mb();
	inode_wake_up_bit(inode, __I_NEW);
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

void discard_new_inode(struct inode *inode)
{
	lockdep_annotate_inode_mutex_key(inode);
	spin_lock(&inode->i_lock);
	WARN_ON(!(inode->i_state & I_NEW));
	inode->i_state &= ~I_NEW;
	/*
	 * Pairs with the barrier in prepare_to_wait_event() to make sure
	 * ___wait_var_event() either sees the bit cleared or
	 * waitqueue_active() check in wake_up_var() sees the waiter.
	 */
	smp_mb();
	inode_wake_up_bit(inode, __I_NEW);
	spin_unlock(&inode->i_lock);
	iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);
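
/*
 * Error-path rule of thumb: if setup of a freshly allocated inode that
 * still has I_NEW set fails, drop it with discard_new_inode() rather
 * than a bare iput(), so that concurrent lookups sleeping in
 * wait_on_inode() are woken before the final reference is dropped.
 */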

/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 *
 * Lock any non-NULL argument. Passed objects must not be directories.
 * Zero, one or two objects may be locked by this function.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
	if (inode1)
		WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
	if (inode2)
		WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
	if (inode1 > inode2)
		swap(inode1, inode2);
	if (inode1)
		inode_lock(inode1);
	if (inode2 && inode2 != inode1)
		inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);
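
/*
 * Sorting the pair by inode address above gives every caller a single
 * global locking order for any two inodes, which is what prevents ABBA
 * deadlocks when two tasks lock the same pair concurrently.
 */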

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
	if (inode1) {
		WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
		inode_unlock(inode1);
	}
	if (inode2 && inode2 != inode1) {
		WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
		inode_unlock(inode2);
	}
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode: pre-allocated inode to use for insert to cache
 * @hashval: hash value (usually inode number) to get
 * @test: callback used for comparisons between inodes
 * @set: callback used to initialize a new struct inode
 * @data: opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is a
 * variant of iget5_locked() that doesn't allocate an inode.
 *
 * If the inode is not present in the cache, insert the pre-allocated inode and
 * return it locked, hashed, and with the I_NEW flag set. The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Note that both @test and @set are called with the inode_hash_lock held, so
 * they can't sleep.
 */
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
			    int (*test)(struct inode *, void *),
			    int (*set)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
	struct inode *old;

	might_sleep();

again:
	spin_lock(&inode_hash_lock);
	old = find_inode(inode->i_sb, head, test, data, true);
	if (unlikely(old)) {
		/*
		 * Uhhuh, somebody else created the same inode under us.
		 * Use the old inode instead of the preallocated one.
		 */
		spin_unlock(&inode_hash_lock);
		if (IS_ERR(old))
			return NULL;
		wait_on_inode(old);
		if (unlikely(inode_unhashed(old))) {
			iput(old);
			goto again;
		}
		return old;
	}

	if (set && unlikely(set(inode, data))) {
		spin_unlock(&inode_hash_lock);
		return NULL;
	}

	/*
	 * Return the locked inode with I_NEW set, the
	 * caller is responsible for filling in the contents
	 */
	spin_lock(&inode->i_lock);
	inode->i_state |= I_NEW;
	hlist_add_head_rcu(&inode->i_hash, head);
	spin_unlock(&inode->i_lock);

	spin_unlock(&inode_hash_lock);

	/*
	 * Add inode to the sb list if it's not already. It has I_NEW at this
	 * point, so it should be safe to test i_sb_list locklessly.
	 */
	if (list_empty(&inode->i_sb_list))
		inode_sb_list_add(inode);

	return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @hashval: hash value (usually inode number) to get
 * @test: callback used for comparisons between inodes
 * @set: callback used to initialize a new struct inode
 * @data: opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is a
 * generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not present in the cache, allocate and insert a new inode
 * and return it locked, hashed, and with the I_NEW flag set. The file system
 * gets to fill it in before unlocking it via unlock_new_inode().
 *
 * Note that both @test and @set are called with the inode_hash_lock held, so
 * they can't sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *),
		int (*set)(struct inode *, void *), void *data)
{
	struct inode *inode = ilookup5(sb, hashval, test, data);

	if (!inode) {
		struct inode *new = alloc_inode(sb);

		if (new) {
			inode = inode_insert5(new, hashval, test, set, data);
			if (unlikely(inode != new))
				destroy_inode(new);
		}
	}
	return inode;
}
EXPORT_SYMBOL(iget5_locked);
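
/*
 * Sketch of a typical caller (all myfs_* names hypothetical): the
 * filesystem supplies @test to match on its own key and @set to stash
 * that key in the new inode:
 *
 *	static int myfs_test(struct inode *inode, void *data)
 *	{
 *		return MYFS_I(inode)->key == *(u64 *)data;
 *	}
 *
 *	static int myfs_set(struct inode *inode, void *data)
 *	{
 *		MYFS_I(inode)->key = *(u64 *)data;
 *		return 0;
 *	}
 *
 *	inode = iget5_locked(sb, hash_64(key, 32), myfs_test, myfs_set, &key);
 */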

/**
 * iget5_locked_rcu - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @hashval: hash value (usually inode number) to get
 * @test: callback used for comparisons between inodes
 * @set: callback used to initialize a new struct inode
 * @data: opaque data pointer to pass to @test and @set
 *
 * This is equivalent to iget5_locked, except the @test callback must
 * tolerate the inode not being stable, including being mid-teardown.
 */
struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *),
		int (*set)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode, *new;

	might_sleep();

again:
	inode = find_inode(sb, head, test, data, false);
	if (inode) {
		if (IS_ERR(inode))
			return NULL;
		wait_on_inode(inode);
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
		return inode;
	}

	new = alloc_inode(sb);
	if (new) {
		inode = inode_insert5(new, hashval, test, set, data);
		if (unlikely(inode != new))
			destroy_inode(new);
	}
	return inode;
}
EXPORT_SYMBOL_GPL(iget5_locked_rcu);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @ino: inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	might_sleep();

again:
	inode = find_inode_fast(sb, head, ino, false);
	if (inode) {
		if (IS_ERR(inode))
			return NULL;
		wait_on_inode(inode);
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
		return inode;
	}

	inode = alloc_inode(sb);
	if (inode) {
		struct inode *old;

		spin_lock(&inode_hash_lock);
		/* We released the lock, so.. */
		old = find_inode_fast(sb, head, ino, true);
		if (!old) {
			inode->i_ino = ino;
			spin_lock(&inode->i_lock);
			inode->i_state = I_NEW;
			hlist_add_head_rcu(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			spin_unlock(&inode_hash_lock);
			inode_sb_list_add(inode);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		spin_unlock(&inode_hash_lock);
		destroy_inode(inode);
		if (IS_ERR(old))
			return NULL;
		inode = old;
		wait_on_inode(inode);
		if (unlikely(inode_unhashed(inode))) {
			iput(inode);
			goto again;
		}
	}
	return inode;
}
EXPORT_SYMBOL(iget_locked);
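
/*
 * Canonical caller pattern (sketch, filesystem-specific parts elided):
 *
 *	inode = iget_locked(sb, ino);
 *	if (!inode)
 *		return ERR_PTR(-ENOMEM);
 *	if (!(inode->i_state & I_NEW))
 *		return inode;	/* already in cache and set up *\/
 *	... read the on-disk inode and fill in the fields ...
 *	unlock_new_inode(inode);
 *	return inode;
 */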
| 1486 | |
| 1487 | /* |
| 1488 | * search the inode cache for a matching inode number. |
| 1489 | * If we find one, then the inode number we are trying to |
| 1490 | * allocate is not unique and so we should not use it. |
| 1491 | * |
| 1492 | * Returns 1 if the inode number is unique, 0 if it is not. |
| 1493 | */ |
| 1494 | static int test_inode_iunique(struct super_block *sb, unsigned long ino) |
| 1495 | { |
| 1496 | struct hlist_head *b = inode_hashtable + hash(sb, hashval: ino); |
| 1497 | struct inode *inode; |
| 1498 | |
| 1499 | hlist_for_each_entry_rcu(inode, b, i_hash) { |
| 1500 | if (inode->i_ino == ino && inode->i_sb == sb) |
| 1501 | return 0; |
| 1502 | } |
| 1503 | return 1; |
| 1504 | } |
| 1505 | |
| 1506 | /** |
| 1507 | * iunique - get a unique inode number |
| 1508 | * @sb: superblock |
| 1509 | * @max_reserved: highest reserved inode number |
| 1510 | * |
| 1511 | * Obtain an inode number that is unique on the system for a given |
| 1512 | * superblock. This is used by file systems that have no natural |
| 1513 | * permanent inode numbering system. An inode number is returned that |
| 1514 | * is higher than the reserved limit but unique. |
| 1515 | * |
| 1516 | * BUGS: |
| 1517 | * With a large number of inodes live on the file system this function |
| 1518 | * currently becomes quite slow. |
| 1519 | */ |
| 1520 | ino_t iunique(struct super_block *sb, ino_t max_reserved) |
| 1521 | { |
| 1522 | /* |
| 1523 | * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW |
| 1524 | * error if st_ino won't fit in target struct field. Use 32bit counter |
| 1525 | * here to attempt to avoid that. |
| 1526 | */ |
| 1527 | static DEFINE_SPINLOCK(iunique_lock); |
| 1528 | static unsigned int counter; |
| 1529 | ino_t res; |
| 1530 | |
| 1531 | rcu_read_lock(); |
| 1532 | spin_lock(&iunique_lock); |
| 1533 | do { |
| 1534 | if (counter <= max_reserved) |
| 1535 | counter = max_reserved + 1; |
| 1536 | res = counter++; |
| 1537 | } while (!test_inode_iunique(sb, res)); |
| 1538 | spin_unlock(&iunique_lock); |
| 1539 | rcu_read_unlock(); |
| 1540 | |
| 1541 | return res; |
| 1542 | } |
| 1543 | EXPORT_SYMBOL(iunique); |
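|      | |
|      | /* |
|      |  * A minimal usage sketch (editor's illustration, not part of this |
|      |  * file): a filesystem with no natural inode numbering might number a |
|      |  * freshly allocated in-memory inode like this. MYFS_LAST_RESERVED_INO |
|      |  * is a hypothetical constant. |
|      |  * |
|      |  *   struct inode *inode = new_inode(sb); |
|      |  * |
|      |  *   if (!inode) |
|      |  *           return -ENOMEM; |
|      |  *   inode->i_ino = iunique(sb, MYFS_LAST_RESERVED_INO); |
|      |  */ |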
| 1544 | |
| 1545 | struct inode *igrab(struct inode *inode) |
| 1546 | { |
| 1547 | spin_lock(&inode->i_lock); |
| 1548 | if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { |
| 1549 | __iget(inode); |
| 1550 | spin_unlock(&inode->i_lock); |
| 1551 | } else { |
| 1552 | spin_unlock(&inode->i_lock); |
| 1553 | /* |
| 1554 |  * Handle the case where s_op->clear_inode has not been |
| 1555 | * called yet, and somebody is calling igrab |
| 1556 | * while the inode is getting freed. |
| 1557 | */ |
| 1558 | inode = NULL; |
| 1559 | } |
| 1560 | return inode; |
| 1561 | } |
| 1562 | EXPORT_SYMBOL(igrab); |
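|      | |
|      | /* |
|      |  * Usage sketch (editor's illustration): igrab() is for contexts that |
|      |  * hold a pointer to an inode but no reference of their own and may |
|      |  * race with eviction. A NULL return means the inode is on its way out |
|      |  * and must be skipped. |
|      |  * |
|      |  *   struct inode *inode = igrab(candidate); |
|      |  * |
|      |  *   if (!inode) |
|      |  *           return; |
|      |  *   ...operate on the pinned inode... |
|      |  *   iput(inode); |
|      |  */ |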
| 1563 | |
| 1564 | /** |
| 1565 | * ilookup5_nowait - search for an inode in the inode cache |
| 1566 | * @sb: super block of file system to search |
| 1567 | * @hashval: hash value (usually inode number) to search for |
| 1568 | * @test: callback used for comparisons between inodes |
| 1569 | * @data: opaque data pointer to pass to @test |
| 1570 | * |
| 1571 | * Search for the inode specified by @hashval and @data in the inode cache. |
| 1572 | * If the inode is in the cache, the inode is returned with an incremented |
| 1573 | * reference count. |
| 1574 | * |
| 1575 | * Note: I_NEW is not waited upon so you have to be very careful what you do |
| 1576 | * with the returned inode. You probably should be using ilookup5() instead. |
| 1577 | * |
| 1578 | * Note2: @test is called with the inode_hash_lock held, so can't sleep. |
| 1579 | */ |
| 1580 | struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, |
| 1581 | int (*test)(struct inode *, void *), void *data) |
| 1582 | { |
| 1583 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
| 1584 | struct inode *inode; |
| 1585 | |
| 1586 | spin_lock(&inode_hash_lock); |
| 1587 | inode = find_inode(sb, head, test, data, true); |
| 1588 | spin_unlock(&inode_hash_lock); |
| 1589 | |
| 1590 | return IS_ERR(inode) ? NULL : inode; |
| 1591 | } |
| 1592 | EXPORT_SYMBOL(ilookup5_nowait); |
| 1593 | |
| 1594 | /** |
| 1595 | * ilookup5 - search for an inode in the inode cache |
| 1596 | * @sb: super block of file system to search |
| 1597 | * @hashval: hash value (usually inode number) to search for |
| 1598 | * @test: callback used for comparisons between inodes |
| 1599 | * @data: opaque data pointer to pass to @test |
| 1600 | * |
| 1601 | * Search for the inode specified by @hashval and @data in the inode cache, |
| 1602 | * and if the inode is in the cache, return the inode with an incremented |
| 1603 | * reference count. Waits on I_NEW before returning the inode. |
| 1605 | * |
| 1606 | * This is a generalized version of ilookup() for file systems where the |
| 1607 | * inode number is not sufficient for unique identification of an inode. |
| 1608 | * |
| 1609 | * Note: @test is called with the inode_hash_lock held, so can't sleep. |
| 1610 | */ |
| 1611 | struct inode *ilookup5(struct super_block *sb, unsigned long hashval, |
| 1612 | int (*test)(struct inode *, void *), void *data) |
| 1613 | { |
| 1614 | struct inode *inode; |
| 1615 | |
| 1616 | might_sleep(); |
| 1617 | |
| 1618 | again: |
| 1619 | inode = ilookup5_nowait(sb, hashval, test, data); |
| 1620 | if (inode) { |
| 1621 | wait_on_inode(inode); |
| 1622 | if (unlikely(inode_unhashed(inode))) { |
| 1623 | iput(inode); |
| 1624 | goto again; |
| 1625 | } |
| 1626 | } |
| 1627 | return inode; |
| 1628 | } |
| 1629 | EXPORT_SYMBOL(ilookup5); |
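|      | |
|      | /* |
|      |  * Sketch of a @test callback (editor's illustration; MYFS_I() and the |
|      |  * objectid field are hypothetical). The callback runs under |
|      |  * inode_hash_lock, so it must not sleep: |
|      |  * |
|      |  *   static int myfs_test(struct inode *inode, void *data) |
|      |  *   { |
|      |  *           u64 *objectid = data; |
|      |  * |
|      |  *           return MYFS_I(inode)->objectid == *objectid; |
|      |  *   } |
|      |  * |
|      |  *   inode = ilookup5(sb, hash_64(objectid, 32), myfs_test, &objectid); |
|      |  */ |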
| 1630 | |
| 1631 | /** |
| 1632 | * ilookup - search for an inode in the inode cache |
| 1633 | * @sb: super block of file system to search |
| 1634 | * @ino: inode number to search for |
| 1635 | * |
| 1636 | * Search for the inode @ino in the inode cache, and if the inode is in the |
| 1637 | * cache, the inode is returned with an incremented reference count. |
| 1638 | */ |
| 1639 | struct inode *ilookup(struct super_block *sb, unsigned long ino) |
| 1640 | { |
| 1641 | struct hlist_head *head = inode_hashtable + hash(sb, ino); |
| 1642 | struct inode *inode; |
| 1643 | |
| 1644 | might_sleep(); |
| 1645 | |
| 1646 | again: |
| 1647 | inode = find_inode_fast(sb, head, ino, false); |
| 1648 | |
| 1649 | if (inode) { |
| 1650 | if (IS_ERR(inode)) |
| 1651 | return NULL; |
| 1652 | wait_on_inode(inode); |
| 1653 | if (unlikely(inode_unhashed(inode))) { |
| 1654 | iput(inode); |
| 1655 | goto again; |
| 1656 | } |
| 1657 | } |
| 1658 | return inode; |
| 1659 | } |
| 1660 | EXPORT_SYMBOL(ilookup); |
| 1661 | |
| 1662 | /** |
| 1663 | * find_inode_nowait - find an inode in the inode cache |
| 1664 | * @sb: super block of file system to search |
| 1665 | * @hashval: hash value (usually inode number) to search for |
| 1666 | * @match: callback used for comparisons between inodes |
| 1667 | * @data: opaque data pointer to pass to @match |
| 1668 | * |
| 1669 | * Search for the inode specified by @hashval and @data in the inode |
| 1670 | * cache, where the helper function @match will return 0 if the inode |
| 1671 | * does not match, 1 if the inode does match, and -1 if the search |
| 1672 | * should be stopped. The @match function must be responsible for |
| 1673 | * taking the i_lock spin_lock and checking i_state for an inode being |
| 1674 | * freed or being initialized, and incrementing the reference count |
| 1675 | * before returning 1. It also must not sleep, since it is called with |
| 1676 | * the inode_hash_lock spinlock held. |
| 1677 | * |
| 1678 |  * This is an even more generalized version of ilookup5() for when the |
| 1679 |  * function must never block --- find_inode() can block in |
| 1680 |  * __wait_on_freeing_inode() --- or when the caller cannot increment |
| 1681 |  * the reference count because the resulting iput() might cause an |
| 1682 |  * inode eviction. The tradeoff is that the @match function must be |
| 1683 |  * very carefully implemented. |
| 1684 | */ |
| 1685 | struct inode *find_inode_nowait(struct super_block *sb, |
| 1686 | unsigned long hashval, |
| 1687 | int (*match)(struct inode *, unsigned long, |
| 1688 | void *), |
| 1689 | void *data) |
| 1690 | { |
| 1691 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
| 1692 | struct inode *inode, *ret_inode = NULL; |
| 1693 | int mval; |
| 1694 | |
| 1695 | spin_lock(&inode_hash_lock); |
| 1696 | hlist_for_each_entry(inode, head, i_hash) { |
| 1697 | if (inode->i_sb != sb) |
| 1698 | continue; |
| 1699 | mval = match(inode, hashval, data); |
| 1700 | if (mval == 0) |
| 1701 | continue; |
| 1702 | if (mval == 1) |
| 1703 | ret_inode = inode; |
| 1704 | goto out; |
| 1705 | } |
| 1706 | out: |
| 1707 | spin_unlock(&inode_hash_lock); |
| 1708 | return ret_inode; |
| 1709 | } |
| 1710 | EXPORT_SYMBOL(find_inode_nowait); |
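|      | |
|      | /* |
|      |  * Sketch of a conforming @match callback (editor's illustration): it |
|      |  * takes i_lock, skips inodes that are being freed or are still being |
|      |  * set up, and pins the inode before returning 1. |
|      |  * |
|      |  *   static int myfs_match(struct inode *inode, unsigned long hashval, |
|      |  *                         void *data) |
|      |  *   { |
|      |  *           if (inode->i_ino != *(unsigned long *)data) |
|      |  *                   return 0; |
|      |  *           spin_lock(&inode->i_lock); |
|      |  *           if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { |
|      |  *                   spin_unlock(&inode->i_lock); |
|      |  *                   return 0; |
|      |  *           } |
|      |  *           __iget(inode); |
|      |  *           spin_unlock(&inode->i_lock); |
|      |  *           return 1; |
|      |  *   } |
|      |  */ |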
| 1711 | |
| 1712 | /** |
| 1713 | * find_inode_rcu - find an inode in the inode cache |
| 1714 | * @sb: Super block of file system to search |
| 1715 | * @hashval: Key to hash |
| 1716 | * @test: Function to test match on an inode |
| 1717 | * @data: Data for test function |
| 1718 | * |
| 1719 | * Search for the inode specified by @hashval and @data in the inode cache, |
| 1720 | * where the helper function @test will return 0 if the inode does not match |
| 1721 | * and 1 if it does. The @test function must be responsible for taking the |
| 1722 | * i_lock spin_lock and checking i_state for an inode being freed or being |
| 1723 | * initialized. |
| 1724 | * |
| 1725 | * If successful, this will return the inode for which the @test function |
| 1726 | * returned 1 and NULL otherwise. |
| 1727 | * |
| 1728 | * The @test function is not permitted to take a ref on any inode presented. |
| 1729 | * It is also not permitted to sleep. |
| 1730 | * |
| 1731 | * The caller must hold the RCU read lock. |
| 1732 | */ |
| 1733 | struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, |
| 1734 | int (*test)(struct inode *, void *), void *data) |
| 1735 | { |
| 1736 | struct hlist_head *head = inode_hashtable + hash(sb, hashval); |
| 1737 | struct inode *inode; |
| 1738 | |
| 1739 | RCU_LOCKDEP_WARN(!rcu_read_lock_held(), |
| 1740 | "suspicious find_inode_rcu() usage"); |
| 1741 | |
| 1742 | hlist_for_each_entry_rcu(inode, head, i_hash) { |
| 1743 | if (inode->i_sb == sb && |
| 1744 | !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && |
| 1745 | test(inode, data)) |
| 1746 | return inode; |
| 1747 | } |
| 1748 | return NULL; |
| 1749 | } |
| 1750 | EXPORT_SYMBOL(find_inode_rcu); |
| 1751 | |
| 1752 | /** |
| 1753 | * find_inode_by_ino_rcu - Find an inode in the inode cache |
| 1754 | * @sb: Super block of file system to search |
| 1755 | * @ino: The inode number to match |
| 1756 | * |
| 1757 |  * Search for the inode with inode number @ino in the inode cache. Unlike |
| 1758 |  * find_inode_rcu(), no match callback is needed: an inode matches if it |
| 1759 |  * belongs to @sb, carries inode number @ino, and is not being freed. |
| 1760 |  * |
| 1761 |  * If successful, this will return the matching inode and NULL otherwise. |
| 1762 |  * |
| 1763 |  * No reference is taken on the inode returned, so the caller must be |
| 1764 |  * prepared for it to disappear as soon as the RCU read lock is dropped. |
| 1765 |  * Sleeping while traversing the hash chain is likewise not permitted. |
| 1768 | * |
| 1769 | * The caller must hold the RCU read lock. |
| 1770 | */ |
| 1771 | struct inode *find_inode_by_ino_rcu(struct super_block *sb, |
| 1772 | unsigned long ino) |
| 1773 | { |
| 1774 | struct hlist_head *head = inode_hashtable + hash(sb, ino); |
| 1775 | struct inode *inode; |
| 1776 | |
| 1777 | RCU_LOCKDEP_WARN(!rcu_read_lock_held(), |
| 1778 | "suspicious find_inode_by_ino_rcu() usage"); |
| 1779 | |
| 1780 | hlist_for_each_entry_rcu(inode, head, i_hash) { |
| 1781 | if (inode->i_ino == ino && |
| 1782 | inode->i_sb == sb && |
| 1783 | !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) |
| 1784 | return inode; |
| 1785 | } |
| 1786 | return NULL; |
| 1787 | } |
| 1788 | EXPORT_SYMBOL(find_inode_by_ino_rcu); |
| 1789 | |
| 1790 | int insert_inode_locked(struct inode *inode) |
| 1791 | { |
| 1792 | struct super_block *sb = inode->i_sb; |
| 1793 | ino_t ino = inode->i_ino; |
| 1794 | struct hlist_head *head = inode_hashtable + hash(sb, ino); |
| 1795 | |
| 1796 | might_sleep(); |
| 1797 | |
| 1798 | while (1) { |
| 1799 | struct inode *old = NULL; |
| 1800 | spin_lock(&inode_hash_lock); |
| 1801 | hlist_for_each_entry(old, head, i_hash) { |
| 1802 | if (old->i_ino != ino) |
| 1803 | continue; |
| 1804 | if (old->i_sb != sb) |
| 1805 | continue; |
| 1806 | spin_lock(&old->i_lock); |
| 1807 | if (old->i_state & (I_FREEING|I_WILL_FREE)) { |
| 1808 | spin_unlock(&old->i_lock); |
| 1809 | continue; |
| 1810 | } |
| 1811 | break; |
| 1812 | } |
| 1813 | if (likely(!old)) { |
| 1814 | spin_lock(&inode->i_lock); |
| 1815 | inode->i_state |= I_NEW | I_CREATING; |
| 1816 | hlist_add_head_rcu(&inode->i_hash, head); |
| 1817 | spin_unlock(&inode->i_lock); |
| 1818 | spin_unlock(&inode_hash_lock); |
| 1819 | return 0; |
| 1820 | } |
| 1821 | if (unlikely(old->i_state & I_CREATING)) { |
| 1822 | spin_unlock(&old->i_lock); |
| 1823 | spin_unlock(&inode_hash_lock); |
| 1824 | return -EBUSY; |
| 1825 | } |
| 1826 | __iget(old); |
| 1827 | spin_unlock(&old->i_lock); |
| 1828 | spin_unlock(&inode_hash_lock); |
| 1829 | wait_on_inode(old); |
| 1830 | if (unlikely(!inode_unhashed(old))) { |
| 1831 | iput(old); |
| 1832 | return -EBUSY; |
| 1833 | } |
| 1834 | iput(old); |
| 1835 | } |
| 1836 | } |
| 1837 | EXPORT_SYMBOL(insert_inode_locked); |
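|      | |
|      | /* |
|      |  * Creation-side sketch (editor's illustration, error paths |
|      |  * abbreviated; myfs_fill_inode() is hypothetical): |
|      |  * |
|      |  *   inode = new_inode(sb); |
|      |  *   if (!inode) |
|      |  *           return -ENOMEM; |
|      |  *   inode->i_ino = ino; |
|      |  *   if (insert_inode_locked(inode) < 0) { |
|      |  *           iput(inode); |
|      |  *           return -EBUSY; |
|      |  *   } |
|      |  *   myfs_fill_inode(inode); |
|      |  *   unlock_new_inode(inode); |
|      |  */ |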
| 1838 | |
| 1839 | int insert_inode_locked4(struct inode *inode, unsigned long hashval, |
| 1840 | int (*test)(struct inode *, void *), void *data) |
| 1841 | { |
| 1842 | struct inode *old; |
| 1843 | |
| 1844 | might_sleep(); |
| 1845 | |
| 1846 | inode->i_state |= I_CREATING; |
| 1847 | old = inode_insert5(inode, hashval, test, NULL, data); |
| 1848 | |
| 1849 | if (old != inode) { |
| 1850 | iput(old); |
| 1851 | return -EBUSY; |
| 1852 | } |
| 1853 | return 0; |
| 1854 | } |
| 1855 | EXPORT_SYMBOL(insert_inode_locked4); |
| 1856 | |
| 1857 | |
| 1858 | int inode_just_drop(struct inode *inode) |
| 1859 | { |
| 1860 | return 1; |
| 1861 | } |
| 1862 | EXPORT_SYMBOL(inode_just_drop); |
| 1863 | |
| 1864 | /* |
| 1865 | * Called when we're dropping the last reference |
| 1866 | * to an inode. |
| 1867 | * |
| 1868 | * Call the FS "drop_inode()" function, defaulting to |
| 1869 | * the legacy UNIX filesystem behaviour. If it tells |
| 1870 |  * us to evict the inode, do so. Otherwise, retain the |
| 1871 |  * inode in the cache if the fs is alive, or sync and |
| 1872 |  * evict it if the fs is shutting down. |
| 1873 | */ |
| 1874 | static void iput_final(struct inode *inode) |
| 1875 | { |
| 1876 | struct super_block *sb = inode->i_sb; |
| 1877 | const struct super_operations *op = inode->i_sb->s_op; |
| 1878 | unsigned long state; |
| 1879 | int drop; |
| 1880 | |
| 1881 | WARN_ON(inode->i_state & I_NEW); |
| 1882 | |
| 1883 | if (op->drop_inode) |
| 1884 | drop = op->drop_inode(inode); |
| 1885 | else |
| 1886 | drop = inode_generic_drop(inode); |
| 1887 | |
| 1888 | if (!drop && |
| 1889 | !(inode->i_state & I_DONTCACHE) && |
| 1890 | (sb->s_flags & SB_ACTIVE)) { |
| 1891 | __inode_add_lru(inode, true); |
| 1892 | spin_unlock(&inode->i_lock); |
| 1893 | return; |
| 1894 | } |
| 1895 | |
| 1896 | state = inode->i_state; |
| 1897 | if (!drop) { |
| 1898 | WRITE_ONCE(inode->i_state, state | I_WILL_FREE); |
| 1899 | spin_unlock(&inode->i_lock); |
| 1900 | |
| 1901 | write_inode_now(inode, 1); |
| 1902 | |
| 1903 | spin_lock(&inode->i_lock); |
| 1904 | state = inode->i_state; |
| 1905 | WARN_ON(state & I_NEW); |
| 1906 | state &= ~I_WILL_FREE; |
| 1907 | } |
| 1908 | |
| 1909 | WRITE_ONCE(inode->i_state, state | I_FREEING); |
| 1910 | if (!list_empty(&inode->i_lru)) |
| 1911 | inode_lru_list_del(inode); |
| 1912 | spin_unlock(&inode->i_lock); |
| 1913 | |
| 1914 | evict(inode); |
| 1915 | } |
| 1916 | |
| 1917 | /** |
| 1918 | * iput - put an inode |
| 1919 | * @inode: inode to put |
| 1920 | * |
| 1921 | * Puts an inode, dropping its usage count. If the inode use count hits |
| 1922 | * zero, the inode is then freed and may also be destroyed. |
| 1923 | * |
| 1924 | * Consequently, iput() can sleep. |
| 1925 | */ |
| 1926 | void iput(struct inode *inode) |
| 1927 | { |
| 1928 | might_sleep(); |
| 1929 | if (unlikely(!inode)) |
| 1930 | return; |
| 1931 | |
| 1932 | retry: |
| 1933 | lockdep_assert_not_held(&inode->i_lock); |
| 1934 | VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); |
| 1935 | /* |
| 1936 | * Note this assert is technically racy as if the count is bogusly |
| 1937 | * equal to one, then two CPUs racing to further drop it can both |
| 1938 | * conclude it's fine. |
| 1939 | */ |
| 1940 | VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); |
| 1941 | |
| 1942 | if (atomic_add_unless(&inode->i_count, -1, 1)) |
| 1943 | return; |
| 1944 | |
| 1945 | if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { |
| 1946 | trace_writeback_lazytime_iput(inode); |
| 1947 | mark_inode_dirty_sync(inode); |
| 1948 | goto retry; |
| 1949 | } |
| 1950 | |
| 1951 | spin_lock(&inode->i_lock); |
| 1952 | if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { |
| 1953 | spin_unlock(&inode->i_lock); |
| 1954 | goto retry; |
| 1955 | } |
| 1956 | |
| 1957 | if (!atomic_dec_and_test(&inode->i_count)) { |
| 1958 | spin_unlock(&inode->i_lock); |
| 1959 | return; |
| 1960 | } |
| 1961 | |
| 1962 | /* |
| 1963 | * iput_final() drops ->i_lock, we can't assert on it as the inode may |
| 1964 | * be deallocated by the time the call returns. |
| 1965 | */ |
| 1966 | iput_final(inode); |
| 1967 | } |
| 1968 | EXPORT_SYMBOL(iput); |
| 1969 | |
| 1970 | #ifdef CONFIG_BLOCK |
| 1971 | /** |
| 1972 | * bmap - find a block number in a file |
| 1973 | * @inode: inode owning the block number being requested |
| 1974 | * @block: pointer containing the block to find |
| 1975 | * |
| 1976 |  * Replaces the value in ``*block`` with the block number on the device |
| 1977 |  * that holds the data corresponding to the requested block number in the |
| 1978 |  * file. That is, asked for block 4 of inode 1, the function will replace |
| 1979 |  * the 4 in ``*block`` with the disk block, relative to the start of the |
| 1980 |  * disk, that holds that block of the file. |
| 1981 | * |
| 1982 | * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a |
| 1983 | * hole, returns 0 and ``*block`` is also set to 0. |
| 1984 | */ |
| 1985 | int bmap(struct inode *inode, sector_t *block) |
| 1986 | { |
| 1987 | if (!inode->i_mapping->a_ops->bmap) |
| 1988 | return -EINVAL; |
| 1989 | |
| 1990 | *block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block); |
| 1991 | return 0; |
| 1992 | } |
| 1993 | EXPORT_SYMBOL(bmap); |
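|      | |
|      | /* |
|      |  * Usage sketch (editor's illustration): resolve logical block 4 of a |
|      |  * file to its on-disk block, roughly what the FIBMAP ioctl does. A |
|      |  * resulting block of 0 with a 0 return value means the range is a hole. |
|      |  * |
|      |  *   sector_t block = 4; |
|      |  * |
|      |  *   if (bmap(inode, &block) == 0 && block != 0) |
|      |  *           ...block is now the device-relative block number... |
|      |  */ |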
| 1994 | #endif |
| 1995 | |
| 1996 | /* |
| 1997 | * With relative atime, only update atime if the previous atime is |
| 1998 | * earlier than or equal to either the ctime or mtime, |
| 1999 | * or if at least a day has passed since the last atime update. |
| 2000 | */ |
| 2001 | static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, |
| 2002 | struct timespec64 now) |
| 2003 | { |
| 2004 | struct timespec64 atime, mtime, ctime; |
| 2005 | |
| 2006 | if (!(mnt->mnt_flags & MNT_RELATIME)) |
| 2007 | return true; |
| 2008 | /* |
| 2009 | * Is mtime younger than or equal to atime? If yes, update atime: |
| 2010 | */ |
| 2011 | atime = inode_get_atime(inode); |
| 2012 | mtime = inode_get_mtime(inode); |
| 2013 | if (timespec64_compare(&mtime, &atime) >= 0) |
| 2014 | return true; |
| 2015 | /* |
| 2016 | * Is ctime younger than or equal to atime? If yes, update atime: |
| 2017 | */ |
| 2018 | ctime = inode_get_ctime(inode); |
| 2019 | if (timespec64_compare(&ctime, &atime) >= 0) |
| 2020 | return true; |
| 2021 | |
| 2022 | /* |
| 2023 | * Is the previous atime value older than a day? If yes, |
| 2024 | * update atime: |
| 2025 | */ |
| 2026 | if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) |
| 2027 | return true; |
| 2028 | /* |
| 2029 | * Good, we can skip the atime update: |
| 2030 | */ |
| 2031 | return false; |
| 2032 | } |
| 2033 | |
| 2034 | /** |
| 2035 | * inode_update_timestamps - update the timestamps on the inode |
| 2036 | * @inode: inode to be updated |
| 2037 |  * @flags: S_* flags that need to be updated |
| 2038 | * |
| 2039 | * The update_time function is called when an inode's timestamps need to be |
| 2040 | * updated for a read or write operation. This function handles updating the |
| 2041 | * actual timestamps. It's up to the caller to ensure that the inode is marked |
| 2042 | * dirty appropriately. |
| 2043 | * |
| 2044 | * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated, |
| 2045 | * attempt to update all three of them. S_ATIME updates can be handled |
| 2046 | * independently of the rest. |
| 2047 | * |
| 2048 | * Returns a set of S_* flags indicating which values changed. |
| 2049 | */ |
| 2050 | int inode_update_timestamps(struct inode *inode, int flags) |
| 2051 | { |
| 2052 | int updated = 0; |
| 2053 | struct timespec64 now; |
| 2054 | |
| 2055 | if (flags & (S_MTIME|S_CTIME|S_VERSION)) { |
| 2056 | struct timespec64 ctime = inode_get_ctime(inode); |
| 2057 | struct timespec64 mtime = inode_get_mtime(inode); |
| 2058 | |
| 2059 | now = inode_set_ctime_current(inode); |
| 2060 | if (!timespec64_equal(&now, &ctime)) |
| 2061 | updated |= S_CTIME; |
| 2062 | if (!timespec64_equal(&now, &mtime)) { |
| 2063 | inode_set_mtime_to_ts(inode, now); |
| 2064 | updated |= S_MTIME; |
| 2065 | } |
| 2066 | if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) |
| 2067 | updated |= S_VERSION; |
| 2068 | } else { |
| 2069 | now = current_time(inode); |
| 2070 | } |
| 2071 | |
| 2072 | if (flags & S_ATIME) { |
| 2073 | struct timespec64 atime = inode_get_atime(inode); |
| 2074 | |
| 2075 | if (!timespec64_equal(&now, &atime)) { |
| 2076 | inode_set_atime_to_ts(inode, now); |
| 2077 | updated |= S_ATIME; |
| 2078 | } |
| 2079 | } |
| 2080 | return updated; |
| 2081 | } |
| 2082 | EXPORT_SYMBOL(inode_update_timestamps); |
| 2083 | |
| 2084 | /** |
| 2085 | * generic_update_time - update the timestamps on the inode |
| 2086 | * @inode: inode to be updated |
| 2087 |  * @flags: S_* flags that need to be updated |
| 2088 |  * |
| 2089 |  * The update_time function is called when an inode's timestamps need to be |
| 2090 |  * updated for a read or write operation. In the case where any of S_MTIME, |
| 2091 |  * S_CTIME, or S_VERSION need to be updated, we attempt to update all three |
| 2092 |  * of them. S_ATIME updates can be done independently of the rest. |
| 2093 | * |
| 2094 | * Returns a S_* mask indicating which fields were updated. |
| 2095 | */ |
| 2096 | int generic_update_time(struct inode *inode, int flags) |
| 2097 | { |
| 2098 | int updated = inode_update_timestamps(inode, flags); |
| 2099 | int dirty_flags = 0; |
| 2100 | |
| 2101 | if (updated & (S_ATIME|S_MTIME|S_CTIME)) |
| 2102 | dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC; |
| 2103 | if (updated & S_VERSION) |
| 2104 | dirty_flags |= I_DIRTY_SYNC; |
| 2105 | __mark_inode_dirty(inode, dirty_flags); |
| 2106 | return updated; |
| 2107 | } |
| 2108 | EXPORT_SYMBOL(generic_update_time); |
| 2109 | |
| 2110 | /* |
| 2111 |  * This does the actual work of updating an inode's time or version. The |
| 2112 |  * caller must have called mnt_want_write() before calling this. |
| 2113 | */ |
| 2114 | int inode_update_time(struct inode *inode, int flags) |
| 2115 | { |
| 2116 | if (inode->i_op->update_time) |
| 2117 | return inode->i_op->update_time(inode, flags); |
| 2118 | generic_update_time(inode, flags); |
| 2119 | return 0; |
| 2120 | } |
| 2121 | EXPORT_SYMBOL(inode_update_time); |
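|      | |
|      | /* |
|      |  * Sketch of a custom ->update_time (editor's illustration): apply the |
|      |  * timestamps with inode_update_timestamps() and route the dirtying |
|      |  * through the filesystem's own machinery. myfs_dirty_inode() is |
|      |  * hypothetical. |
|      |  * |
|      |  *   static int myfs_update_time(struct inode *inode, int flags) |
|      |  *   { |
|      |  *           int updated = inode_update_timestamps(inode, flags); |
|      |  * |
|      |  *           if (updated) |
|      |  *                   myfs_dirty_inode(inode); |
|      |  *           return 0; |
|      |  *   } |
|      |  */ |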
| 2122 | |
| 2123 | /** |
| 2124 |  * atime_needs_update - check whether the access time needs updating |
| 2125 |  * @path: the &struct path being accessed |
| 2126 |  * @inode: inode to check |
| 2127 |  * |
| 2128 |  * Determine whether the access time on an inode should be updated. This |
| 2129 |  * honours the "noatime" superblock, mount, and inode flags, unmapped ids |
| 2130 |  * on idmapped mounts, and the relatime rules above. |
| 2131 |  */ |
| 2132 | bool atime_needs_update(const struct path *path, struct inode *inode) |
| 2133 | { |
| 2134 | struct vfsmount *mnt = path->mnt; |
| 2135 | struct timespec64 now, atime; |
| 2136 | |
| 2137 | if (inode->i_flags & S_NOATIME) |
| 2138 | return false; |
| 2139 | |
| 2140 | /* Atime updates will likely cause i_uid and i_gid to be written |
| 2141 |  * back improperly if their true value is unknown to the vfs. |
| 2142 | */ |
| 2143 | if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) |
| 2144 | return false; |
| 2145 | |
| 2146 | if (IS_NOATIME(inode)) |
| 2147 | return false; |
| 2148 | if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) |
| 2149 | return false; |
| 2150 | |
| 2151 | if (mnt->mnt_flags & MNT_NOATIME) |
| 2152 | return false; |
| 2153 | if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) |
| 2154 | return false; |
| 2155 | |
| 2156 | now = current_time(inode); |
| 2157 | |
| 2158 | if (!relatime_need_update(mnt, inode, now)) |
| 2159 | return false; |
| 2160 | |
| 2161 | atime = inode_get_atime(inode); |
| 2162 | if (timespec64_equal(&atime, &now)) |
| 2163 | return false; |
| 2164 | |
| 2165 | return true; |
| 2166 | } |
| 2167 | |
| 2168 | void touch_atime(const struct path *path) |
| 2169 | { |
| 2170 | struct vfsmount *mnt = path->mnt; |
| 2171 | struct inode *inode = d_inode(path->dentry); |
| 2172 | |
| 2173 | if (!atime_needs_update(path, inode)) |
| 2174 | return; |
| 2175 | |
| 2176 | if (!sb_start_write_trylock(inode->i_sb)) |
| 2177 | return; |
| 2178 | |
| 2179 | if (mnt_get_write_access(mnt) != 0) |
| 2180 | goto skip_update; |
| 2181 | /* |
| 2182 | * File systems can error out when updating inodes if they need to |
| 2183 | * allocate new space to modify an inode (such is the case for |
| 2184 | * Btrfs), but since we touch atime while walking down the path we |
| 2185 | * really don't care if we failed to update the atime of the file, |
| 2186 | * so just ignore the return value. |
| 2187 | * We may also fail on filesystems that have the ability to make parts |
| 2188 | * of the fs read only, e.g. subvolumes in Btrfs. |
| 2189 | */ |
| 2190 | inode_update_time(inode, S_ATIME); |
| 2191 | mnt_put_write_access(mnt); |
| 2192 | skip_update: |
| 2193 | sb_end_write(inode->i_sb); |
| 2194 | } |
| 2195 | EXPORT_SYMBOL(touch_atime); |
| 2196 | |
| 2197 | /* |
| 2198 | * Return mask of changes for notify_change() that need to be done as a |
| 2199 | * response to write or truncate. Return 0 if nothing has to be changed. |
| 2200 | * Negative value on error (change should be denied). |
| 2201 | */ |
| 2202 | int dentry_needs_remove_privs(struct mnt_idmap *idmap, |
| 2203 | struct dentry *dentry) |
| 2204 | { |
| 2205 | struct inode *inode = d_inode(dentry); |
| 2206 | int mask = 0; |
| 2207 | int ret; |
| 2208 | |
| 2209 | if (IS_NOSEC(inode)) |
| 2210 | return 0; |
| 2211 | |
| 2212 | mask = setattr_should_drop_suidgid(idmap, inode); |
| 2213 | ret = security_inode_need_killpriv(dentry); |
| 2214 | if (ret < 0) |
| 2215 | return ret; |
| 2216 | if (ret) |
| 2217 | mask |= ATTR_KILL_PRIV; |
| 2218 | return mask; |
| 2219 | } |
| 2220 | |
| 2221 | static int __remove_privs(struct mnt_idmap *idmap, |
| 2222 | struct dentry *dentry, int kill) |
| 2223 | { |
| 2224 | struct iattr newattrs; |
| 2225 | |
| 2226 | newattrs.ia_valid = ATTR_FORCE | kill; |
| 2227 | /* |
| 2228 | * Note we call this on write, so notify_change will not |
| 2229 | * encounter any conflicting delegations: |
| 2230 | */ |
| 2231 | return notify_change(idmap, dentry, &newattrs, NULL); |
| 2232 | } |
| 2233 | |
| 2234 | static int file_remove_privs_flags(struct file *file, unsigned int flags) |
| 2235 | { |
| 2236 | struct dentry *dentry = file_dentry(file); |
| 2237 | struct inode *inode = file_inode(file); |
| 2238 | int error = 0; |
| 2239 | int kill; |
| 2240 | |
| 2241 | if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) |
| 2242 | return 0; |
| 2243 | |
| 2244 | kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); |
| 2245 | if (kill < 0) |
| 2246 | return kill; |
| 2247 | |
| 2248 | if (kill) { |
| 2249 | if (flags & IOCB_NOWAIT) |
| 2250 | return -EAGAIN; |
| 2251 | |
| 2252 | error = __remove_privs(file_mnt_idmap(file), dentry, kill); |
| 2253 | } |
| 2254 | |
| 2255 | if (!error) |
| 2256 | inode_has_no_xattr(inode); |
| 2257 | return error; |
| 2258 | } |
| 2259 | |
| 2260 | /** |
| 2261 | * file_remove_privs - remove special file privileges (suid, capabilities) |
| 2262 | * @file: file to remove privileges from |
| 2263 | * |
| 2264 | * When file is modified by a write or truncation ensure that special |
| 2265 | * file privileges are removed. |
| 2266 | * |
| 2267 | * Return: 0 on success, negative errno on failure. |
| 2268 | */ |
| 2269 | int file_remove_privs(struct file *file) |
| 2270 | { |
| 2271 | return file_remove_privs_flags(file, 0); |
| 2272 | } |
| 2273 | EXPORT_SYMBOL(file_remove_privs); |
| 2274 | |
| 2275 | /** |
| 2276 | * current_time - Return FS time (possibly fine-grained) |
| 2277 | * @inode: inode. |
| 2278 | * |
| 2279 | * Return the current time truncated to the time granularity supported by |
| 2280 | * the fs, as suitable for a ctime/mtime change. If the ctime is flagged |
| 2281 | * as having been QUERIED, get a fine-grained timestamp, but don't update |
| 2282 | * the floor. |
| 2283 | * |
| 2284 | * For a multigrain inode, this is effectively an estimate of the timestamp |
| 2285 | * that a file would receive. An actual update must go through |
| 2286 | * inode_set_ctime_current(). |
| 2287 | */ |
| 2288 | struct timespec64 current_time(struct inode *inode) |
| 2289 | { |
| 2290 | struct timespec64 now; |
| 2291 | u32 cns; |
| 2292 | |
| 2293 | ktime_get_coarse_real_ts64_mg(&now); |
| 2294 | |
| 2295 | if (!is_mgtime(inode)) |
| 2296 | goto out; |
| 2297 | |
| 2298 | /* If nothing has queried it, then coarse time is fine */ |
| 2299 | cns = smp_load_acquire(&inode->i_ctime_nsec); |
| 2300 | if (cns & I_CTIME_QUERIED) { |
| 2301 | /* |
| 2302 | * If there is no apparent change, then get a fine-grained |
| 2303 | * timestamp. |
| 2304 | */ |
| 2305 | if (now.tv_nsec == (cns & ~I_CTIME_QUERIED)) |
| 2306 | ktime_get_real_ts64(&now); |
| 2307 | } |
| 2308 | out: |
| 2309 | return timestamp_truncate(now, inode); |
| 2310 | } |
| 2311 | EXPORT_SYMBOL(current_time); |
| 2312 | |
| 2313 | static int inode_needs_update_time(struct inode *inode) |
| 2314 | { |
| 2315 | struct timespec64 now, ts; |
| 2316 | int sync_it = 0; |
| 2317 | |
| 2318 | /* First try to exhaust all avenues to not sync */ |
| 2319 | if (IS_NOCMTIME(inode)) |
| 2320 | return 0; |
| 2321 | |
| 2322 | now = current_time(inode); |
| 2323 | |
| 2324 | ts = inode_get_mtime(inode); |
| 2325 | if (!timespec64_equal(&ts, &now)) |
| 2326 | sync_it |= S_MTIME; |
| 2327 | |
| 2328 | ts = inode_get_ctime(inode); |
| 2329 | if (!timespec64_equal(&ts, &now)) |
| 2330 | sync_it |= S_CTIME; |
| 2331 | |
| 2332 | if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) |
| 2333 | sync_it |= S_VERSION; |
| 2334 | |
| 2335 | return sync_it; |
| 2336 | } |
| 2337 | |
| 2338 | static int __file_update_time(struct file *file, int sync_mode) |
| 2339 | { |
| 2340 | int ret = 0; |
| 2341 | struct inode *inode = file_inode(file); |
| 2342 | |
| 2343 | /* try to update time settings */ |
| 2344 | if (!mnt_get_write_access_file(file)) { |
| 2345 | ret = inode_update_time(inode, sync_mode); |
| 2346 | mnt_put_write_access_file(file); |
| 2347 | } |
| 2348 | |
| 2349 | return ret; |
| 2350 | } |
| 2351 | |
| 2352 | /** |
| 2353 |  * file_update_time - update mtime and ctime |
| 2354 |  * @file: file accessed |
| 2355 |  * |
| 2356 |  * Update the mtime and ctime members of an inode and mark the inode for |
| 2357 |  * writeback. Note that this function is meant exclusively for usage in |
| 2358 |  * the file write path of filesystems, and filesystems may choose to |
| 2359 |  * explicitly ignore updates via this function with the S_NOCMTIME inode |
| 2360 |  * flag, e.g. for network filesystems where these timestamps are handled |
| 2361 |  * by the server. This can return an error for file systems that need to |
| 2362 |  * allocate space in order to update an inode. |
| 2363 | * |
| 2364 | * Return: 0 on success, negative errno on failure. |
| 2365 | */ |
| 2366 | int file_update_time(struct file *file) |
| 2367 | { |
| 2368 | int ret; |
| 2369 | struct inode *inode = file_inode(file); |
| 2370 | |
| 2371 | ret = inode_needs_update_time(inode); |
| 2372 | if (ret <= 0) |
| 2373 | return ret; |
| 2374 | |
| 2375 | return __file_update_time(file, ret); |
| 2376 | } |
| 2377 | EXPORT_SYMBOL(file_update_time); |
| 2378 | |
| 2379 | /** |
| 2380 | * file_modified_flags - handle mandated vfs changes when modifying a file |
| 2381 | * @file: file that was modified |
| 2382 | * @flags: kiocb flags |
| 2383 | * |
| 2384 | * When file has been modified ensure that special |
| 2385 | * file privileges are removed and time settings are updated. |
| 2386 | * |
| 2387 | * If IOCB_NOWAIT is set, special file privileges will not be removed and |
| 2388 | * time settings will not be updated. It will return -EAGAIN. |
| 2389 | * |
| 2390 | * Context: Caller must hold the file's inode lock. |
| 2391 | * |
| 2392 | * Return: 0 on success, negative errno on failure. |
| 2393 | */ |
| 2394 | static int file_modified_flags(struct file *file, int flags) |
| 2395 | { |
| 2396 | int ret; |
| 2397 | struct inode *inode = file_inode(file); |
| 2398 | |
| 2399 | /* |
| 2400 | * Clear the security bits if the process is not being run by root. |
| 2401 | * This keeps people from modifying setuid and setgid binaries. |
| 2402 | */ |
| 2403 | ret = file_remove_privs_flags(file, flags); |
| 2404 | if (ret) |
| 2405 | return ret; |
| 2406 | |
| 2407 | if (unlikely(file->f_mode & FMODE_NOCMTIME)) |
| 2408 | return 0; |
| 2409 | |
| 2410 | ret = inode_needs_update_time(inode); |
| 2411 | if (ret <= 0) |
| 2412 | return ret; |
| 2413 | if (flags & IOCB_NOWAIT) |
| 2414 | return -EAGAIN; |
| 2415 | |
| 2416 | return __file_update_time(file, ret); |
| 2417 | } |
| 2418 | |
| 2419 | /** |
| 2420 | * file_modified - handle mandated vfs changes when modifying a file |
| 2421 | * @file: file that was modified |
| 2422 | * |
| 2423 | * When file has been modified ensure that special |
| 2424 | * file privileges are removed and time settings are updated. |
| 2425 | * |
| 2426 | * Context: Caller must hold the file's inode lock. |
| 2427 | * |
| 2428 | * Return: 0 on success, negative errno on failure. |
| 2429 | */ |
| 2430 | int file_modified(struct file *file) |
| 2431 | { |
| 2432 | return file_modified_flags(file, 0); |
| 2433 | } |
| 2434 | EXPORT_SYMBOL(file_modified); |
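|      | |
|      | /* |
|      |  * Call-site sketch (editor's illustration): a ->write_iter |
|      |  * implementation runs file_modified() under the inode lock before |
|      |  * copying any data, so privileges are dropped and timestamps bumped |
|      |  * exactly once per write. |
|      |  * |
|      |  *   inode_lock(inode); |
|      |  *   ret = file_modified(file); |
|      |  *   if (!ret) |
|      |  *           ret = ...perform the actual write... |
|      |  *   inode_unlock(inode); |
|      |  */ |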
| 2435 | |
| 2436 | /** |
| 2437 | * kiocb_modified - handle mandated vfs changes when modifying a file |
| 2438 | * @iocb: iocb that was modified |
| 2439 | * |
| 2440 | * When file has been modified ensure that special |
| 2441 | * file privileges are removed and time settings are updated. |
| 2442 | * |
| 2443 | * Context: Caller must hold the file's inode lock. |
| 2444 | * |
| 2445 | * Return: 0 on success, negative errno on failure. |
| 2446 | */ |
| 2447 | int kiocb_modified(struct kiocb *iocb) |
| 2448 | { |
| 2449 | return file_modified_flags(iocb->ki_filp, iocb->ki_flags); |
| 2450 | } |
| 2451 | EXPORT_SYMBOL_GPL(kiocb_modified); |
| 2452 | |
| 2453 | int inode_needs_sync(struct inode *inode) |
| 2454 | { |
| 2455 | if (IS_SYNC(inode)) |
| 2456 | return 1; |
| 2457 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) |
| 2458 | return 1; |
| 2459 | return 0; |
| 2460 | } |
| 2461 | EXPORT_SYMBOL(inode_needs_sync); |
| 2462 | |
| 2463 | /* |
| 2464 | * If we try to find an inode in the inode hash while it is being |
| 2465 | * deleted, we have to wait until the filesystem completes its |
| 2466 | * deletion before reporting that it isn't found. This function waits |
| 2467 |  * until the deletion _might_ have completed. Callers are responsible |
| 2468 |  * for rechecking the inode state. |
| 2469 | * |
| 2470 | * It doesn't matter if I_NEW is not set initially, a call to |
| 2471 | * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list |
| 2472 | * will DTRT. |
| 2473 | */ |
| 2474 | static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) |
| 2475 | { |
| 2476 | struct wait_bit_queue_entry wqe; |
| 2477 | struct wait_queue_head *wq_head; |
| 2478 | |
| 2479 | /* |
| 2480 | * Handle racing against evict(), see that routine for more details. |
| 2481 | */ |
| 2482 | if (unlikely(inode_unhashed(inode))) { |
| 2483 | WARN_ON(is_inode_hash_locked); |
| 2484 | spin_unlock(&inode->i_lock); |
| 2485 | return; |
| 2486 | } |
| 2487 | |
| 2488 | wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); |
| 2489 | prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); |
| 2490 | spin_unlock(&inode->i_lock); |
| 2491 | rcu_read_unlock(); |
| 2492 | if (is_inode_hash_locked) |
| 2493 | spin_unlock(&inode_hash_lock); |
| 2494 | schedule(); |
| 2495 | finish_wait(wq_head, &wqe.wq_entry); |
| 2496 | if (is_inode_hash_locked) |
| 2497 | spin_lock(&inode_hash_lock); |
| 2498 | rcu_read_lock(); |
| 2499 | } |
| 2500 | |
| 2501 | static __initdata unsigned long ihash_entries; |
| 2502 | static int __init set_ihash_entries(char *str) |
| 2503 | { |
| 2504 | if (!str) |
| 2505 | return 0; |
| 2506 | ihash_entries = simple_strtoul(str, &str, 0); |
| 2507 | return 1; |
| 2508 | } |
| 2509 | __setup("ihash_entries=", set_ihash_entries); |
| 2510 | |
| 2511 | /* |
| 2512 | * Initialize the waitqueues and inode hash table. |
| 2513 | */ |
| 2514 | void __init inode_init_early(void) |
| 2515 | { |
| 2516 | /* If hashes are distributed across NUMA nodes, defer |
| 2517 | * hash allocation until vmalloc space is available. |
| 2518 | */ |
| 2519 | if (hashdist) |
| 2520 | return; |
| 2521 | |
| 2522 | inode_hashtable = |
| 2523 | alloc_large_system_hash("Inode-cache", |
| 2524 | sizeof(struct hlist_head), |
| 2525 | ihash_entries, |
| 2526 | 14, |
| 2527 | HASH_EARLY | HASH_ZERO, |
| 2528 | &i_hash_shift, |
| 2529 | &i_hash_mask, |
| 2530 | 0, |
| 2531 | 0); |
| 2532 | } |
| 2533 | |
| 2534 | void __init inode_init(void) |
| 2535 | { |
| 2536 | /* inode slab cache */ |
| 2537 | inode_cachep = kmem_cache_create("inode_cache", |
| 2538 | sizeof(struct inode), |
| 2539 | 0, |
| 2540 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| |
| 2541 | SLAB_ACCOUNT), |
| 2542 | init_once); |
| 2543 | |
| 2544 | /* Hash may have been set up in inode_init_early */ |
| 2545 | if (!hashdist) |
| 2546 | return; |
| 2547 | |
| 2548 | inode_hashtable = |
| 2549 | alloc_large_system_hash("Inode-cache", |
| 2550 | sizeof(struct hlist_head), |
| 2551 | ihash_entries, |
| 2552 | 14, |
| 2553 | HASH_ZERO, |
| 2554 | &i_hash_shift, |
| 2555 | &i_hash_mask, |
| 2556 | 0, |
| 2557 | 0); |
| 2558 | } |
| 2559 | |
| 2560 | void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) |
| 2561 | { |
| 2562 | inode->i_mode = mode; |
| 2563 | switch (inode->i_mode & S_IFMT) { |
| 2564 | case S_IFCHR: |
| 2565 | inode->i_fop = &def_chr_fops; |
| 2566 | inode->i_rdev = rdev; |
| 2567 | break; |
| 2568 | case S_IFBLK: |
| 2569 | if (IS_ENABLED(CONFIG_BLOCK)) |
| 2570 | inode->i_fop = &def_blk_fops; |
| 2571 | inode->i_rdev = rdev; |
| 2572 | break; |
| 2573 | case S_IFIFO: |
| 2574 | inode->i_fop = &pipefifo_fops; |
| 2575 | break; |
| 2576 | case S_IFSOCK: |
| 2577 | /* leave it no_open_fops */ |
| 2578 | break; |
| 2579 | default: |
| 2580 | printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" |
| 2581 | " inode %s:%lu\n", mode, inode->i_sb->s_id, |
| 2582 | inode->i_ino); |
| 2583 | break; |
| 2584 | } |
| 2585 | } |
| 2586 | EXPORT_SYMBOL(init_special_inode); |
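|      | |
|      | /* |
|      |  * Sketch of a simple in-memory ->mknod (editor's illustration, error |
|      |  * handling abbreviated), combining init_special_inode() with |
|      |  * inode_init_owner() below: |
|      |  * |
|      |  *   static int myfs_mknod(struct mnt_idmap *idmap, struct inode *dir, |
|      |  *                         struct dentry *dentry, umode_t mode, dev_t dev) |
|      |  *   { |
|      |  *           struct inode *inode = new_inode(dir->i_sb); |
|      |  * |
|      |  *           if (!inode) |
|      |  *                   return -ENOMEM; |
|      |  *           inode_init_owner(idmap, inode, dir, mode); |
|      |  *           init_special_inode(inode, inode->i_mode, dev); |
|      |  *           d_instantiate(dentry, inode); |
|      |  *           return 0; |
|      |  *   } |
|      |  */ |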
| 2587 | |
| 2588 | /** |
| 2589 | * inode_init_owner - Init uid,gid,mode for new inode according to posix standards |
| 2590 | * @idmap: idmap of the mount the inode was created from |
| 2591 | * @inode: New inode |
| 2592 | * @dir: Directory inode |
| 2593 | * @mode: mode of the new inode |
| 2594 | * |
| 2595 | * If the inode has been created through an idmapped mount the idmap of |
| 2596 | * the vfsmount must be passed through @idmap. This function will then take |
| 2597 | * care to map the inode according to @idmap before checking permissions |
| 2598 | * and initializing i_uid and i_gid. On non-idmapped mounts or if permission |
| 2599 | * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. |
| 2600 | */ |
| 2601 | void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, |
| 2602 | const struct inode *dir, umode_t mode) |
| 2603 | { |
| 2604 | inode_fsuid_set(inode, idmap); |
| 2605 | if (dir && dir->i_mode & S_ISGID) { |
| 2606 | inode->i_gid = dir->i_gid; |
| 2607 | |
| 2608 | /* Directories are special, and always inherit S_ISGID */ |
| 2609 | if (S_ISDIR(mode)) |
| 2610 | mode |= S_ISGID; |
| 2611 | } else |
| 2612 | inode_fsgid_set(inode, idmap); |
| 2613 | inode->i_mode = mode; |
| 2614 | } |
| 2615 | EXPORT_SYMBOL(inode_init_owner); |
| 2616 | |
| 2617 | /** |
| 2618 | * inode_owner_or_capable - check current task permissions to inode |
| 2619 | * @idmap: idmap of the mount the inode was found from |
| 2620 | * @inode: inode being checked |
| 2621 | * |
| 2622 | * Return true if current either has CAP_FOWNER in a namespace with the |
| 2623 | * inode owner uid mapped, or owns the file. |
| 2624 | * |
| 2625 | * If the inode has been found through an idmapped mount the idmap of |
| 2626 | * the vfsmount must be passed through @idmap. This function will then take |
| 2627 | * care to map the inode according to @idmap before checking permissions. |
| 2628 | * On non-idmapped mounts or if permission checking is to be performed on the |
| 2629 | * raw inode simply pass @nop_mnt_idmap. |
| 2630 | */ |
| 2631 | bool inode_owner_or_capable(struct mnt_idmap *idmap, |
| 2632 | const struct inode *inode) |
| 2633 | { |
| 2634 | vfsuid_t vfsuid; |
| 2635 | struct user_namespace *ns; |
| 2636 | |
| 2637 | vfsuid = i_uid_into_vfsuid(idmap, inode); |
| 2638 | if (vfsuid_eq_kuid(vfsuid, current_fsuid())) |
| 2639 | return true; |
| 2640 | |
| 2641 | ns = current_user_ns(); |
| 2642 | if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) |
| 2643 | return true; |
| 2644 | return false; |
| 2645 | } |
| 2646 | EXPORT_SYMBOL(inode_owner_or_capable); |
| 2647 | |
| 2648 | /* |
| 2649 | * Direct i/o helper functions |
| 2650 | */ |
| 2651 | bool inode_dio_finished(const struct inode *inode) |
| 2652 | { |
| 2653 | return atomic_read(&inode->i_dio_count) == 0; |
| 2654 | } |
| 2655 | EXPORT_SYMBOL(inode_dio_finished); |
| 2656 | |
| 2657 | /** |
| 2658 | * inode_dio_wait - wait for outstanding DIO requests to finish |
| 2659 | * @inode: inode to wait for |
| 2660 | * |
| 2661 | * Waits for all pending direct I/O requests to finish so that we can |
| 2662 | * proceed with a truncate or equivalent operation. |
| 2663 | * |
| 2664 | * Must be called under a lock that serializes taking new references |
| 2665 | * to i_dio_count, usually by inode->i_rwsem. |
| 2666 | */ |
| 2667 | void inode_dio_wait(struct inode *inode) |
| 2668 | { |
| 2669 | wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); |
| 2670 | } |
| 2671 | EXPORT_SYMBOL(inode_dio_wait); |
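|      | |
|      | /* |
|      |  * Usage sketch (editor's illustration): drain direct I/O under i_rwsem |
|      |  * before shrinking a file, so no DIO outlives the truncate. Taking |
|      |  * inode_lock() excludes new submitters; inode_dio_wait() then drains |
|      |  * whatever is already in flight. |
|      |  * |
|      |  *   inode_lock(inode); |
|      |  *   inode_dio_wait(inode); |
|      |  *   truncate_setsize(inode, newsize); |
|      |  *   inode_unlock(inode); |
|      |  */ |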
| 2672 | |
| 2673 | void inode_dio_wait_interruptible(struct inode *inode) |
| 2674 | { |
| 2675 | wait_var_event_interruptible(&inode->i_dio_count, |
| 2676 | inode_dio_finished(inode)); |
| 2677 | } |
| 2678 | EXPORT_SYMBOL(inode_dio_wait_interruptible); |
| 2679 | |
| 2680 | /* |
| 2681 | * inode_set_flags - atomically set some inode flags |
| 2682 | * |
| 2683 | * Note: the caller should be holding i_rwsem exclusively, or else be sure that |
| 2684 | * they have exclusive access to the inode structure (i.e., while the |
| 2685 | * inode is being instantiated). The reason for the cmpxchg() loop |
| 2686 |  * --- which wouldn't be necessary if all code paths that modify |
| 2687 |  * i_flags actually followed this rule --- is that there is at least one |
| 2688 |  * code path which doesn't today, so we use cmpxchg() out of an abundance |
| 2689 |  * of caution. |
| 2690 | * |
| 2691 | * In the long run, i_rwsem is overkill, and we should probably look |
| 2692 | * at using the i_lock spinlock to protect i_flags, and then make sure |
| 2693 | * it is so documented in include/linux/fs.h and that all code follows |
| 2694 | * the locking convention!! |
| 2695 | */ |
| 2696 | void inode_set_flags(struct inode *inode, unsigned int flags, |
| 2697 | unsigned int mask) |
| 2698 | { |
| 2699 | WARN_ON_ONCE(flags & ~mask); |
| 2700 | set_mask_bits(&inode->i_flags, mask, flags); |
| 2701 | } |
| 2702 | EXPORT_SYMBOL(inode_set_flags); |
| 2703 | |
| 2704 | void inode_nohighmem(struct inode *inode) |
| 2705 | { |
| 2706 | mapping_set_gfp_mask(inode->i_mapping, GFP_USER); |
| 2707 | } |
| 2708 | EXPORT_SYMBOL(inode_nohighmem); |
| 2709 | |
| 2710 | struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) |
| 2711 | { |
| 2712 | trace_inode_set_ctime_to_ts(inode, &ts); |
| 2713 | set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); |
| 2714 | inode->i_ctime_sec = ts.tv_sec; |
| 2715 | inode->i_ctime_nsec = ts.tv_nsec; |
| 2716 | return ts; |
| 2717 | } |
| 2718 | EXPORT_SYMBOL(inode_set_ctime_to_ts); |
| 2719 | |
| 2720 | /** |
| 2721 | * timestamp_truncate - Truncate timespec to a granularity |
| 2722 | * @t: Timespec |
| 2723 | * @inode: inode being updated |
| 2724 | * |
| 2725 | * Truncate a timespec to the granularity supported by the fs |
| 2726 | * containing the inode. Always rounds down. gran must |
| 2727 | * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). |
| 2728 | */ |
| 2729 | struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) |
| 2730 | { |
| 2731 | struct super_block *sb = inode->i_sb; |
| 2732 | unsigned int gran = sb->s_time_gran; |
| 2733 | |
| 2734 | t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); |
| 2735 | if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) |
| 2736 | t.tv_nsec = 0; |
| 2737 | |
| 2738 | /* Avoid division in the common cases 1 ns and 1 s. */ |
| 2739 | if (gran == 1) |
| 2740 | ; /* nothing */ |
| 2741 | else if (gran == NSEC_PER_SEC) |
| 2742 | t.tv_nsec = 0; |
| 2743 | else if (gran > 1 && gran < NSEC_PER_SEC) |
| 2744 | t.tv_nsec -= t.tv_nsec % gran; |
| 2745 | else |
| 2746 | WARN(1, "invalid file time granularity: %u", gran); |
| 2747 | return t; |
| 2748 | } |
| 2749 | EXPORT_SYMBOL(timestamp_truncate); |
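|      | |
|      | /* |
|      |  * Worked example (editor's illustration): with s_time_gran == 1000 |
|      |  * (microsecond resolution), a tv_nsec of 123456789 truncates to |
|      |  * 123456000, since 123456789 - (123456789 % 1000) == 123456000; with |
|      |  * s_time_gran == NSEC_PER_SEC the nanosecond part is dropped to 0. |
|      |  */ |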
| 2750 | |
| 2751 | /** |
| 2752 | * inode_set_ctime_current - set the ctime to current_time |
| 2753 | * @inode: inode |
| 2754 | * |
| 2755 | * Set the inode's ctime to the current value for the inode. Returns the |
| 2756 | * current value that was assigned. If this is not a multigrain inode, then we |
| 2757 | * set it to the later of the coarse time and floor value. |
| 2758 | * |
| 2759 | * If it is multigrain, then we first see if the coarse-grained timestamp is |
| 2760 | * distinct from what is already there. If so, then use that. Otherwise, get a |
| 2761 | * fine-grained timestamp. |
| 2762 | * |
| 2763 | * After that, try to swap the new value into i_ctime_nsec. Accept the |
| 2764 | * resulting ctime, regardless of the outcome of the swap. If it has |
| 2765 | * already been replaced, then that timestamp is later than the earlier |
| 2766 | * unacceptable one, and is thus acceptable. |
| 2767 | */ |
| 2768 | struct timespec64 inode_set_ctime_current(struct inode *inode) |
| 2769 | { |
| 2770 | struct timespec64 now; |
| 2771 | u32 cns, cur; |
| 2772 | |
| 2773 | ktime_get_coarse_real_ts64_mg(&now); |
| 2774 | now = timestamp_truncate(now, inode); |
| 2775 | |
| 2776 | /* Just return that if this is not a multigrain fs */ |
| 2777 | if (!is_mgtime(inode)) { |
| 2778 | inode_set_ctime_to_ts(inode, now); |
| 2779 | goto out; |
| 2780 | } |
| 2781 | |
| 2782 | /* |
| 2783 | * A fine-grained time is only needed if someone has queried |
| 2784 | * for timestamps, and the current coarse grained time isn't |
| 2785 | * later than what's already there. |
| 2786 | */ |
| 2787 | cns = smp_load_acquire(&inode->i_ctime_nsec); |
| 2788 | if (cns & I_CTIME_QUERIED) { |
| 2789 | struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, |
| 2790 | .tv_nsec = cns & ~I_CTIME_QUERIED }; |
| 2791 | |
| 2792 | if (timespec64_compare(&now, &ctime) <= 0) { |
| 2793 | ktime_get_real_ts64_mg(&now); |
| 2794 | now = timestamp_truncate(now, inode); |
| 2795 | mgtime_counter_inc(mg_fine_stamps); |
| 2796 | } |
| 2797 | } |
| 2798 | mgtime_counter_inc(mg_ctime_updates); |
| 2799 | |
| 2800 | /* No need to cmpxchg if it's exactly the same */ |
| 2801 | if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { |
| 2802 | trace_ctime_xchg_skip(inode, &now); |
| 2803 | goto out; |
| 2804 | } |
| 2805 | cur = cns; |
| 2806 | retry: |
| 2807 | /* Try to swap the nsec value into place. */ |
| 2808 | if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { |
| 2809 | /* If swap occurred, then we're (mostly) done */ |
| 2810 | inode->i_ctime_sec = now.tv_sec; |
| 2811 | trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); |
| 2812 | mgtime_counter_inc(mg_ctime_swaps); |
| 2813 | } else { |
| 2814 | /* |
| 2815 | * Was the change due to someone marking the old ctime QUERIED? |
| 2816 | * If so then retry the swap. This can only happen once since |
| 2817 | * the only way to clear I_CTIME_QUERIED is to stamp the inode |
| 2818 | * with a new ctime. |
| 2819 | */ |
| 2820 | if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { |
| 2821 | cns = cur; |
| 2822 | goto retry; |
| 2823 | } |
| 2824 | /* Otherwise, keep the existing ctime */ |
| 2825 | now.tv_sec = inode->i_ctime_sec; |
| 2826 | now.tv_nsec = cur & ~I_CTIME_QUERIED; |
| 2827 | } |
| 2828 | out: |
| 2829 | return now; |
| 2830 | } |
| 2831 | EXPORT_SYMBOL(inode_set_ctime_current); |
| 2832 | |
| 2833 | /** |
| 2834 | * inode_set_ctime_deleg - try to update the ctime on a delegated inode |
| 2835 | * @inode: inode to update |
| 2836 | * @update: timespec64 to set the ctime |
| 2837 | * |
| 2838 | * Attempt to atomically update the ctime on behalf of a delegation holder. |
| 2839 | * |
| 2840 | * The nfs server can call back the holder of a delegation to get updated |
| 2841 | * inode attributes, including the mtime. When updating the mtime, update |
| 2842 | * the ctime to a value at least equal to that. |
| 2843 | * |
| 2844 | * This can race with concurrent updates to the inode, in which |
| 2845 | * case the update is skipped. |
| 2846 | * |
| 2847 | * Note that this works even when multigrain timestamps are not enabled, |
| 2848 | * so it is used in either case. |
| 2849 | */ |
| 2850 | struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) |
| 2851 | { |
| 2852 | struct timespec64 now, cur_ts; |
| 2853 | u32 cur, old; |
| 2854 | |
| 2855 | /* pairs with try_cmpxchg below */ |
| 2856 | cur = smp_load_acquire(&inode->i_ctime_nsec); |
| 2857 | cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; |
| 2858 | cur_ts.tv_sec = inode->i_ctime_sec; |
| 2859 | |
| 2860 | /* If the update is older than the existing value, skip it. */ |
| 2861 | if (timespec64_compare(&update, &cur_ts) <= 0) |
| 2862 | return cur_ts; |
| 2863 | |
| 2864 | ktime_get_coarse_real_ts64_mg(&now); |
| 2865 | |
| 2866 | /* Clamp the update to "now" if it's in the future */ |
| 2867 | if (timespec64_compare(&update, &now) > 0) |
| 2868 | update = now; |
| 2869 | |
| 2870 | update = timestamp_truncate(update, inode); |
| 2871 | |
| 2872 | /* No need to update if the values are already the same */ |
| 2873 | if (timespec64_equal(&update, &cur_ts)) |
| 2874 | return cur_ts; |
| 2875 | |
| 2876 | /* |
| 2877 | * Try to swap the nsec value into place. If it fails, that means |
| 2878 | * it raced with an update due to a write or similar activity. That |
| 2879 | * stamp takes precedence, so just skip the update. |
| 2880 | */ |
| 2881 | retry: |
| 2882 | old = cur; |
| 2883 | if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { |
| 2884 | inode->i_ctime_sec = update.tv_sec; |
| 2885 | mgtime_counter_inc(mg_ctime_swaps); |
| 2886 | return update; |
| 2887 | } |
| 2888 | |
| 2889 | /* |
| 2890 | * Was the change due to another task marking the old ctime QUERIED? |
| 2891 | * |
| 2892 | * If so, then retry the swap. This can only happen once since |
| 2893 | * the only way to clear I_CTIME_QUERIED is to stamp the inode |
| 2894 | * with a new ctime. |
| 2895 | */ |
| 2896 | if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) |
| 2897 | goto retry; |
| 2898 | |
| 2899 | /* Otherwise, it was a new timestamp. */ |
| 2900 | cur_ts.tv_sec = inode->i_ctime_sec; |
| 2901 | cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; |
| 2902 | return cur_ts; |
| 2903 | } |
| 2904 | EXPORT_SYMBOL(inode_set_ctime_deleg); |
| 2905 | |
| 2906 | /** |
| 2907 | * in_group_or_capable - check whether caller is CAP_FSETID privileged |
| 2908 | * @idmap: idmap of the mount @inode was found from |
| 2909 | * @inode: inode to check |
| 2910 | * @vfsgid: the new/current vfsgid of @inode |
| 2911 | * |
| 2912 | * Check whether @vfsgid is in the caller's group list or if the caller is |
| 2913 | * privileged with CAP_FSETID over @inode. This can be used to determine |
| 2914 | * whether the setgid bit can be kept or must be dropped. |
| 2915 | * |
| 2916 | * Return: true if the caller is sufficiently privileged, false if not. |
| 2917 | */ |
| 2918 | bool in_group_or_capable(struct mnt_idmap *idmap, |
| 2919 | const struct inode *inode, vfsgid_t vfsgid) |
| 2920 | { |
| 2921 | if (vfsgid_in_group_p(vfsgid)) |
| 2922 | return true; |
| 2923 | if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) |
| 2924 | return true; |
| 2925 | return false; |
| 2926 | } |
| 2927 | EXPORT_SYMBOL(in_group_or_capable); |
| 2928 | |
| 2929 | /** |
| 2930 | * mode_strip_sgid - handle the sgid bit for non-directories |
| 2931 | * @idmap: idmap of the mount the inode was created from |
| 2932 | * @dir: parent directory inode |
| 2933 | * @mode: mode of the file to be created in @dir |
| 2934 | * |
| 2935 | * If the @mode of the new file has both the S_ISGID and S_IXGRP bit |
| 2936 | * raised and @dir has the S_ISGID bit raised ensure that the caller is |
| 2937 | * either in the group of the parent directory or they have CAP_FSETID |
| 2938 | * in their user namespace and are privileged over the parent directory. |
| 2939 | * In all other cases, strip the S_ISGID bit from @mode. |
| 2940 | * |
| 2941 | * Return: the new mode to use for the file |
| 2942 | */ |
| 2943 | umode_t mode_strip_sgid(struct mnt_idmap *idmap, |
| 2944 | const struct inode *dir, umode_t mode) |
| 2945 | { |
| 2946 | if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) |
| 2947 | return mode; |
| 2948 | if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) |
| 2949 | return mode; |
| 2950 | if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) |
| 2951 | return mode; |
| 2952 | return mode & ~S_ISGID; |
| 2953 | } |
| 2954 | EXPORT_SYMBOL(mode_strip_sgid); |
| 2955 | |
| 2956 | #ifdef CONFIG_DEBUG_VFS |
| 2957 | /* |
| 2958 | * Dump an inode. |
| 2959 | * |
| 2960 | * TODO: add a proper inode dumping routine, this is a stub to get debug off the |
| 2961 | * ground. |
| 2962 | * |
| 2963 | * TODO: handle getting to fs type with get_kernel_nofault()? |
| 2964 | * See dump_mapping() above. |
| 2965 | */ |
| 2966 | void dump_inode(struct inode *inode, const char *reason) |
| 2967 | { |
| 2968 | struct super_block *sb = inode->i_sb; |
| 2969 | |
| 2970 | pr_warn("%s encountered for inode %px\n" |
| 2971 | "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n" , |
| 2972 | reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, |
| 2973 | inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); |
| 2974 | } |
| 2976 | EXPORT_SYMBOL(dump_inode); |
| 2977 | #endif |
| 2978 | |