// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/file.c
 *
 * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 * Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <linux/file_ref.h>
#include <net/sock.h>
#include <linux/init_task.h>

#include "internal.h"

static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
{
	/*
	 * If the reference count was already in the dead zone, then this
	 * put() operation is imbalanced. Warn, put the reference count back to
	 * DEAD and tell the caller to not deconstruct the object.
	 */
	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
		return false;
	}

	/*
	 * This is a put() operation on a saturated refcount. Restore the
	 * mean saturation value and tell the caller to not deconstruct the
	 * object.
	 */
	if (cnt > FILE_REF_MAXREF)
		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
	return false;
}

/**
 * __file_ref_put - Slowpath of file_ref_put()
 * @ref: Pointer to the reference count
 * @cnt: Current reference count
 *
 * Invoked when the reference count is outside of the valid zone.
 *
 * Return:
 *	True if this was the last reference with no future references
 *	possible. This signals the caller that it can safely schedule the
 *	object, which is protected by the reference counter, for
 *	deconstruction.
 *
 *	False if there are still active references or the put() raced
 *	with a concurrent get()/put() pair. Caller is not allowed to
 *	deconstruct the protected object.
 */
bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
{
	/* Did this drop the last reference? */
	if (likely(cnt == FILE_REF_NOREF)) {
		/*
		 * Carefully try to set the reference count to FILE_REF_DEAD.
		 *
		 * This can fail if a concurrent get() operation has
		 * elevated it again or the corresponding put() even marked
		 * it dead already. Both are valid situations and do not
		 * require a retry. If this fails the caller is not
		 * allowed to deconstruct the object.
		 */
		if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
			return false;

		/*
		 * The caller can safely schedule the object for
		 * deconstruction. Provide acquire ordering.
		 */
		smp_acquire__after_ctrl_dep();
		return true;
	}

	return __file_ref_put_badval(ref, cnt);
}
EXPORT_SYMBOL_GPL(__file_ref_put);
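
/*
 * A sketch of how this pairs with the file_ref_put() fastpath in
 * <linux/file_ref.h> (hedged summary, not a verbatim copy of that header):
 * the fastpath decrements ->refcnt and only falls back to __file_ref_put()
 * when the resulting count lands outside the valid zone. Callers act on the
 * combined return value, e.g.:
 *
 *	if (file_ref_put(&file->f_ref))
 *		file_free(file);	// last reference is gone
 *
 * where file_free() stands in for whatever deconstruction the caller does.
 */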

unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
unsigned int sysctl_nr_open_max =
	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
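
/*
 * Worked example for the initializer above: on 64-bit, ~(size_t)0/sizeof(void *)
 * dwarfs INT_MAX, so the cap is INT_MAX rounded down to a multiple of
 * BITS_PER_LONG, i.e. 0x7fffffc0. On 32-bit the ~(size_t)0/sizeof(void *)
 * term (0x3fffffff) is the smaller one and yields 0x3fffffe0 after rounding.
 * '& -BITS_PER_LONG' simply clears the low log2(BITS_PER_LONG) bits.
 */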

static void __free_fdtable(struct fdtable *fdt)
{
	kvfree(fdt->fd);
	kvfree(fdt->open_fds);
	kfree(fdt);
}

static void free_fdtable_rcu(struct rcu_head *rcu)
{
	__free_fdtable(container_of(rcu, struct fdtable, rcu));
}

#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))
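
/*
 * BITBIT_SIZE() sizes the second-level "full words" bitmap: one bit per
 * open_fds word. For example, with 1024 fds on a 64-bit kernel,
 * BITS_TO_LONGS(1024) = 16 words of open_fds and BITS_TO_LONGS(16) = 1,
 * so full_fds_bits needs a single long (8 bytes).
 */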

#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
 * Copy 'count' fd bits from the old table to the new table and clear the extra
 * space if any. This does not copy the file pointers. Called with the files
 * spinlock held for write.
 */
static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
				   unsigned int copy_words)
{
	unsigned int nwords = fdt_words(nfdt);

	bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
	bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
			copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
	bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
			copy_words, nwords);
}

/*
 * Copy all file descriptors from the old table to the new, expanded table and
 * clear the extra space. Called with the files spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
	size_t cpy, set;

	BUG_ON(nfdt->max_fds < ofdt->max_fds);

	cpy = ofdt->max_fds * sizeof(struct file *);
	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
	memcpy(nfdt->fd, ofdt->fd, cpy);
	memset((char *)nfdt->fd + cpy, 0, set);

	copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}

/*
 * Note how the fdtable bitmap allocations very much have to be a multiple of
 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 * 'unsigned long' in some places, but simply because that is how the Linux
 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 * they are very much "bits in an array of unsigned long".
 */
static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
	struct fdtable *fdt;
	unsigned int nr;
	void *data;

	/*
	 * Figure out how many fds we actually want to support in this fdtable.
	 * Allocation steps are keyed to the size of the fdarray, since it
	 * grows far faster than any of the other dynamic data. We try to fit
	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
	 * and growing in powers of two from there on. Since we are called
	 * only with slots_wanted > BITS_PER_LONG (the embedded instance in
	 * files->fdtab already gives BITS_PER_LONG slots), the above boils
	 * down to
	 * 1. use the smallest power of two large enough to give us that many
	 *    slots.
	 * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is
	 *    256 slots (i.e. 1Kb fd array).
	 * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there
	 *    and we are never going to be asked for 64 or less.
	 */
	if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
		nr = 256;
	else
		nr = roundup_pow_of_two(slots_wanted);
	/*
	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
	 * had been set lower between the check in expand_files() and here.
	 *
	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
	 * bitmaps handling below becomes unpleasant, to put it mildly...
	 */
	if (unlikely(nr > sysctl_nr_open)) {
		nr = round_down(sysctl_nr_open, BITS_PER_LONG);
		if (nr < slots_wanted)
			return ERR_PTR(-EMFILE);
	}

	/*
	 * Check if the allocation size would exceed INT_MAX. kvmalloc_array()
	 * and kvmalloc() will warn if the allocation size is greater than
	 * INT_MAX, as filp_cachep objects are not __GFP_NOWARN.
	 *
	 * This can happen when sysctl_nr_open is set to a very high value and
	 * a process tries to use a file descriptor near that limit. For example,
	 * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what
	 * systemd typically sets it to - then trying to use a file descriptor
	 * close to that value will require allocating a file descriptor table
	 * that exceeds 8GB in size.
	 */
	if (unlikely(nr > INT_MAX / sizeof(struct file *)))
		return ERR_PTR(-EMFILE);

	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
	if (!fdt)
		goto out;
	fdt->max_fds = nr;
	data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
	if (!data)
		goto out_fdt;
	fdt->fd = data;

	data = kvmalloc(max_t(size_t,
			      2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
			GFP_KERNEL_ACCOUNT);
	if (!data)
		goto out_arr;
	fdt->open_fds = data;
	data += nr / BITS_PER_BYTE;
	fdt->close_on_exec = data;
	data += nr / BITS_PER_BYTE;
	fdt->full_fds_bits = data;
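	/*
	 * The single bitmap allocation above is carved up as: open_fds at
	 * offset 0 (nr/8 bytes), close_on_exec right behind it (another
	 * nr/8 bytes), and full_fds_bits in the remaining BITBIT_SIZE(nr)
	 * bytes. E.g. for nr == 256 on 64-bit that is 32 + 32 + 8 = 72
	 * bytes; the max_t() with L1_CACHE_BYTES only matters for even
	 * smaller tables.
	 */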

	return fdt;

out_arr:
	kvfree(fdt->fd);
out_fdt:
	kfree(fdt);
out:
	return ERR_PTR(-ENOMEM);
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 0 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr + 1);

	/* make sure all fd_install() have seen resize_in_progress
	 * or have finished their rcu_read_lock_sched() section.
	 */
	if (atomic_read(&files->count) > 1)
		synchronize_rcu();

	spin_lock(&files->file_lock);
	if (IS_ERR(new_fdt))
		return PTR_ERR(new_fdt);
	cur_fdt = files_fdtable(files);
	BUG_ON(nr < cur_fdt->max_fds);
	copy_fdtable(new_fdt, cur_fdt);
	rcu_assign_pointer(files->fdt, new_fdt);
	if (cur_fdt != &files->fdtab)
		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	/* coupled with smp_rmb() in fd_install() */
	smp_wmb();
	return 0;
}
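
/*
 * Summary of the resize handshake with the lockless fd_install() path (see
 * fd_install() below): expand_files() sets ->resize_in_progress under
 * ->file_lock before calling expand_fdtable(). The synchronize_rcu() above
 * then waits until every fd_install() has either observed the flag (and will
 * fall back to taking ->file_lock) or has finished its rcu_read_lock_sched()
 * section against the old table, so copy_fdtable() cannot miss an install.
 * The final smp_wmb() pairs with the smp_rmb() in fd_install(): once an
 * installer sees ->resize_in_progress cleared again, it is guaranteed to
 * load the new, larger table.
 */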

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 on success.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *fdt;
	int error;

repeat:
	fdt = files_fdtable(files);

	/* Do we need to expand? */
	if (nr < fdt->max_fds)
		return 0;

	if (unlikely(files->resize_in_progress)) {
		spin_unlock(&files->file_lock);
		wait_event(files->resize_wait, !files->resize_in_progress);
		spin_lock(&files->file_lock);
		goto repeat;
	}

	/* Can we expand? */
	if (unlikely(nr >= sysctl_nr_open))
		return -EMFILE;

	/* All good, so we try */
	files->resize_in_progress = true;
	error = expand_fdtable(files, nr);
	files->resize_in_progress = false;

	wake_up_all(&files->resize_wait);
	return error;
}

static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
				       bool set)
{
	if (set) {
		__set_bit(fd, fdt->close_on_exec);
	} else {
		if (test_bit(fd, fdt->close_on_exec))
			__clear_bit(fd, fdt->close_on_exec);
	}
}

static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
{
	__set_bit(fd, fdt->open_fds);
	__set_close_on_exec(fd, fdt, set);
	fd /= BITS_PER_LONG;
	if (!~fdt->open_fds[fd])
		__set_bit(fd, fdt->full_fds_bits);
}

static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->open_fds);
	fd /= BITS_PER_LONG;
	if (test_bit(fd, fdt->full_fds_bits))
		__clear_bit(fd, fdt->full_fds_bits);
}
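
/*
 * Example of the two-level bookkeeping above: on a 64-bit kernel, fd 70 is
 * bit 6 of open_fds[1]. __set_open_fd() sets that bit (and the matching
 * close_on_exec bit if requested); only when open_fds[1] has become all ones
 * does it also set bit 1 of full_fds_bits, which lets find_next_fd() skip
 * the whole word. __clear_open_fd() undoes both.
 */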

static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
	return test_bit(fd, fdt->open_fds);
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * punch_hole is optional - when close_range() is asked to unshare
 * and close, we don't need to copy descriptors in that range, so
 * a smaller cloned descriptor table might suffice if the last
 * currently opened descriptor falls into that range.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
{
	unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);

	if (last == fdt->max_fds)
		return NR_OPEN_DEFAULT;
	if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
		last = find_last_bit(fdt->open_fds, punch_hole->from);
		if (last == punch_hole->from)
			return NR_OPEN_DEFAULT;
	}
	return ALIGN(last + 1, BITS_PER_LONG);
}
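
/*
 * Worked example: with BITS_PER_LONG == 64 and fd 83 as the highest open
 * descriptor, ALIGN(84, 64) keeps 128 slots. If a punch_hole of [64, 200]
 * covers that descriptor, the size is recomputed from the highest open fd
 * below 64 instead - e.g. 64 slots if fd 10 is the highest one left, or
 * NR_OPEN_DEFAULT if nothing below the hole is open. find_last_bit()
 * returning its size argument is how "no bit set" is detected above.
 */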

/*
 * Allocate a new descriptor table and copy contents from the passed in
 * instance. Returns a pointer to cloned table on success, ERR_PTR()
 * on failure. For 'punch_hole' see sane_fdtable_size().
 */
struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
{
	struct files_struct *newf;
	struct file **old_fds, **new_fds;
	unsigned int open_files, i;
	struct fdtable *old_fdt, *new_fdt;

	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
	if (!newf)
		return ERR_PTR(-ENOMEM);

	atomic_set(&newf->count, 1);

	spin_lock_init(&newf->file_lock);
	newf->resize_in_progress = false;
	init_waitqueue_head(&newf->resize_wait);
	newf->next_fd = 0;
	new_fdt = &newf->fdtab;
	new_fdt->max_fds = NR_OPEN_DEFAULT;
	new_fdt->close_on_exec = newf->close_on_exec_init;
	new_fdt->open_fds = newf->open_fds_init;
	new_fdt->full_fds_bits = newf->full_fds_bits_init;
	new_fdt->fd = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);
	old_fdt = files_fdtable(oldf);
	open_files = sane_fdtable_size(old_fdt, punch_hole);

	/*
	 * Check whether we need to allocate a larger fd array and fd set.
	 */
	while (unlikely(open_files > new_fdt->max_fds)) {
		spin_unlock(&oldf->file_lock);

		if (new_fdt != &newf->fdtab)
			__free_fdtable(new_fdt);

		new_fdt = alloc_fdtable(open_files);
		if (IS_ERR(new_fdt)) {
			kmem_cache_free(files_cachep, newf);
			return ERR_CAST(new_fdt);
		}

		/*
		 * Reacquire the oldf lock and a pointer to its fd table; it
		 * may have grown a new, bigger fd table in the meantime, and
		 * we need the latest pointer.
		 */
		spin_lock(&oldf->file_lock);
		old_fdt = files_fdtable(oldf);
		open_files = sane_fdtable_size(old_fdt, punch_hole);
	}

	copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);

	old_fds = old_fdt->fd;
	new_fds = new_fdt->fd;

	/*
	 * We may be racing against fd allocation from other threads using this
	 * files_struct, despite holding ->file_lock.
	 *
	 * alloc_fd() might have already claimed a slot, while fd_install()
	 * did not populate it yet. Note the latter operates locklessly, so
	 * the file can show up as we are walking the array below.
	 *
	 * At the same time we know no files will disappear as all other
	 * operations take the lock.
	 *
	 * Instead of trying to placate userspace racing with itself, we
	 * ref the file if we see it and mark the fd slot as unused otherwise.
	 */
	for (i = open_files; i != 0; i--) {
		struct file *f = rcu_dereference_raw(*old_fds++);
		if (f) {
			get_file(f);
		} else {
			__clear_open_fd(open_files - i, new_fdt);
		}
		rcu_assign_pointer(*new_fds++, f);
	}
	spin_unlock(&oldf->file_lock);

	/* clear the remainder */
	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

	rcu_assign_pointer(newf->fdt, new_fdt);

	return newf;
}

static struct fdtable *close_files(struct files_struct * files)
{
	/*
	 * It is safe to dereference the fd table without RCU or
	 * ->file_lock because this is the last reference to the
	 * files structure.
	 */
	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
	unsigned int i, j = 0;

	for (;;) {
		unsigned long set;
		i = j * BITS_PER_LONG;
		if (i >= fdt->max_fds)
			break;
		set = fdt->open_fds[j++];
		while (set) {
			if (set & 1) {
				struct file *file = fdt->fd[i];
				if (file) {
					filp_close(file, files);
					cond_resched();
				}
			}
			i++;
			set >>= 1;
		}
	}

	return fdt;
}

void put_files_struct(struct files_struct *files)
{
	if (atomic_dec_and_test(&files->count)) {
		struct fdtable *fdt = close_files(files);

		/* free the arrays if they are not embedded */
		if (fdt != &files->fdtab)
			__free_fdtable(fdt);
		kmem_cache_free(files_cachep, files);
	}
}

void exit_files(struct task_struct *tsk)
{
	struct files_struct * files = tsk->files;

	if (files) {
		task_lock(tsk);
		tsk->files = NULL;
		task_unlock(tsk);
		put_files_struct(files);
	}
}

struct files_struct init_files = {
	.count		= ATOMIC_INIT(1),
	.fdt		= &init_files.fdtab,
	.fdtab		= {
		.max_fds	= NR_OPEN_DEFAULT,
		.fd		= &init_files.fd_array[0],
		.close_on_exec	= init_files.close_on_exec_init,
		.open_fds	= init_files.open_fds_init,
		.full_fds_bits	= init_files.full_fds_bits_init,
	},
	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
	unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
	unsigned int maxbit = maxfd / BITS_PER_LONG;
	unsigned int bitbit = start / BITS_PER_LONG;
	unsigned int bit;

	/*
	 * Try to avoid looking at the second level bitmap
	 */
	bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
				 start & (BITS_PER_LONG - 1));
	if (bit < BITS_PER_LONG)
		return bit + bitbit * BITS_PER_LONG;

	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
	if (bitbit >= maxfd)
		return maxfd;
	if (bitbit > start)
		start = bitbit;
	return find_next_zero_bit(fdt->open_fds, maxfd, start);
}
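
/*
 * Example: searching from fd 70 on a 64-bit kernel first scans just
 * open_fds[1] from bit 6; a zero bit found there is returned directly.
 * Only if that word offers no usable zero bit does the code consult
 * full_fds_bits to skip over words known to be completely full, and then
 * fall back to a plain find_next_zero_bit() over open_fds from the
 * adjusted start.
 */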

/*
 * allocate a file descriptor, mark it busy.
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
	struct files_struct *files = current->files;
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (likely(fd < fdt->max_fds))
		fd = find_next_fd(fdt, fd);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (unlikely(fd >= end))
		goto out;

	if (unlikely(fd >= fdt->max_fds)) {
		error = expand_files(files, fd);
		if (error < 0)
			goto out;

		goto repeat;
	}

	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	__set_open_fd(fd, fdt, flags & O_CLOEXEC);
	error = fd;
	VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);

out:
	spin_unlock(&files->file_lock);
	return error;
}

int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
	return alloc_fd(0, nofile, flags);
}

int get_unused_fd_flags(unsigned flags)
{
	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
}
EXPORT_SYMBOL(get_unused_fd_flags);

static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt = files_fdtable(files);
	__clear_open_fd(fd, fdt);
	if (fd < files->next_fd)
		files->next_fd = fd;
}

void put_unused_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	spin_lock(&files->file_lock);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/**
 * fd_install - install a file pointer in the fd array
 * @fd: file descriptor to install the file in
 * @file: the file to install
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;

	if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
		return;

	rcu_read_lock_sched();

	if (unlikely(files->resize_in_progress)) {
		rcu_read_unlock_sched();
		spin_lock(&files->file_lock);
		fdt = files_fdtable(files);
		VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
		rcu_assign_pointer(fdt->fd[fd], file);
		spin_unlock(&files->file_lock);
		return;
	}
	/* coupled with smp_wmb() in expand_fdtable() */
	smp_rmb();
	fdt = rcu_dereference_sched(files->fdt);
	VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
	rcu_assign_pointer(fdt->fd[fd], file);
	rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);
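
/*
 * A sketch of the usual caller pattern for the two helpers above (not code
 * from this file; setup_file() stands in for whatever produces the struct
 * file, e.g. anon_inode_getfile()):
 *
 *	fd = get_unused_fd_flags(O_CLOEXEC);
 *	if (fd < 0)
 *		return fd;
 *	file = setup_file();
 *	if (IS_ERR(file)) {
 *		put_unused_fd(fd);
 *		return PTR_ERR(file);
 *	}
 *	fd_install(fd, file);	// consumes the file reference
 *	return fd;
 *
 * Once fd_install() has run, userspace owns the descriptor and this path
 * must not fput() the file anymore.
 */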

/**
 * file_close_fd_locked - return file associated with fd
 * @files: file struct to retrieve file from
 * @fd: file descriptor to retrieve file for
 *
 * Doesn't take a separate reference count.
 *
 * Context: files_lock must be held.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
{
	struct fdtable *fdt = files_fdtable(files);
	struct file *file;

	lockdep_assert_held(&files->file_lock);

	if (fd >= fdt->max_fds)
		return NULL;

	fd = array_index_nospec(fd, fdt->max_fds);
	file = rcu_dereference_raw(fdt->fd[fd]);
	if (file) {
		rcu_assign_pointer(fdt->fd[fd], NULL);
		__put_unused_fd(files, fd);
	}
	return file;
}

int close_fd(unsigned fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	spin_lock(&files->file_lock);
	file = file_close_fd_locked(files, fd);
	spin_unlock(&files->file_lock);
	if (!file)
		return -EBADF;

	return filp_close(file, files);
}
EXPORT_SYMBOL(close_fd);

/**
 * last_fd - return last valid index into fd table
 * @fdt: File descriptor table.
 *
 * Context: Either rcu read lock or files_lock must be held.
 *
 * Returns: Last valid index into fdtable.
 */
static inline unsigned last_fd(struct fdtable *fdt)
{
	return fdt->max_fds - 1;
}

static inline void __range_cloexec(struct files_struct *cur_fds,
				   unsigned int fd, unsigned int max_fd)
{
	struct fdtable *fdt;

	/* make sure we're using the correct maximum value */
	spin_lock(&cur_fds->file_lock);
	fdt = files_fdtable(cur_fds);
	max_fd = min(last_fd(fdt), max_fd);
	if (fd <= max_fd)
		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
	spin_unlock(&cur_fds->file_lock);
}

static inline void __range_close(struct files_struct *files, unsigned int fd,
				 unsigned int max_fd)
{
	struct file *file;
	unsigned n;

	spin_lock(&files->file_lock);
	n = last_fd(files_fdtable(files));
	max_fd = min(max_fd, n);

	for (; fd <= max_fd; fd++) {
		file = file_close_fd_locked(files, fd);
		if (file) {
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		} else if (need_resched()) {
			spin_unlock(&files->file_lock);
			cond_resched();
			spin_lock(&files->file_lock);
		}
	}
	spin_unlock(&files->file_lock);
}

/**
 * sys_close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  CLOSE_RANGE flags.
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 * Currently, errors from closing a given file descriptor are ignored.
 */
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
		unsigned int, flags)
{
	struct task_struct *me = current;
	struct files_struct *cur_fds = me->files, *fds = NULL;

	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
		return -EINVAL;

	if (fd > max_fd)
		return -EINVAL;

	if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
		struct fd_range range = {fd, max_fd}, *punch_hole = &range;

		/*
		 * If the caller requested all fds to be made cloexec we always
		 * copy all of the file descriptors since they still want to
		 * use them.
		 */
		if (flags & CLOSE_RANGE_CLOEXEC)
			punch_hole = NULL;

		fds = dup_fd(cur_fds, punch_hole);
		if (IS_ERR(fds))
			return PTR_ERR(fds);
		/*
		 * We used to share our file descriptor table, and have now
		 * created a private one, make sure we're using it below.
		 */
		swap(cur_fds, fds);
	}

	if (flags & CLOSE_RANGE_CLOEXEC)
		__range_cloexec(cur_fds, fd, max_fd);
	else
		__range_close(cur_fds, fd, max_fd);

	if (fds) {
		/*
		 * We're done closing the files we were supposed to. Time to install
		 * the new file descriptor table and drop the old one.
		 */
		task_lock(me);
		me->files = cur_fds;
		task_unlock(me);
		put_files_struct(fds);
	}

	return 0;
}

/**
 * file_close_fd - return file associated with fd
 * @fd: file descriptor to retrieve file for
 *
 * Doesn't take a separate reference count.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	spin_lock(&files->file_lock);
	file = file_close_fd_locked(files, fd);
	spin_unlock(&files->file_lock);

	return file;
}

void do_close_on_exec(struct files_struct *files)
{
	unsigned i;
	struct fdtable *fdt;

	/* exec unshares first */
	spin_lock(&files->file_lock);
	for (i = 0; ; i++) {
		unsigned long set;
		unsigned fd = i * BITS_PER_LONG;
		fdt = files_fdtable(files);
		if (fd >= fdt->max_fds)
			break;
		set = fdt->close_on_exec[i];
		if (!set)
			continue;
		fdt->close_on_exec[i] = 0;
		for ( ; set ; fd++, set >>= 1) {
			struct file *file;
			if (!(set & 1))
				continue;
			file = fdt->fd[fd];
			if (!file)
				continue;
			rcu_assign_pointer(fdt->fd[fd], NULL);
			__put_unused_fd(files, fd);
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		}

	}
	spin_unlock(&files->file_lock);
}

static struct file *__get_file_rcu(struct file __rcu **f)
{
	struct file __rcu *file;
	struct file __rcu *file_reloaded;
	struct file __rcu *file_reloaded_cmp;

	file = rcu_dereference_raw(*f);
	if (!file)
		return NULL;

	if (unlikely(!file_ref_get(&file->f_ref)))
		return ERR_PTR(-EAGAIN);

	file_reloaded = rcu_dereference_raw(*f);

	/*
	 * Ensure that all accesses have a dependency on the load from
	 * rcu_dereference_raw() above so we get correct ordering
	 * between reuse/allocation and the pointer check below.
	 */
	file_reloaded_cmp = file_reloaded;
	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);

	/*
	 * file_ref_get() above provided a full memory barrier when we
	 * acquired a reference.
	 *
	 * This is paired with the write barrier from assigning to the
	 * __rcu protected file pointer so that if that pointer still
	 * matches the current file, we know we have successfully
	 * acquired a reference to the right file.
	 *
	 * If the pointers don't match the file has been reallocated by
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	if (file == file_reloaded_cmp)
		return file_reloaded;

	fput(file);
	return ERR_PTR(-EAGAIN);
}

/**
 * get_file_rcu - try to get a reference to a file under rcu
 * @f: the file to get a reference on
 *
 * This function tries to get a reference on @f carefully verifying that
 * @f hasn't been reused.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: Returns @f with the reference count increased or NULL.
 */
struct file *get_file_rcu(struct file __rcu **f)
{
	for (;;) {
		struct file __rcu *file;

		file = __get_file_rcu(f);
		if (!IS_ERR(file))
			return file;
	}
}
EXPORT_SYMBOL_GPL(get_file_rcu);

/**
 * get_file_active - try to get a reference to a file
 * @f: the file to get a reference on
 *
 * In contrast to get_file_rcu() the pointer itself isn't part of the
 * reference counting.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: Returns @f with the reference count increased or NULL.
 */
struct file *get_file_active(struct file **f)
{
	struct file __rcu *file;

	rcu_read_lock();
	file = __get_file_rcu(f);
	rcu_read_unlock();
	if (IS_ERR(file))
		file = NULL;
	return file;
}
EXPORT_SYMBOL_GPL(get_file_active);

static inline struct file *__fget_files_rcu(struct files_struct *files,
	unsigned int fd, fmode_t mask)
{
	for (;;) {
		struct file *file;
		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
		struct file __rcu **fdentry;
		unsigned long nospec_mask;

		/* Mask is a 0 for invalid fd's, ~0 for valid ones */
		nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);

		/*
		 * fdentry points to the 'fd' offset, or fdt->fd[0].
		 * Loading from fdt->fd[0] is always safe, because the
		 * array always exists.
		 */
		fdentry = fdt->fd + (fd & nospec_mask);

		/* Do the load, then mask any invalid result */
		file = rcu_dereference_raw(*fdentry);
		file = (void *)(nospec_mask & (unsigned long)file);
		if (unlikely(!file))
			return NULL;

		/*
		 * Ok, we have a file pointer that was valid at
		 * some point, but it might have become stale since.
		 *
		 * We need to confirm it by incrementing the refcount
		 * and then check the lookup again.
		 *
		 * file_ref_get() gives us a full memory barrier. We
		 * only really need an 'acquire' one to protect the
		 * loads below, but we don't have that.
		 */
		if (unlikely(!file_ref_get(&file->f_ref)))
			continue;

		/*
		 * Such a race can take two forms:
		 *
		 *  (a) the file ref already went down to zero and the
		 *      file hasn't been reused yet or the file count
		 *      isn't zero but the file has already been reused.
		 *
		 *  (b) the file table entry has changed under us.
		 *      Note that we don't need to re-check the 'fdt->fd'
		 *      pointer having changed, because it always goes
		 *      hand-in-hand with 'fdt'.
		 *
		 * If so, we need to put our ref and try again.
		 */
		if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
		    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
			fput(file);
			continue;
		}

		/*
		 * This isn't the file we're looking for or we're not
		 * allowed to get a reference to it.
		 */
		if (unlikely(file->f_mode & mask)) {
			fput(file);
			return NULL;
		}

		/*
		 * Ok, we have a ref to the file, and checked that it
		 * still exists.
		 */
		return file;
	}
}

static struct file *__fget_files(struct files_struct *files, unsigned int fd,
				 fmode_t mask)
{
	struct file *file;

	rcu_read_lock();
	file = __fget_files_rcu(files, fd, mask);
	rcu_read_unlock();

	return file;
}

static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
	return __fget_files(current->files, fd, mask);
}

struct file *fget(unsigned int fd)
{
	return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
	return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);

struct file *fget_task(struct task_struct *task, unsigned int fd)
{
	struct file *file = NULL;

	task_lock(task);
	if (task->files)
		file = __fget_files(task->files, fd, 0);
	task_unlock(task);

	return file;
}

struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
{
	/* Must be called with rcu_read_lock held */
	struct files_struct *files;
	unsigned int fd = *ret_fd;
	struct file *file = NULL;

	task_lock(task);
	files = task->files;
	if (files) {
		rcu_read_lock();
		for (; fd < files_fdtable(files)->max_fds; fd++) {
			file = __fget_files_rcu(files, fd, 0);
			if (file)
				break;
		}
		rcu_read_unlock();
	}
	task_unlock(task);
	*ret_fd = fd;
	return file;
}
EXPORT_SYMBOL(fget_task_next);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 *
 * (As an exception to rule 2, you can call filp_close between fget_light and
 * fput_light provided that you capture a real refcount with get_file before
 * the call to filp_close, and ensure that this real refcount is fput *after*
 * the fput_light call.)
 *
 * See also the documentation in rust/kernel/file.rs.
 */
static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	/*
	 * If another thread is concurrently calling close_fd() followed
	 * by put_files_struct(), we must not observe the old table
	 * entry combined with the new refcount - otherwise we could
	 * return a file that is concurrently being freed.
	 *
	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
	 * put_files_struct().
	 */
	if (likely(atomic_read_acquire(&files->count) == 1)) {
		file = files_lookup_fd_raw(files, fd);
		if (!file || unlikely(file->f_mode & mask))
			return EMPTY_FD;
		return BORROWED_FD(file);
	} else {
		file = __fget_files(files, fd, mask);
		if (!file)
			return EMPTY_FD;
		return CLONED_FD(file);
	}
}
struct fd fdget(unsigned int fd)
{
	return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fdget);

struct fd fdget_raw(unsigned int fd)
{
	return __fget_light(fd, 0);
}
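
/*
 * A sketch of the typical consumer of fdget()/fdget_raw() (not code from
 * this file; do_something() is a placeholder):
 *
 *	struct fd f = fdget(fd);
 *
 *	if (!fd_file(f))
 *		return -EBADF;
 *	ret = do_something(fd_file(f));
 *	fdput(f);
 *	return ret;
 *
 * fdput() only drops a reference when __fget_light() actually took one
 * (the CLONED_FD() case above); borrowed files are left alone.
 */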

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
	if (!(file->f_mode & FMODE_ATOMIC_POS))
		return false;
	if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF)
		return true;
	if (file->f_op->iterate_shared)
		return true;
	return false;
}

bool file_seek_cur_needs_f_lock(struct file *file)
{
	if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
		return false;

	/*
	 * Note that we are not guaranteed to be called after fdget_pos() on
	 * this file obj, in which case the caller is expected to provide the
	 * appropriate locking.
	 */

	return true;
}

struct fd fdget_pos(unsigned int fd)
{
	struct fd f = fdget(fd);
	struct file *file = fd_file(f);

	if (likely(file) && file_needs_f_pos_lock(file)) {
		f.word |= FDPUT_POS_UNLOCK;
		mutex_lock(&file->f_pos_lock);
	}
	return f;
}

void __f_unlock_pos(struct file *f)
{
	mutex_unlock(&f->f_pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

void set_close_on_exec(unsigned int fd, int flag)
{
	struct files_struct *files = current->files;
	spin_lock(&files->file_lock);
	__set_close_on_exec(fd, files_fdtable(files), flag);
	spin_unlock(&files->file_lock);
}

bool get_close_on_exec(unsigned int fd)
{
	bool res;
	rcu_read_lock();
	res = close_on_exec(fd, current->files);
	rcu_read_unlock();
	return res;
}

static int do_dup2(struct files_struct *files,
	struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
	struct file *tofree;
	struct fdtable *fdt;

	/*
	 * dup2() is expected to close the file installed in the target fd slot
	 * (if any). However, userspace hand-picking a fd may be racing against
	 * its own threads which happened to allocate it in open() et al but did
	 * not populate it yet.
	 *
	 * Broadly speaking we may be racing against the following:
	 * fd = get_unused_fd_flags();  // fd slot reserved, ->fd[fd] == NULL
	 * file = hard_work_goes_here();
	 * fd_install(fd, file);        // only now ->fd[fd] == file
	 *
	 * It is an invariant that a successfully allocated fd has a NULL entry
	 * in the array until the matching fd_install().
	 *
	 * If we fit the window, we have the fd to populate, yet no target file
	 * to close. Trying to ignore it and install our new file would violate
	 * the invariant and make fd_install() overwrite our file.
	 *
	 * Things can be done(tm) to handle this. However, the issue does not
	 * concern legitimate programs and we only need to make sure the kernel
	 * does not trip over it.
	 *
	 * The simplest way out is to return an error if we find ourselves here.
	 *
	 * POSIX is silent on the issue, we return -EBUSY.
	 */
	fdt = files_fdtable(files);
	fd = array_index_nospec(fd, fdt->max_fds);
	tofree = rcu_dereference_raw(fdt->fd[fd]);
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;
	get_file(file);
	rcu_assign_pointer(fdt->fd[fd], file);
	__set_open_fd(fd, fdt, flags & O_CLOEXEC);
	spin_unlock(&files->file_lock);

	if (tofree)
		filp_close(tofree, files);

	return fd;

Ebusy:
	spin_unlock(&files->file_lock);
	return -EBUSY;
}

int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
	int err;
	struct files_struct *files = current->files;

	if (!file)
		return close_fd(fd);

	if (fd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, fd);
	if (unlikely(err < 0))
		goto out_unlock;
	err = do_dup2(files, file, fd, flags);
	if (err < 0)
		return err;
	return 0;

out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

/**
 * receive_fd() - Install received file into file descriptor table
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Installs a received file into the file descriptor table, with appropriate
 * checks and count updates. Optionally writes the fd number to userspace, if
 * @ufd is non-NULL.
 *
 * This helper handles its own reference counting of the incoming
 * struct file.
 *
 * Returns newly installed fd or -ve on error.
 */
int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
	int new_fd;
	int error;

	error = security_file_receive(file);
	if (error)
		return error;

	new_fd = get_unused_fd_flags(o_flags);
	if (new_fd < 0)
		return new_fd;

	if (ufd) {
		error = put_user(new_fd, ufd);
		if (error) {
			put_unused_fd(new_fd);
			return error;
		}
	}

	fd_install(new_fd, get_file(file));
	__receive_sock(file);
	return new_fd;
}
EXPORT_SYMBOL_GPL(receive_fd);

int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
	int error;

	error = security_file_receive(file);
	if (error)
		return error;
	error = replace_fd(new_fd, file, o_flags);
	if (error)
		return error;
	__receive_sock(file);
	return new_fd;
}

static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
	int err = -EBADF;
	struct file *file;
	struct files_struct *files = current->files;

	if ((flags & ~O_CLOEXEC) != 0)
		return -EINVAL;

	if (unlikely(oldfd == newfd))
		return -EINVAL;

	if (newfd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, newfd);
	file = files_lookup_fd_locked(files, oldfd);
	if (unlikely(!file))
		goto Ebadf;
	if (unlikely(err < 0)) {
		if (err == -EMFILE)
			goto Ebadf;
		goto out_unlock;
	}
	return do_dup2(files, file, newfd, flags);

Ebadf:
	err = -EBADF;
out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	return ksys_dup3(oldfd, newfd, flags);
}

SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	if (unlikely(newfd == oldfd)) { /* corner case */
		struct files_struct *files = current->files;
		struct file *f;
		int retval = oldfd;

		rcu_read_lock();
		f = __fget_files_rcu(files, oldfd, 0);
		if (!f)
			retval = -EBADF;
		rcu_read_unlock();
		if (f)
			fput(f);
		return retval;
	}
	return ksys_dup3(oldfd, newfd, 0);
}

SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	int ret = -EBADF;
	struct file *file = fget_raw(fildes);

	if (file) {
		ret = get_unused_fd_flags(0);
		if (ret >= 0)
			fd_install(ret, file);
		else
			fput(file);
	}
	return ret;
}

int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
	unsigned long nofile = rlimit(RLIMIT_NOFILE);
	int err;
	if (from >= nofile)
		return -EINVAL;
	err = alloc_fd(from, nofile, flags);
	if (err >= 0) {
		get_file(file);
		fd_install(err, file);
	}
	return err;
}

int iterate_fd(struct files_struct *files, unsigned n,
		int (*f)(const void *, struct file *, unsigned),
		const void *p)
{
	struct fdtable *fdt;
	int res = 0;
	if (!files)
		return 0;
	spin_lock(&files->file_lock);
	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
		struct file *file;
		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
		if (!file)
			continue;
		res = f(p, file, n);
		if (res)
			break;
	}
	spin_unlock(&files->file_lock);
	return res;
}
EXPORT_SYMBOL(iterate_fd);