/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 * License: GPL
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/fs_parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <linux/uio.h>

#include <linux/uaccess.h>
#include <linux/sched/mm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/hugetlbfs.h>

static const struct address_space_operations hugetlbfs_aops;
static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };

struct hugetlbfs_fs_context {
	struct hstate		*hstate;
	unsigned long long	max_size_opt;
	unsigned long long	min_size_opt;
	long			max_hpages;
	long			nr_inodes;
	long			min_hpages;
	enum hugetlbfs_size_type max_val_type;
	enum hugetlbfs_size_type min_val_type;
	kuid_t			uid;
	kgid_t			gid;
	umode_t			mode;
};

int sysctl_hugetlb_shm_group;

enum hugetlb_param {
	Opt_gid,
	Opt_min_size,
	Opt_mode,
	Opt_nr_inodes,
	Opt_pagesize,
	Opt_size,
	Opt_uid,
};

static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
	fsparam_gid   ("gid",		Opt_gid),
	fsparam_string("min_size",	Opt_min_size),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("pagesize",	Opt_pagesize),
	fsparam_string("size",		Opt_size),
	fsparam_uid   ("uid",		Opt_uid),
	{}
};

/*
 * Mask used when checking the page offset value passed in via system
 * calls.  This value will be converted to a loff_t which is signed.
 * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
 * value.  The extra bit (- 1 in the shift value) is to take the sign
 * bit into account.
 */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))

static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);
	loff_t len, vma_len;
	int ret;
	struct hstate *h = hstate_file(file);
	vm_flags_t vm_flags;

	/*
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.  If you add
	 * any error returns here, do so after setting VM_HUGETLB, so
	 * is_vm_hugetlb_page tests below unmap_region go the right
	 * way when do_mmap unwinds (may be important on powerpc
	 * and ia64).
	 */
	vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
	vma->vm_ops = &hugetlb_vm_ops;

	/*
	 * page based offset in vm_pgoff could be sufficiently large to
	 * overflow a loff_t when converted to byte offset.  This can
	 * only happen on architectures where sizeof(loff_t) ==
	 * sizeof(unsigned long).  So, only check in those instances.
	 */
	if (sizeof(unsigned long) == sizeof(loff_t)) {
		if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
			return -EINVAL;
	}

	/* must be huge page aligned */
	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
		return -EINVAL;

	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	/* check for overflow */
	if (len < vma_len)
		return -EINVAL;

	inode_lock(inode);
	file_accessed(file);

	ret = -ENOMEM;

	vm_flags = vma->vm_flags;
	/*
	 * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
	 * reserving here. Note: only for SHM hugetlbfs file, the inode
	 * flag S_PRIVATE is set.
	 */
	if (inode->i_flags & S_PRIVATE)
		vm_flags |= VM_NORESERVE;

	if (hugetlb_reserve_pages(inode,
				vma->vm_pgoff >> huge_page_order(h),
				len >> huge_page_shift(h), vma,
				vm_flags) < 0)
		goto out;

	ret = 0;
	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
		i_size_write(inode, len);
out:
	inode_unlock(inode);

	return ret;
}

/*
 * Called under mmap_write_lock(mm).
 */

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
			  unsigned long len, unsigned long pgoff,
			  unsigned long flags)
{
	unsigned long addr0 = 0;
	struct hstate *h = hstate_file(file);

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h)))
		return -EINVAL;
	if (addr)
		addr0 = ALIGN(addr, huge_page_size(h));

	return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff,
					    flags, 0);
}

/*
 * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset.
 * Returns the maximum number of bytes one can read without touching the 1st raw
 * HWPOISON page.
 */
static size_t adjust_range_hwpoison(struct folio *folio, size_t offset,
				    size_t bytes)
{
	struct page *page = folio_page(folio, offset / PAGE_SIZE);
	size_t safe_bytes;

	if (is_raw_hwpoison_page_in_hugepage(page))
		return 0;
	/* Safe to read the remaining bytes in this page. */
	safe_bytes = PAGE_SIZE - (offset % PAGE_SIZE);
	page++;

	/* Check each remaining page as long as we are not done yet. */
	for (; safe_bytes < bytes; safe_bytes += PAGE_SIZE, page++)
		if (is_raw_hwpoison_page_in_hugepage(page))
			break;

	return min(safe_bytes, bytes);
}

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data. This provides functionality similar to filemap_read().
 */
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct hstate *h = hstate_file(file);
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

	while (iov_iter_count(to)) {
		struct folio *folio;
		size_t nr, copied, want;

		/* nr is the maximum number of bytes to copy from this page */
		nr = huge_page_size(h);
		isize = i_size_read(inode);
		if (!isize)
			break;
		end_index = (isize - 1) >> huge_page_shift(h);
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
			if (nr <= offset)
				break;
		}
		nr = nr - offset;

		/* Find the folio */
		folio = filemap_lock_hugetlb_folio(h, mapping, index);
		if (IS_ERR(folio)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
			copied = iov_iter_zero(nr, to);
		} else {
			folio_unlock(folio);

			if (!folio_test_hwpoison(folio))
				want = nr;
			else {
				/*
				 * Adjust how many bytes safe to read without
				 * touching the 1st raw HWPOISON page after
				 * offset.
				 */
				want = adjust_range_hwpoison(folio, offset, nr);
				if (want == 0) {
					folio_put(folio);
					retval = -EIO;
					break;
				}
			}

			/*
			 * We have the folio, copy it to user space buffer.
			 */
			copied = copy_folio_to_iter(folio, offset, want, to);
			folio_put(folio);
		}
		offset += copied;
		retval += copied;
		if (copied != nr && iov_iter_count(to)) {
			if (!retval)
				retval = -EFAULT;
			break;
		}
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);
	}
	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
	return retval;
}

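/*
 * hugetlbfs provides no ->write_iter in hugetlbfs_file_operations, so the
 * VFS never performs buffered writes on these inodes.  ->write_begin and
 * ->write_end only exist to fill out hugetlbfs_aops and are never expected
 * to be called.
 */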
static int hugetlbfs_write_begin(const struct kiocb *iocb,
			struct address_space *mapping,
			loff_t pos, unsigned len,
			struct folio **foliop, void **fsdata)
{
	return -EINVAL;
}

static int hugetlbfs_write_end(const struct kiocb *iocb,
			struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct folio *folio, void *fsdata)
{
	BUG();
	return -EINVAL;
}

static void hugetlb_delete_from_page_cache(struct folio *folio)
{
	folio_clear_dirty(folio);
	folio_clear_uptodate(folio);
	filemap_remove_folio(folio);
}

/*
 * Called with i_mmap_rwsem held for inode based vma maps.  This makes
 * sure vma (and vm_mm) will not go away.  We also hold the hugetlb fault
 * mutex for the page in the mapping.  So, we can not race with page being
 * faulted into the vma.
 */
static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma,
				unsigned long addr, unsigned long pfn)
{
	pte_t *ptep, pte;

	ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
	if (!ptep)
		return false;

	pte = huge_ptep_get(vma->vm_mm, addr, ptep);
	if (huge_pte_none(pte) || !pte_present(pte))
		return false;

	if (pte_pfn(pte) == pfn)
		return true;

	return false;
}

/*
 * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
 * No, because the interval tree returns us only those vmas
 * which overlap the truncated area starting at pgoff,
 * and no vma on a 32-bit arch can span beyond the 4GB.
 */
static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
{
	unsigned long offset = 0;

	if (vma->vm_pgoff < start)
		offset = (start - vma->vm_pgoff) << PAGE_SHIFT;

	return vma->vm_start + offset;
}

static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
{
	unsigned long t_end;

	if (!end)
		return vma->vm_end;

	t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
	if (t_end > vma->vm_end)
		t_end = vma->vm_end;
	return t_end;
}

/*
 * Called with hugetlb fault mutex held.  Therefore, no more mappings to
 * this folio can be created while executing the routine.
 */
static void hugetlb_unmap_file_folio(struct hstate *h,
				     struct address_space *mapping,
				     struct folio *folio, pgoff_t index)
{
	struct rb_root_cached *root = &mapping->i_mmap;
	struct hugetlb_vma_lock *vma_lock;
	unsigned long pfn = folio_pfn(folio);
	struct vm_area_struct *vma;
	unsigned long v_start;
	unsigned long v_end;
	pgoff_t start, end;

	start = index * pages_per_huge_page(h);
	end = (index + 1) * pages_per_huge_page(h);

	i_mmap_lock_write(mapping);
retry:
	vma_lock = NULL;
	vma_interval_tree_foreach(vma, root, start, end - 1) {
		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);

		if (!hugetlb_vma_maps_pfn(vma, v_start, pfn))
			continue;

		if (!hugetlb_vma_trylock_write(vma)) {
			vma_lock = vma->vm_private_data;
			/*
			 * If we can not get vma lock, we need to drop
			 * immap_sema and take locks in order.  First,
			 * take a ref on the vma_lock structure so that
			 * we can be guaranteed it will not go away when
			 * dropping immap_sema.
			 */
			kref_get(&vma_lock->refs);
			break;
		}

		unmap_hugepage_range(vma, v_start, v_end, NULL,
				     ZAP_FLAG_DROP_MARKER);
		hugetlb_vma_unlock_write(vma);
	}

	i_mmap_unlock_write(mapping);

	if (vma_lock) {
		/*
		 * Wait on vma_lock.  We know it is still valid as we have
		 * a reference.  We must 'open code' vma locking as we do
		 * not know if vma_lock is still attached to vma.
		 */
		down_write(&vma_lock->rw_sema);
		i_mmap_lock_write(mapping);

		vma = vma_lock->vma;
		if (!vma) {
			/*
			 * If lock is no longer attached to vma, then just
			 * unlock, drop our reference and retry looking for
			 * other vmas.
			 */
			up_write(&vma_lock->rw_sema);
			kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
			goto retry;
		}

		/*
		 * vma_lock is still attached to vma.  Check to see if vma
		 * still maps page and if so, unmap.
		 */
		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);
		if (hugetlb_vma_maps_pfn(vma, v_start, pfn))
			unmap_hugepage_range(vma, v_start, v_end, NULL,
					     ZAP_FLAG_DROP_MARKER);

		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
		hugetlb_vma_unlock_write(vma);

		goto retry;
	}
}

static void
hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
		      zap_flags_t zap_flags)
{
	struct vm_area_struct *vma;

	/*
	 * end == 0 indicates that the entire range after start should be
	 * unmapped.  Note, end is exclusive, whereas the interval tree takes
	 * an inclusive "last".
	 */
	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
		unsigned long v_start;
		unsigned long v_end;

		if (!hugetlb_vma_trylock_write(vma))
			continue;

		/*
		 * Skip VMAs without shareable locks. Per the design in commit
		 * 40549ba8f8e0, these will be handled by remove_inode_hugepages()
		 * called after this function with proper locking.
		 */
		if (!__vma_shareable_lock(vma))
			goto skip;

		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);

		unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);

		/*
		 * Note that vma lock only exists for shared/non-private
		 * vmas.  Therefore, lock is not held when calling
		 * unmap_hugepage_range for private vmas.
		 */
skip:
		hugetlb_vma_unlock_write(vma);
	}
}

/*
 * Called with hugetlb fault mutex held.
 * Returns true if page was actually removed, false otherwise.
 */
static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
					struct address_space *mapping,
					struct folio *folio, pgoff_t index,
					bool truncate_op)
{
	bool ret = false;

	/*
	 * If folio is mapped, it was faulted in after being
	 * unmapped in caller or hugetlb_vmdelete_list() skips
	 * unmapping it due to fail to grab lock.  Unmap (again)
	 * while holding the fault mutex.  The mutex will prevent
	 * faults until we finish removing the folio.  Hold folio
	 * lock to guarantee no concurrent migration.
	 */
	folio_lock(folio);
	if (unlikely(folio_mapped(folio)))
		hugetlb_unmap_file_folio(h, mapping, folio, index);

	/*
	 * We must remove the folio from page cache before removing
	 * the region/ reserve map (hugetlb_unreserve_pages).  In
	 * rare out of memory conditions, removal of the region/reserve
	 * map could fail.  Correspondingly, the subpool and global
	 * reserve usage count can need to be adjusted.
	 */
	VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
	hugetlb_delete_from_page_cache(folio);
	ret = true;
	if (!truncate_op) {
		if (unlikely(hugetlb_unreserve_pages(inode, index,
							index + 1, 1)))
			hugetlb_fix_reserve_counts(inode);
	}

	folio_unlock(folio);
	return ret;
}

/*
 * remove_inode_hugepages handles two distinct cases: truncation and hole
 * punch.  There are subtle differences in operation for each case.
 *
 * truncation is indicated by end of range being LLONG_MAX
 *	In this case, we first scan the range and release found pages.
 *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
 *	maps and global counts.  Page faults can race with truncation.
 *	During faults, hugetlb_no_page() checks i_size before page allocation,
 *	and again after obtaining page table lock.  It will 'back out'
 *	allocations in the truncated range.
 * hole punch is indicated if end is not LLONG_MAX
 *	In the hole punch case we scan the range and release found pages.
 *	Only when releasing a page is the associated region/reserve map
 *	deleted.  The region/reserve map for ranges without associated
 *	pages are not modified.  Page faults can race with hole punch.
 *	This is indicated if we find a mapped page.
 * Note: If the passed end of range value is beyond the end of file, but
 * not LLONG_MAX this routine still performs a hole punch operation.
 */
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
				   loff_t lend)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t end = lend >> PAGE_SHIFT;
	struct folio_batch fbatch;
	pgoff_t next, index;
	int i, freed = 0;
	bool truncate_op = (lend == LLONG_MAX);

	folio_batch_init(&fbatch);
	next = lstart >> PAGE_SHIFT;
	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
			struct folio *folio = fbatch.folios[i];
			u32 hash = 0;

			index = folio->index >> huge_page_order(h);
			hash = hugetlb_fault_mutex_hash(mapping, index);
			mutex_lock(&hugetlb_fault_mutex_table[hash]);

			/*
			 * Remove folio that was part of folio_batch.
			 */
			if (remove_inode_single_folio(h, inode, mapping, folio,
							index, truncate_op))
				freed++;

			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	if (truncate_op)
		(void)hugetlb_unreserve_pages(inode,
				lstart >> huge_page_shift(h),
				LONG_MAX, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
{
	struct resv_map *resv_map;

	trace_hugetlbfs_evict_inode(inode);
	remove_inode_hugepages(inode, 0, LLONG_MAX);

	/*
	 * Get the resv_map from the address space embedded in the inode.
	 * This is the address space which points to any resv_map allocated
	 * at inode creation time.  If this is a device special inode,
	 * i_mapping may not point to the original address space.
	 */
	resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
	/* Only regular and link inodes have associated reserve maps */
	if (resv_map)
		resv_map_release(&resv_map->refs);
	clear_inode(inode);
}

static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	pgoff_t pgoff;
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);

	BUG_ON(offset & ~huge_page_mask(h));
	pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);
	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
				      ZAP_FLAG_DROP_MARKER);
	i_mmap_unlock_write(mapping);
	remove_inode_hugepages(inode, offset, LLONG_MAX);
}

static void hugetlbfs_zero_partial_page(struct hstate *h,
					struct address_space *mapping,
					loff_t start,
					loff_t end)
{
	pgoff_t idx = start >> huge_page_shift(h);
	struct folio *folio;

	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
	if (IS_ERR(folio))
		return;

	start = start & ~huge_page_mask(h);
	end = end & ~huge_page_mask(h);
	if (!end)
		end = huge_page_size(h);

	folio_zero_segment(folio, (size_t)start, (size_t)end);

	folio_unlock(folio);
	folio_put(folio);
}

static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	loff_t hpage_size = huge_page_size(h);
	loff_t hole_start, hole_end;

	/*
	 * hole_start and hole_end indicate the full pages within the hole.
	 */
	hole_start = round_up(offset, hpage_size);
	hole_end = round_down(offset + len, hpage_size);

	inode_lock(inode);

	/* protected by i_rwsem */
	if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
		inode_unlock(inode);
		return -EPERM;
	}

	i_mmap_lock_write(mapping);

	/* If range starts before first full page, zero partial page. */
	if (offset < hole_start)
		hugetlbfs_zero_partial_page(h, mapping,
				offset, min(offset + len, hole_start));

	/* Unmap users of full pages in the hole. */
	if (hole_end > hole_start) {
		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
			hugetlb_vmdelete_list(&mapping->i_mmap,
					      hole_start >> PAGE_SHIFT,
					      hole_end >> PAGE_SHIFT, 0);
	}

	/* If range extends beyond last full page, zero partial page. */
	if ((offset + len) > hole_end && (offset + len) > hole_start)
		hugetlbfs_zero_partial_page(h, mapping,
				hole_end, offset + len);

	i_mmap_unlock_write(mapping);

	/* Remove full pages from the file. */
	if (hole_end > hole_start)
		remove_inode_hugepages(inode, hole_start, hole_end);

	inode_unlock(inode);

	return 0;
}

static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = hugetlbfs_punch_hole(inode, offset, len);
		goto out_nolock;
	}

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;

	inode_lock(inode);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.
	 */
	vma_init(&pseudo_vma, mm);
	vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		/*
		 * This is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		struct folio *folio;
		unsigned long addr;

		cond_resched();

		/*
		 * fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/* mutex taken here, fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(mapping, index);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		folio = filemap_get_folio(mapping, index << huge_page_order(h));
		if (!IS_ERR(folio)) {
			folio_put(folio);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			continue;
		}

		/*
		 * Allocate folio without setting the avoid_reserve argument.
		 * There certainly are no reserves associated with the
		 * pseudo_vma.  However, there could be shared mappings with
		 * reserves for the file at the inode level.  If we fallocate
		 * folios in these areas, we need to consume the reserves
		 * to keep reservation accounting consistent.
		 */
		folio = alloc_hugetlb_folio(&pseudo_vma, addr, false);
		if (IS_ERR(folio)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(folio);
			goto out;
		}
		folio_zero_user(folio, addr);
		__folio_mark_uptodate(folio);
		error = hugetlb_add_to_page_cache(folio, mapping, index);
		if (unlikely(error)) {
			restore_reserve_on_error(h, &pseudo_vma, addr, folio);
			folio_put(folio);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		folio_set_hugetlb_migratable(folio);
		/*
		 * folio_unlock because locked by hugetlb_add_to_page_cache()
		 * folio_put() due to reference from alloc_hugetlb_folio()
		 */
		folio_unlock(folio);
		folio_put(folio);
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode_set_ctime_current(inode);
out:
	inode_unlock(inode);

out_nolock:
	trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
	return error;
}

static int hugetlbfs_setattr(struct mnt_idmap *idmap,
			     struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct hstate *h = hstate_inode(inode);
	int error;
	unsigned int ia_valid = attr->ia_valid;
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

	error = setattr_prepare(idmap, dentry, attr);
	if (error)
		return error;

	trace_hugetlbfs_setattr(inode, dentry, attr);

	if (ia_valid & ATTR_SIZE) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		if (newsize & ~huge_page_mask(h))
			return -EINVAL;
		/* protected by i_rwsem */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;
		hugetlb_vmtruncate(inode, newsize);
	}

	setattr_copy(idmap, inode, attr);
	mark_inode_dirty(inode);
	return 0;
}

static struct inode *hugetlbfs_get_root(struct super_block *sb,
					struct hugetlbfs_fs_context *ctx)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = get_next_ino();
		inode->i_mode = S_IFDIR | ctx->mode;
		inode->i_uid = ctx->uid;
		inode->i_gid = ctx->gid;
		simple_inode_init_ts(inode);
		inode->i_op = &hugetlbfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
		lockdep_annotate_inode_mutex_key(inode);
	}
	return inode;
}

/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under hugetlb's
 * i_mmap_rwsem.
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;

static struct inode *hugetlbfs_get_inode(struct super_block *sb,
					 struct mnt_idmap *idmap,
					 struct inode *dir,
					 umode_t mode, dev_t dev)
{
	struct inode *inode;
	struct resv_map *resv_map = NULL;

	/*
	 * Reserve maps are only needed for inodes that can have associated
	 * page allocations.
	 */
	if (S_ISREG(mode) || S_ISLNK(mode)) {
		resv_map = resv_map_alloc();
		if (!resv_map)
			return NULL;
	}

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

		inode->i_ino = get_next_ino();
		inode_init_owner(idmap, inode, dir, mode);
		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
				&hugetlbfs_i_mmap_rwsem_key);
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		simple_inode_init_ts(inode);
		inode->i_mapping->i_private_data = resv_map;
		info->seals = F_SEAL_SEAL;
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
			inc_nlink(inode);
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			inode_nohighmem(inode);
			break;
		}
		lockdep_annotate_inode_mutex_key(inode);
		trace_hugetlbfs_alloc_inode(inode, dir, mode);
	} else {
		if (resv_map)
			kref_put(&resv_map->refs, resv_map_release);
	}

	return inode;
}

/*
 * File creation. Allocate an inode, and we're done..
 */
static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
			   struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
	if (!inode)
		return -ENOSPC;
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}

static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
				      struct dentry *dentry, umode_t mode)
{
	int retval = hugetlbfs_mknod(idmap, dir, dentry,
				     mode | S_IFDIR, 0);
	if (!retval)
		inc_nlink(dir);
	return ERR_PTR(retval);
}

static int hugetlbfs_create(struct mnt_idmap *idmap,
			    struct inode *dir, struct dentry *dentry,
			    umode_t mode, bool excl)
{
	return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}

static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
			     struct inode *dir, struct file *file,
			     umode_t mode)
{
	struct inode *inode;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
	if (!inode)
		return -ENOSPC;
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	d_tmpfile(file, inode);
	return finish_open_simple(file, 0);
}

static int hugetlbfs_symlink(struct mnt_idmap *idmap,
			     struct inode *dir, struct dentry *dentry,
			     const char *symname)
{
	const umode_t mode = S_IFLNK|S_IRWXUGO;
	struct inode *inode;
	int error = -ENOSPC;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error) {
			d_instantiate(dentry, inode);
			dget(dentry);
		} else
			iput(inode);
	}
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));

	return error;
}

#ifdef CONFIG_MIGRATION
static int hugetlbfs_migrate_folio(struct address_space *mapping,
				struct folio *dst, struct folio *src,
				enum migrate_mode mode)
{
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, dst, src);
	if (rc)
		return rc;

	if (hugetlb_folio_subpool(src)) {
		hugetlb_set_folio_subpool(dst,
					hugetlb_folio_subpool(src));
		hugetlb_set_folio_subpool(src, NULL);
	}

	folio_migrate_flags(dst, src);

	return 0;
}
#else
#define hugetlbfs_migrate_folio NULL
#endif

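/*
 * Keep a hwpoisoned folio in the page cache: returning 0 reports success to
 * the memory-failure code without truncating anything.  Later faults and
 * reads then see the poisoned folio and handle it explicitly (see
 * adjust_range_hwpoison()) rather than silently finding a hole.
 */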
static int hugetlbfs_error_remove_folio(struct address_space *mapping,
				struct folio *folio)
{
	return 0;
}

/*
 * Display the mount options in /proc/mounts.
 */
static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
	struct hugepage_subpool *spool = sbinfo->spool;
	unsigned long hpage_size = huge_page_size(sbinfo->hstate);
	unsigned hpage_shift = huge_page_shift(sbinfo->hstate);
	char mod;

	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
		seq_printf(m, ",uid=%u",
			   from_kuid_munged(&init_user_ns, sbinfo->uid));
	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
		seq_printf(m, ",gid=%u",
			   from_kgid_munged(&init_user_ns, sbinfo->gid));
	if (sbinfo->mode != 0755)
		seq_printf(m, ",mode=%o", sbinfo->mode);
	if (sbinfo->max_inodes != -1)
		seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);

	hpage_size /= 1024;
	mod = 'K';
	if (hpage_size >= 1024) {
		hpage_size /= 1024;
		mod = 'M';
	}
	seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
	if (spool) {
		if (spool->max_hpages != -1)
			seq_printf(m, ",size=%llu",
				   (unsigned long long)spool->max_hpages << hpage_shift);
		if (spool->min_hpages != -1)
			seq_printf(m, ",min_size=%llu",
				   (unsigned long long)spool->min_hpages << hpage_shift);
	}
	return 0;
}

static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
	struct hstate *h = hstate_inode(d_inode(dentry));
	u64 id = huge_encode_dev(dentry->d_sb->s_dev);

	buf->f_fsid = u64_to_fsid(id);
	buf->f_type = HUGETLBFS_MAGIC;
	buf->f_bsize = huge_page_size(h);
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
		/* If no limits set, just report 0 or -1 for max/free/used
		 * blocks, like simple_statfs() */
		if (sbinfo->spool) {
			long free_pages;

			spin_lock_irq(&sbinfo->spool->lock);
			buf->f_blocks = sbinfo->spool->max_hpages;
			free_pages = sbinfo->spool->max_hpages
				- sbinfo->spool->used_hpages;
			buf->f_bavail = buf->f_bfree = free_pages;
			spin_unlock_irq(&sbinfo->spool->lock);
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;

		if (sbi->spool)
			hugepage_put_subpool(sbi->spool);

		kfree(sbi);
	}
}

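/*
 * Inode accounting: free_inodes is negative when no "nr_inodes" limit was
 * given at mount time, in which case no accounting is done.
 */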
static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		if (unlikely(!sbinfo->free_inodes)) {
			spin_unlock(&sbinfo->stat_lock);
			return 0;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}

	return 1;
}

static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}


static struct kmem_cache *hugetlbfs_inode_cachep;

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
	struct hugetlbfs_inode_info *p;

	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
	p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
		return NULL;
	}
	return &p->vfs_inode;
}

static void hugetlbfs_free_inode(struct inode *inode)
{
	trace_hugetlbfs_free_inode(inode);
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

static void hugetlbfs_destroy_inode(struct inode *inode)
{
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
}

static const struct address_space_operations hugetlbfs_aops = {
	.write_begin	= hugetlbfs_write_begin,
	.write_end	= hugetlbfs_write_end,
	.dirty_folio	= noop_dirty_folio,
	.migrate_folio  = hugetlbfs_migrate_folio,
	.error_remove_folio	= hugetlbfs_error_remove_folio,
};

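/*
 * Slab constructor: called when an inode object is first set up in the
 * cache, not on every allocation from it.
 */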
| 1224 | static void init_once(void *foo) | 
|---|
| 1225 | { | 
|---|
| 1226 | struct hugetlbfs_inode_info *ei = foo; | 
|---|
| 1227 |  | 
|---|
| 1228 | inode_init_once(&ei->vfs_inode); | 
|---|
| 1229 | } | 
|---|
| 1230 |  | 
|---|
| 1231 | static const struct file_operations hugetlbfs_file_operations = { | 
|---|
| 1232 | .read_iter		= hugetlbfs_read_iter, | 
|---|
| 1233 | .mmap			= hugetlbfs_file_mmap, | 
|---|
| 1234 | .fsync			= noop_fsync, | 
|---|
| 1235 | .get_unmapped_area	= hugetlb_get_unmapped_area, | 
|---|
| 1236 | .llseek			= default_llseek, | 
|---|
| 1237 | .fallocate		= hugetlbfs_fallocate, | 
|---|
| 1238 | .fop_flags		= FOP_HUGE_PAGES, | 
|---|
| 1239 | }; | 
|---|
| 1240 |  | 
|---|
| 1241 | static const struct inode_operations hugetlbfs_dir_inode_operations = { | 
|---|
| 1242 | .create		= hugetlbfs_create, | 
|---|
| 1243 | .lookup		= simple_lookup, | 
|---|
| 1244 | .link		= simple_link, | 
|---|
| 1245 | .unlink		= simple_unlink, | 
|---|
| 1246 | .symlink	= hugetlbfs_symlink, | 
|---|
| 1247 | .mkdir		= hugetlbfs_mkdir, | 
|---|
| 1248 | .rmdir		= simple_rmdir, | 
|---|
| 1249 | .mknod		= hugetlbfs_mknod, | 
|---|
| 1250 | .rename		= simple_rename, | 
|---|
| 1251 | .setattr	= hugetlbfs_setattr, | 
|---|
| 1252 | .tmpfile	= hugetlbfs_tmpfile, | 
|---|
| 1253 | }; | 
|---|
| 1254 |  | 
|---|
| 1255 | static const struct inode_operations hugetlbfs_inode_operations = { | 
|---|
| 1256 | .setattr	= hugetlbfs_setattr, | 
|---|
| 1257 | }; | 
|---|
| 1258 |  | 
|---|
| 1259 | static const struct super_operations hugetlbfs_ops = { | 
|---|
| 1260 | .alloc_inode    = hugetlbfs_alloc_inode, | 
|---|
| 1261 | .free_inode     = hugetlbfs_free_inode, | 
|---|
| 1262 | .destroy_inode  = hugetlbfs_destroy_inode, | 
|---|
| 1263 | .evict_inode	= hugetlbfs_evict_inode, | 
|---|
| 1264 | .statfs		= hugetlbfs_statfs, | 
|---|
| 1265 | .put_super	= hugetlbfs_put_super, | 
|---|
| 1266 | .show_options	= hugetlbfs_show_options, | 
|---|
| 1267 | }; | 
|---|
| 1268 |  | 
|---|
| 1269 | /* | 
|---|
| 1270 | * Convert size option passed from command line to number of huge pages | 
|---|
| 1271 | * in the pool specified by hstate.  Size option could be in bytes | 
|---|
| 1272 | * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). | 
|---|
| 1273 | */ | 
|---|
| 1274 | static long | 
|---|
| 1275 | hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, | 
|---|
| 1276 | enum hugetlbfs_size_type val_type) | 
|---|
| 1277 | { | 
|---|
| 1278 | if (val_type == NO_SIZE) | 
|---|
| 1279 | return -1; | 
|---|
| 1280 |  | 
|---|
| 1281 | if (val_type == SIZE_PERCENT) { | 
|---|
| 1282 | size_opt <<= huge_page_shift(h); | 
|---|
| 1283 | size_opt *= h->max_huge_pages; | 
|---|
| 1284 | do_div(size_opt, 100); | 
|---|
| 1285 | } | 
|---|
| 1286 |  | 
|---|
| 1287 | size_opt >>= huge_page_shift(h); | 
|---|
| 1288 | return size_opt; | 
|---|
| 1289 | } | 

/*
 * Parse one mount parameter.
 */
static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct hugetlbfs_fs_context *ctx = fc->fs_private;
	struct fs_parse_result result;
	struct hstate *h;
	char *rest;
	unsigned long ps;
	int opt;

	opt = fs_parse(fc, hugetlb_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_uid:
		ctx->uid = result.uid;
		return 0;

	case Opt_gid:
		ctx->gid = result.gid;
		return 0;

	case Opt_mode:
		ctx->mode = result.uint_32 & 01777U;
		return 0;

	case Opt_size:
		/* memparse() will accept a K/M/G without a digit */
		if (!param->string || !isdigit(param->string[0]))
			goto bad_val;
		ctx->max_size_opt = memparse(param->string, &rest);
		ctx->max_val_type = SIZE_STD;
		if (*rest == '%')
			ctx->max_val_type = SIZE_PERCENT;
		return 0;

	case Opt_nr_inodes:
		/* memparse() will accept a K/M/G without a digit */
		if (!param->string || !isdigit(param->string[0]))
			goto bad_val;
		ctx->nr_inodes = memparse(param->string, &rest);
		return 0;

	case Opt_pagesize:
		ps = memparse(param->string, &rest);
		h = size_to_hstate(ps);
		if (!h) {
			pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
			return -EINVAL;
		}
		ctx->hstate = h;
		return 0;

	case Opt_min_size:
		/* memparse() will accept a K/M/G without a digit */
		if (!param->string || !isdigit(param->string[0]))
			goto bad_val;
		ctx->min_size_opt = memparse(param->string, &rest);
		ctx->min_val_type = SIZE_STD;
		if (*rest == '%')
			ctx->min_val_type = SIZE_PERCENT;
		return 0;

	default:
		return -EINVAL;
	}

bad_val:
	return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
		       param->string, param->key);
}
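
/*
 * For illustration, a mount using the options parsed above might look like
 * (the mount point and values are arbitrary examples):
 *
 *   mount -t hugetlbfs -o pagesize=2M,size=1G,min_size=512M,nr_inodes=64,\
 *         uid=1000,gid=1000,mode=01777 none /mnt/huge
 *
 * "size" and "min_size" may also be given as a percentage of the huge page
 * pool, e.g. size=50%.
 */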

/*
 * Validate the parsed options.
 */
static int hugetlbfs_validate(struct fs_context *fc)
{
	struct hugetlbfs_fs_context *ctx = fc->fs_private;

	/*
	 * Use huge page pool size (in hstate) to convert the size
	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
	 */
	ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
						   ctx->max_size_opt,
						   ctx->max_val_type);
	ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
						   ctx->min_size_opt,
						   ctx->min_val_type);

	/*
	 * If max_size was specified, then min_size must be smaller
	 */
	if (ctx->max_val_type > NO_SIZE &&
	    ctx->min_hpages > ctx->max_hpages) {
		pr_err("Minimum size can not be greater than maximum size\n");
		return -EINVAL;
	}

	return 0;
}
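
/*
 * For example, "size=512M,min_size=1G" (256 vs. 512 huge pages with a 2 MB
 * page size) is rejected by the check above, while "min_size=1G" on its own
 * is accepted because max_val_type remains NO_SIZE.
 */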

static int
hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct hugetlbfs_fs_context *ctx = fc->fs_private;
	struct hugetlbfs_sb_info *sbinfo;

	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->hstate		= ctx->hstate;
	sbinfo->max_inodes	= ctx->nr_inodes;
	sbinfo->free_inodes	= ctx->nr_inodes;
	sbinfo->spool		= NULL;
	sbinfo->uid		= ctx->uid;
	sbinfo->gid		= ctx->gid;
	sbinfo->mode		= ctx->mode;

	/*
	 * Allocate and initialize subpool if maximum or minimum size is
	 * specified.  Any needed reservations (for minimum size) are taken
	 * when the subpool is created.
	 */
	if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
		sbinfo->spool = hugepage_new_subpool(ctx->hstate,
						     ctx->max_hpages,
						     ctx->min_hpages);
		if (!sbinfo->spool)
			goto out_free;
	}
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = huge_page_size(ctx->hstate);
	sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
	sb->s_magic = HUGETLBFS_MAGIC;
	sb->s_op = &hugetlbfs_ops;
	sb->s_d_flags = DCACHE_DONTCACHE;
	sb->s_time_gran = 1;

	/*
	 * Due to the special and limited functionality of hugetlbfs, it does
	 * not work well as a stacking filesystem.
	 */
	sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
	sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
	if (!sb->s_root)
		goto out_free;
	return 0;
out_free:
	kfree(sbinfo->spool);
	kfree(sbinfo);
	return -ENOMEM;
}

static int hugetlbfs_get_tree(struct fs_context *fc)
{
	int err = hugetlbfs_validate(fc);
	if (err)
		return err;
	return get_tree_nodev(fc, hugetlbfs_fill_super);
}

static void hugetlbfs_fs_context_free(struct fs_context *fc)
{
	kfree(fc->fs_private);
}

static const struct fs_context_operations hugetlbfs_fs_context_ops = {
	.free		= hugetlbfs_fs_context_free,
	.parse_param	= hugetlbfs_parse_param,
	.get_tree	= hugetlbfs_get_tree,
};

static int hugetlbfs_init_fs_context(struct fs_context *fc)
{
	struct hugetlbfs_fs_context *ctx;

	ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->max_hpages	= -1; /* No limit on size by default */
	ctx->nr_inodes	= -1; /* No limit on number of inodes by default */
	ctx->uid	= current_fsuid();
	ctx->gid	= current_fsgid();
	ctx->mode	= 0755;
	ctx->hstate	= &default_hstate;
	ctx->min_hpages	= -1; /* No default minimum size */
	ctx->max_val_type = NO_SIZE;
	ctx->min_val_type = NO_SIZE;
	fc->fs_private = ctx;
	fc->ops	= &hugetlbfs_fs_context_ops;
	return 0;
}

static struct file_system_type hugetlbfs_fs_type = {
	.name			= "hugetlbfs",
	.init_fs_context	= hugetlbfs_init_fs_context,
	.parameters		= hugetlb_fs_parameters,
	.kill_sb		= kill_litter_super,
	.fs_flags		= FS_ALLOW_IDMAP,
};

static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];

static int can_do_hugetlb_shm(void)
{
	kgid_t shm_group;
	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
}
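
/*
 * sysctl_hugetlb_shm_group is exposed as vm.hugetlb_shm_group, so an
 * administrator can, for instance, let an unprivileged group use SHM_HUGETLB
 * without CAP_IPC_LOCK via something like:
 *
 *   sysctl -w vm.hugetlb_shm_group=<gid>
 */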

static int get_hstate_idx(int page_size_log)
{
	struct hstate *h = hstate_sizelog(page_size_log);

	if (!h)
		return -1;
	return hstate_index(h);
}

/*
 * Note that size should be aligned to the proper hugepage size on the caller
 * side; otherwise hugetlb_reserve_pages() reserves one fewer hugepage than
 * intended.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
				vm_flags_t acctflag, int creat_flags,
				int page_size_log)
{
	struct inode *inode;
	struct vfsmount *mnt;
	int hstate_idx;
	struct file *file;

	hstate_idx = get_hstate_idx(page_size_log);
	if (hstate_idx < 0)
		return ERR_PTR(-ENODEV);

	mnt = hugetlbfs_vfsmount[hstate_idx];
	if (!mnt)
		return ERR_PTR(-ENOENT);

	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
		struct ucounts *ucounts = current_ucounts();

		if (user_shm_lock(size, ucounts)) {
			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
				current->comm, current->pid);
			user_shm_unlock(size, ucounts);
		}
		return ERR_PTR(-EPERM);
	}

	file = ERR_PTR(-ENOSPC);
	/* hugetlbfs_vfsmount[] mounts do not use idmapped mounts.  */
	inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
				    S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto out;
	if (creat_flags == HUGETLB_SHMFS_INODE)
		inode->i_flags |= S_PRIVATE;

	inode->i_size = size;
	clear_nlink(inode);

	if (hugetlb_reserve_pages(inode, 0,
			size >> huge_page_shift(hstate_inode(inode)), NULL,
			acctflag) < 0)
		file = ERR_PTR(-ENOMEM);
	else
		file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
					 &hugetlbfs_file_operations);
	if (!IS_ERR(file))
		return file;

	iput(inode);
out:
	return file;
}
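
/*
 * As the comment above hugetlb_file_setup() notes, callers are expected to
 * round the size up to the huge page size themselves.  The SysV shared
 * memory path, for instance, does roughly this (identifiers illustrative):
 *
 *   hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
 *   hugesize = ALIGN(size, huge_page_size(hs));
 *   file = hugetlb_file_setup(name, hugesize, acctflag, HUGETLB_SHMFS_INODE,
 *                             (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
 */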

static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
{
	struct fs_context *fc;
	struct vfsmount *mnt;

	fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
	if (IS_ERR(fc)) {
		mnt = ERR_CAST(fc);
	} else {
		struct hugetlbfs_fs_context *ctx = fc->fs_private;
		ctx->hstate = h;
		mnt = fc_mount_longterm(fc);
		put_fs_context(fc);
	}
	if (IS_ERR(mnt))
		pr_err("Cannot mount internal hugetlbfs for page size %luK",
		       huge_page_size(h) / SZ_1K);
	return mnt;
}

static int __init init_hugetlbfs_fs(void)
{
	struct vfsmount *mnt;
	struct hstate *h;
	int error;
	int i;

	if (!hugepages_supported()) {
		pr_info("disabling because there are no supported hugepage sizes\n");
		return -ENOTSUPP;
	}

	error = -ENOMEM;
	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, SLAB_ACCOUNT, init_once);
	if (hugetlbfs_inode_cachep == NULL)
		goto out;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out_free;

	/* default hstate mount is required */
	mnt = mount_one_hugetlbfs(&default_hstate);
	if (IS_ERR(mnt)) {
		error = PTR_ERR(mnt);
		goto out_unreg;
	}
	hugetlbfs_vfsmount[default_hstate_idx] = mnt;

	/* other hstates are optional */
	i = 0;
	for_each_hstate(h) {
		if (i == default_hstate_idx) {
			i++;
			continue;
		}

		mnt = mount_one_hugetlbfs(h);
		if (IS_ERR(mnt))
			hugetlbfs_vfsmount[i] = NULL;
		else
			hugetlbfs_vfsmount[i] = mnt;
		i++;
	}

	return 0;

out_unreg:
	(void)unregister_filesystem(&hugetlbfs_fs_type);
out_free:
	kmem_cache_destroy(hugetlbfs_inode_cachep);
out:
	return error;
}
fs_initcall(init_hugetlbfs_fs)