hugetlb_sysfs_init() adds a sysfs directory for every hstate:
ll /sys/kernel/mm/hugepages/hugepages-1048576kB
ll /sys/kernel/mm/hugepages/hugepages-2048kB
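Each of these directories exposes the per-hstate counters referenced below: nr_hugepages, nr_hugepages_mempolicy, nr_overcommit_hugepages, free_hugepages, resv_hugepages and surplus_hugepages.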
An hstate comes from one of three places:
(1) hugetlb_init()
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
for x86, HUGETLB_PAGE_ORDER is (PMD_SHIFT - PAGE_SHIFT), i.e. 2M huge pages
(2) setup_hugepagesz()
'hugepagesz=xxx' kernel command line
(3) gigantic_pages_init()
---
/* With compaction or CMA we can allocate gigantic pages at runtime */
if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT))
	hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
---
X86_FEATURE_GBPAGES shows up as the 'pdpe1gb' flag in lscpu.
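With that flag present you can size both pools at boot, e.g. 'default_hugepagesz=1G hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512' on the kernel command line.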
The default_hstate is the one whose page size is HPAGE_SIZE (arch-specific; 2M on x86).
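For reference, these notes keep coming back to a handful of struct hstate fields; an abbreviated sketch (the exact field set varies between kernel versions):
---
struct hstate {
	...
	unsigned int order;			/* huge page size = PAGE_SIZE << order */
	unsigned long mask;
	unsigned long max_huge_pages;		/* the 'persistent' pool size */
	unsigned long nr_huge_pages;		/* persistent + surplus */
	unsigned long free_huge_pages;
	unsigned long resv_huge_pages;
	unsigned long surplus_huge_pages;
	unsigned long nr_overcommit_huge_pages;
	struct list_head hugepage_activelist;
	struct list_head hugepage_freelists[MAX_NUMNODES];
	unsigned int free_huge_pages_node[MAX_NUMNODES];
	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
	...
	char name[HSTATE_NAME_LEN];
};
---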
At the same time, every hstate gets its own internal kernel mount, which other subsystems can use to allocate huge pages:
init_hugetlbfs_fs()
---
for_each_hstate(h) {
	char buf[50];
	unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

	snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
	hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, buf);
	i++;
}
---
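These internal mounts back hugetlb_file_setup(), i.e. mmap(MAP_HUGETLB), shmget(SHM_HUGETLB) and memfd_create(..., MFD_HUGETLB), as opposed to user-visible hugetlbfs mounts.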
When you mount a hugetlbfs instance, it needs to pick an hstate. By default that is
default_hstate (HPAGE_SIZE); you can select another one with the 'pagesize' mount option.
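A minimal userspace sketch of the mount option (the mount point /mnt/huge1g and the file name are made up; assumes the 1G hstate exists and its pool is populated):
---
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
	/* 'pagesize=1G' selects the 1G hstate instead of default_hstate */
	if (mount("none", "/mnt/huge1g", "hugetlbfs", 0, "pagesize=1G")) {
		perror("mount");
		return 1;
	}

	int fd = open("/mnt/huge1g/test", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	void *p = mmap(NULL, 1UL << 30, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* the first write faults the huge page in via hugetlb_fault() */
	*(char *)p = 1;

	munmap(p, 1UL << 30);
	close(fd);
	return 0;
}
---
Note that for MAP_SHARED hugetlbfs mappings the pages are normally reserved at mmap() time, so an empty pool tends to show up as mmap() failing with ENOMEM rather than a fault-time SIGBUS.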
alloc_pool_huge_page() grows the persistent pool; it is reached from:
<- hugetlb_hstate_alloc_pages() <- hugetlb_nrpages_setup()        ("hugepages=" kernel command line)
<- set_max_huge_pages() <- __nr_hugepages_store_common()
                             <- nr_hugepages / nr_hugepages_mempolicy writes under /sys/kernel/mm/hugepages/xxx/
                                (e.g. echo 16 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages)
                             <- hugetlb_sysctl_handler_common()   (vm.nr_hugepages sysctl)
alloc_pool_huge_page()
-> alloc_fresh_huge_page()
-> alloc_gigantic_page()/alloc_buddy_huge_page()
-> prep_new_huge_page()
-> set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-> put_page()
-> __put_compound_page()
-> dtor = get_compound_page_dtor(page);
-> (*dtor)(page)
free_huge_page()
-> enqueue_huge_page(h, page)
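enqueue_huge_page() itself is tiny; roughly (modulo kernel version):
---
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
	int nid = page_to_nid(page);

	list_move(&page->lru, &h->hugepage_freelists[nid]);
	h->free_huge_pages++;
	h->free_huge_pages_node[nid]++;
}
---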
See it? When you free a huge page it is not returned to the buddy system; it goes back
to the hstate, onto hstate.hugepage_freelists[].
/sys/kernel/mm/hugepages/xxx/nr_hugepages is the size of the 'persistent' pool.
'surplus' is the number of huge pages allocated above that persistent pool (bounded by nr_overcommit_hugepages).
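Roughly speaking: with nr_hugepages = 4 and nr_overcommit_hugepages = 4 on the 2M hstate, up to 8 huge pages can be handed out; the 4 beyond the persistent pool show up in surplus_hugepages and go back to the buddy allocator when freed, while the persistent ones return to the hstate free list.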
Let's look at the allocation and free paths.
hugetlb_fault()
-> hugetlb_no_page()
-> alloc_huge_page()
---
//dequeue a huge page from hstate.hugepage_freelists[]
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
	spin_unlock(&hugetlb_lock);
	//try a 'surplus' huge page from the buddy allocator
	page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
	if (!page)
		goto out_uncharge_cgroup;
	...
}
---
alloc_buddy_huge_page_with_mpol()
-> alloc_surplus_huge_page()
---
spin_lock(&hugetlb_lock);
//already at the overcommit limit?
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
	goto out_unlock;
spin_unlock(&hugetlb_lock);

page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
if (!page)
	return NULL;

spin_lock(&hugetlb_lock);
//recheck: we may have raced with a pool resize while the lock was dropped
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
	SetPageHugeTemporary(page);
	put_page(page);
	page = NULL;
} else {
	h->surplus_huge_pages++;
	h->surplus_huge_pages_node[page_to_nid(page)]++;
}
...
---
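SetPageHugeTemporary() marks the page so that, when it is released, free_huge_page() hands it straight back to the buddy allocator instead of accounting it as surplus; that is the first branch in the free path below.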
When freeing a huge page:
free_huge_page()
---
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
			     pages_per_huge_page(h), page);
if (restore_reserve)
	h->resv_huge_pages++;

if (PageHugeTemporary(page)) {
	//temporary pages go straight back to the buddy allocator
	list_del(&page->lru);
	ClearPageHugeTemporary(page);
	update_and_free_page(h, page);
} else if (h->surplus_huge_pages_node[nid]) {
	//shrink the surplus pool: free this one back to the buddy allocator
	list_del(&page->lru);
	update_and_free_page(h, page);
	h->surplus_huge_pages--;
	h->surplus_huge_pages_node[nid]--;
} else {
	//otherwise cache it on the hstate free list
	arch_clear_hugepage_flags(page);
	enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
---
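So on free, a huge page only goes back to the buddy allocator if it is temporary or the node still has surplus pages to shrink; otherwise it is parked on hstate.hugepage_freelists[] again.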
DAX mappings always try to use huge page mappings.
__transparent_hugepage_enabled()
---
/*
* For dax vmas, try to always use hugepage mappings. If the kernel does
* not support hugepages, fsdax mappings will fallback to PAGE_SIZE
* mappings, and device-dax namespaces, that try to guarantee a given
* mapping size, will fail to enable
*/
if (vma_is_dax(vma))
	return true;
---
And also
ext4_file_mmap()
---
file_accessed(file);
if (IS_DAX(file_inode(file))) {
	vma->vm_ops = &ext4_dax_vm_ops;
	vma->vm_flags |= VM_HUGEPAGE;
} else {
	vma->vm_ops = &ext4_file_vm_ops;
}
---
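VM_HUGEPAGE is the same vma flag madvise(MADV_HUGEPAGE) sets; ext4 sets it unconditionally for DAX inodes to mark the mapping as a huge page candidate.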
When a page fault occurs:
__handle_mm_fault()
---
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
...
vmf.pud = pud_alloc(mm, p4d, address);
...
retry_pud:
//fsdax: dax_iomap_fault() only supports PE_SIZE_PTE and PE_SIZE_PMD (2M on x86-64)
if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
	ret = create_huge_pud(&vmf);
	if (!(ret & VM_FAULT_FALLBACK))
		return ret;
}
...
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
...
if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
	ret = create_huge_pmd(&vmf);
	if (!(ret & VM_FAULT_FALLBACK))
		return ret;
}
...
return handle_pte_fault(&vmf);
---
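create_huge_pmd() is just a dispatcher; for a file-backed (non-anonymous) vma it ends up in the filesystem's huge_fault handler, roughly:
---
static vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_huge_pmd_anonymous_page(vmf);
	if (vmf->vma->vm_ops->huge_fault)
		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
	return VM_FAULT_FALLBACK;
}
---
For an ext4 DAX file, vm_ops is ext4_dax_vm_ops (set in ext4_file_mmap() above), so huge_fault is ext4_dax_huge_fault():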
ext4_dax_huge_fault()
-> dax_iomap_fault() //ext4_iomap_ops
-> dax_iomap_pmd_fault()
---
entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
...
pos = (loff_t)xas.xa_index << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
			 &srcmap);
/*
 * Note: iomap_begin() allocates the fs blocks here. After the fs has been
 * running for a while those blocks may no longer be contiguous across the
 * whole 2M range; in that case we bail out, return VM_FAULT_FALLBACK and
 * the fault is retried at PTE granularity.
 */
if (iomap.offset + iomap.length < pos + PMD_SIZE)
	goto finish_iomap;

sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);

switch (iomap.type) {
case IOMAP_MAPPED:
	/*
	 * For pmem DAX there is no page allocation: we map the pages of
	 * the nvdimm device directly.
	 */
	error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
	entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
				 DAX_PMD, write && !sync);
	...
	result = vmf_insert_pfn_pmd(vmf, pfn, write);
	break;
...
}
finish_iomap:
if (ops->iomap_end) {
	int copied = PMD_SIZE;
	...
	ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
		       &iomap);
}
unlock_entry:
dax_unlock_entry(&xas, entry);
---
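In short, a DAX PMD fault only succeeds when the filesystem hands back a mapped extent covering the whole PMD-sized, PMD-aligned range; anything less means VM_FAULT_FALLBACK and the fault is retried at PTE granularity.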