HugePages

hugetlb fs
HugePage and DAX

HugePage and DAX


For DAX vmas, the kernel always tries to use hugepage mappings.

__transparent_hugepage_enabled()
---
    /*
     * For dax vmas, try to always use hugepage mappings. If the kernel does
     * not support hugepages, fsdax mappings will fallback to PAGE_SIZE
     * mappings, and device-dax namespaces, that try to guarantee a given
     * mapping size, will fail to enable
     */
    if (vma_is_dax(vma))
        return true;
---
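
Here vma_is_dax() is just a check on the backing inode's DAX flag; roughly (include/linux/fs.h in kernels of this era):
---
static inline bool vma_is_dax(struct vm_area_struct *vma)
{
    return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
}
---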
The DAX vm_ops and the VM_HUGEPAGE hint are also installed at mmap() time:
ext4_file_mmap()
---
    file_accessed(file);
    if (IS_DAX(file_inode(file))) {
        vma->vm_ops = &ext4_dax_vm_ops;
        // hint the fault path that huge mappings are preferred
        vma->vm_flags |= VM_HUGEPAGE;
    } else {
        vma->vm_ops = &ext4_file_vm_ops;
    }
---
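
ext4_dax_vm_ops is what makes the huge fault reachable: it wires .huge_fault (called by create_huge_pmd()/create_huge_pud() below) to ext4_dax_huge_fault(). Roughly, from fs/ext4/file.c (kernel-version dependent):
---
static const struct vm_operations_struct ext4_dax_vm_ops = {
    .fault          = ext4_dax_fault,        // PTE-sized faults
    .huge_fault     = ext4_dax_huge_fault,   // PMD/PUD-sized faults
    .page_mkwrite   = ext4_dax_fault,
    .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
};
---
ext4_dax_fault() itself is only a PE_SIZE_PTE wrapper around ext4_dax_huge_fault().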

When a page fault occurs, __handle_mm_fault() tries to install a huge PUD mapping first, then a huge PMD mapping, and only falls back to a normal PTE if those fail:
__handle_mm_fault()
---
    pgd = pgd_offset(mm, address);
    p4d = p4d_alloc(mm, pgd, address);
    ...
    vmf.pud = pud_alloc(mm, p4d, address);
    ...
retry_pud:

    // fsdax: dax_iomap_fault() only supports PE_SIZE_PTE and PE_SIZE_PMD
    // (2MB on x86-64), so the PUD-sized fault below always falls back for fsdax.

    if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
        ret = create_huge_pud(&vmf);
        if (!(ret & VM_FAULT_FALLBACK))
            return ret;
    }
    ...
    vmf.pmd = pmd_alloc(mm, vmf.pud, address);
    ...
    if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
        ret = create_huge_pmd(&vmf);
        if (!(ret & VM_FAULT_FALLBACK))
            return ret;
    }
    ...
    return handle_pte_fault(&vmf);
---
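
create_huge_pud()/create_huge_pmd() end up in vma->vm_ops->huge_fault(), i.e. ext4_dax_huge_fault() for ext4 DAX files, which goes through dax_iomap_fault(). The dispatch there is why only PTE and PMD sizes work for fsdax; roughly, from fs/dax.c (kernel-version dependent):
---
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
            pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
    switch (pe_size) {
    case PE_SIZE_PTE:
        return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
    case PE_SIZE_PMD:
        return dax_iomap_pmd_fault(vmf, pfnp, ops);
    default:
        // PUD faults are not handled for fsdax, so __handle_mm_fault()
        // sees VM_FAULT_FALLBACK and retries with PMD, then PTE.
        return VM_FAULT_FALLBACK;
    }
}
---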

ext4_dax_huge_fault()
  -> dax_iomap_fault() //ext4_iomap_ops
    -> dax_iomap_pmd_fault()
---
    entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
    ...
    pos = (loff_t)xas.xa_index << PAGE_SHIFT;
    error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
            &srcmap);

    // NOTE: iomap_begin() may allocate fs blocks here, and after the fs has
    // been running for a while those blocks may not be contiguous. If the
    // extent covering pos is shorter than PMD_SIZE, bail out: the fault
    // returns VM_FAULT_FALLBACK and is retried as a PTE fault.
    if (iomap.offset + iomap.length < pos + PMD_SIZE)
        goto finish_iomap;

    sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);

    switch (iomap.type) {
    case IOMAP_MAPPED:

        // For PMEM DAX no pages need to be allocated here: the pfn points
        // directly at memory on the nvdimm device and is mapped as-is.

        error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);

        entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
                        DAX_PMD, write && !sync);
        ...
        result = vmf_insert_pfn_pmd(vmf, pfn, write);
        break;
        ...
    }

 finish_iomap:
    if (ops->iomap_end) {
        int copied = PMD_SIZE;
        ...
        ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
                &iomap);
    }
 unlock_entry:
    dax_unlock_entry(&xas, entry);
---
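
Putting it together: from user space a 2MB DAX mapping only materializes when the virtual address, the file offset and the on-disk extent are all PMD-aligned and at least PMD_SIZE long. A minimal sketch of how a program could set that up (the path /mnt/pmem/data and the dax mount are assumptions, error handling trimmed):
---
// Sketch: assumes an ext4 filesystem mounted with -o dax at /mnt/pmem.
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define PMD_SIZE (2UL << 20)

int main(void)
{
    int fd = open("/mnt/pmem/data", O_CREAT | O_RDWR, 0600);
    if (fd < 0) { perror("open"); return 1; }

    // Preallocate the whole range up front to improve the chance that
    // iomap_begin() finds one contiguous extent of at least PMD_SIZE,
    // instead of allocating fragmented blocks at fault time (which would
    // trigger the PTE fallback above).
    int err = posix_fallocate(fd, 0, PMD_SIZE);
    if (err) { fprintf(stderr, "fallocate: %s\n", strerror(err)); return 1; }

    // Offset 0 is PMD-aligned; the kernel also tries to hand out a
    // PMD-aligned virtual address for DAX files, but it is not guaranteed.
    char *p = mmap(NULL, PMD_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) { perror("mmap"); return 1; }

    memset(p, 0, PMD_SIZE);    // first write goes through the fault path above

    munmap(p, PMD_SIZE);
    close(fd);
    return 0;
}
---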