Address Space
Page Table
Reclaim
vm_area_struct describes the page fault policy behind a virtual address range.
It includes:
unsigned long vm_start; /* Our start address within vm_mm. */
unsigned long vm_end; /* The first byte after our end address within vm_mm. */
/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next, *vm_prev;
struct rb_node vm_rb;
vm_area_structs are linked into both a doubly-linked list and an rb-tree.
The rb-tree accelerates insertion and lookup;
the doubly-linked list is more convenient for merging and iterating...
Look at find_vma() and vma_merge().
pgprot_t vm_page_prot;
unsigned long vm_flags; /* Flags, see mm.h. */
...
struct list_head anon_vma_chain; /* Serialized by mmap_lock &
* page_table_lock */
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;
/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units */
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */
A vm_area_struct could be the stack, the brk heap, a range of a file, or just a
range of physical addresses without 'struct page' behind it (VM_PFNMAP).
In code, a vm_area_struct describes a special rule for the page fault handler.
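Each line of /proc/<pid>/maps corresponds to one vm_area_struct of that task, so the
quickest way to look at a task's vm_area_structs is to dump that file. A minimal
userspace sketch (assuming Linux with procfs mounted):
---
#include <stdio.h>

int main(void)
{
	/* Every line printed here is one vm_area_struct: start-end, permissions
	 * (~vm_flags), offset (~vm_pgoff) and the backing file (~vm_file), plus
	 * pseudo names like [stack] and [heap]. */
	FILE *f = fopen("/proc/self/maps", "r");
	char line[512];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
---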
arch/x86/mm/fault.c
do_user_addr_fault()
---
tsk = current;
mm = tsk->mm;
...
vma = find_vma(mm, address);
// There is no vm_area associated with the address, bad address !!!
if (unlikely(!vma)) {
bad_area(regs, hw_error_code, address);
return;
}
---
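Note that find_vma() returns the first vma with vm_end > address, which may lie
entirely above the faulting address; that is why the !vma check alone is not enough
and the vm_start <= address check in the continuation below is still needed. A toy
userspace sketch of the same contract, using a sorted array instead of the kernel's
rb-tree/list (toy_vma and toy_find_vma are made-up names):
---
#include <stdio.h>
#include <stddef.h>

/* Toy VMA: just the [vm_start, vm_end) range, kept sorted in an array
 * instead of the kernel's rb-tree + linked list. */
struct toy_vma {
	unsigned long vm_start;
	unsigned long vm_end;
};

/* Same contract as find_vma(): return the first VMA with vm_end > addr,
 * or NULL.  The caller must still check vm_start <= addr to know whether
 * the address really lies inside it. */
static struct toy_vma *toy_find_vma(struct toy_vma *vmas, size_t nr,
				    unsigned long addr)
{
	size_t lo = 0, hi = nr;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (vmas[mid].vm_end > addr)
			hi = mid;
		else
			lo = mid + 1;
	}
	return lo < nr ? &vmas[lo] : NULL;
}

int main(void)
{
	struct toy_vma vmas[] = { { 0x1000, 0x3000 }, { 0x5000, 0x6000 } };
	unsigned long addr = 0x4000;		/* falls into the hole */
	struct toy_vma *vma = toy_find_vma(vmas, 2, addr);

	if (!vma || vma->vm_start > addr)
		printf("bad area: no VMA contains %#lx\n", addr);
	return 0;
}
---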
arch/x86/mm/fault.c
do_user_addr_fault()
---
tsk = current;
mm = tsk->mm;
...
vma = find_vma(mm, address);
...
if (likely(vma->vm_start <= address))
goto good_area;
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area:
if (unlikely(access_error(hw_error_code, vma))) {
// Bad access permission !!!
bad_area_access_error(regs, hw_error_code, address, vma);
return;
}
---
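Seen from userspace, these two failure paths become the si_code of the delivered
SIGSEGV: bad_area() for a missing vma shows up as SEGV_MAPERR, while
bad_area_access_error() shows up as SEGV_ACCERR. A minimal sketch (printf in the
signal handler and siglongjmp are used purely for the demo):
---
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>

static sigjmp_buf env;

static void segv_handler(int sig, siginfo_t *info, void *uc)
{
	(void)sig; (void)uc;
	/* SEGV_MAPERR: no vm_area_struct behind the address  -> bad_area()
	 * SEGV_ACCERR: the vm_area_struct exists but access_error() failed */
	printf("SIGSEGV at %p, si_code=%s\n", info->si_addr,
	       info->si_code == SEGV_MAPERR ? "SEGV_MAPERR" : "SEGV_ACCERR");
	siglongjmp(env, 1);
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = segv_handler, .sa_flags = SA_SIGINFO };
	char *ro;

	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);

	if (!sigsetjmp(env, 1))
		*(volatile char *)0x1 = 0;	/* no VMA at all */

	ro = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (ro != MAP_FAILED && !sigsetjmp(env, 1))
		*(volatile char *)ro = 1;	/* write to a read-only VMA */

	return 0;
}
---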
handle_mm_fault()
---
// !!(vma->vm_flags & VM_HUGETLB);
// set in hugetlbfs_file_mmap()
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
---
__handle_mm_fault()
---
// vma_is_dax(vma) in __transparent_hugepage_enabled()
if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
// vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
...
return handle_pte_fault(&vmf);
---
handle_pte_fault()
---
if (unlikely(pmd_none(*vmf->pmd))) {
vmf->pte = NULL;
} else {
...
// pte here is a pointer, orig_pte is its content
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
vmf->orig_pte = *vmf->pte;
...
}
if (!vmf->pte) {
// return !vma->vm_ops;
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
else
return do_fault(vmf);
}
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
---
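The vma_is_anonymous() split above can be exercised directly from userspace: the
first touch of a MAP_PRIVATE|MAP_ANONYMOUS mapping ends up in do_anonymous_page(),
while the first touch of a file mapping goes through do_fault() and the file's
vm_ops->fault (filemap_fault() for most filesystems). A minimal sketch that maps
the running binary itself as the file-backed example:
---
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;

	/* vma_is_anonymous() == true: first touch ends up in do_anonymous_page() */
	char *anon = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* file-backed: first touch goes through do_fault() -> vm_ops->fault() */
	int fd = open("/proc/self/exe", O_RDONLY);
	char *file = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);

	if (anon == MAP_FAILED || fd < 0 || file == MAP_FAILED) {
		perror("setup");
		return 1;
	}

	anon[0] = 'x';				/* anonymous fault */
	printf("file starts with: %c%c%c\n",	/* file fault; /proc/self/exe is an
						 * ELF file, so this prints "ELF" */
	       file[1], file[2], file[3]);

	munmap(anon, len);
	munmap(file, len);
	close(fd);
	return 0;
}
---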
The reverse mapping is set up at fault time via
page_add_new_anon_rmap()
page_add_file_rmap()
so that reclaim can later find every page table entry mapping a page and unmap it.
shrink_page_list()
---
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page)) {
enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
bool was_swapbacked = PageSwapBacked(page);
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
if (!try_to_unmap(page, flags)) {
stat->nr_unmap_fail += nr_pages;
if (!was_swapbacked && PageSwapBacked(page))
stat->nr_lazyfree_fail += nr_pages;
goto activate_locked;
---
// check if a page is mapped in vma at an address
page_vma_mapped_walk()
To be continued
View of a Task
/---------------/    Hole    /-------------------/     Virtual addresses: what a task can see
  pud/pmd/pte                  pud/pmd/pte              Page tables: translate virtual addresses into physical ones
  vm_area_struct               vm_area_struct           Page fault policy behind the addresses
  address_space (what is it for an anonymous mapping?)  Maintains the pages and the method to get their data
When a task accesses a virtual address:
(1) The MMU looks up the TLB; on a miss, it walks the page table.
    If the corresponding page table entry is empty, a hardware
    exception is raised and the software page fault handler runs.
(2) The page fault handler finds and checks the vm_area_struct maintained in task_struct->mm
    and decides what to do next based on the information in the associated vm_area_struct.
(3) For a file mapping, it invokes the fault callback to get the page and its data.
(4) It installs the page into the page table (the sketch below makes the lazy population visible).
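mmap() itself only creates the vm_area_struct; the page table entries stay empty
until the first access goes through the fault path above. mincore() makes this lazy
population visible. A minimal sketch:
---
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 4 * page;
	unsigned char vec[4];
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	/* Right after mmap(): the vm_area_struct exists, page tables are still empty */
	mincore(p, len, vec);
	printf("resident after mmap : %d %d %d %d\n",
	       vec[0] & 1, vec[1] & 1, vec[2] & 1, vec[3] & 1);

	p[0] = 1;		/* fault in only the first page */

	mincore(p, len, vec);
	printf("resident after touch: %d %d %d %d\n",
	       vec[0] & 1, vec[1] & 1, vec[2] & 1, vec[3] & 1);

	munmap(p, len);
	return 0;
}
---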
invalidate_inode_pages2_range() is a good example to understand this. It mainly does the following things:
look up the pages associated with the address range maintained in the inode's address_space,
unmap them from the page tables, and delete them from the page cache.
unmap_mapping_pages()
---
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
---
unmap_mapping_range_tree()
-> unmap_mapping_range_vma()
-> zap_page_range_single()
-> unmap_single_vma()
-> unmap_page_range()
---
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
---
Only the page table is modified here. The page itself is then deleted from the
page cache in invalidate_complete_page2():
---
xa_lock_irqsave(&mapping->i_pages, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
xa_unlock_irqrestore(&mapping->i_pages, flags);
---
After this, the page table entries are zapped and the pages are deleted from the
page cache. When the task accesses the address again, the page fault will
rebuild the pages and page table entries based on the vm_area_struct.
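A rough userspace analogue of this zap-and-refault cycle is madvise(MADV_DONTNEED):
it zaps the page table entries of the range, and the next access takes the fault
path again and rebuilds them from the vm_area_struct. A minimal sketch on anonymous
memory (a file-backed page would instead be re-filled from the page cache or the file):
---
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	p[0] = 42;				/* fault in an anonymous page */
	madvise(p, 4096, MADV_DONTNEED);	/* zap the page table entry (and the anon page) */

	/* The next access refaults; anonymous memory has no backing store, so we
	 * get a fresh zero page, while a file-backed page would be re-read from
	 * the page cache / file instead. */
	printf("after MADV_DONTNEED: %d\n", p[0]);	/* prints 0 */

	munmap(p, 4096);
	return 0;
}
---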
Comment from https://www.kernel.org/doc/html/v4.18/vm/split_page_table_lock.html
Originally, mm->page_table_lock spinlock protected all page tables of the
mm_struct. But this approach leads to poor page fault scalability of
multi-threaded applications due high contention on the lock. To improve
scalability, split page table lock was introduced.
With split page table lock we have separate per-table lock to serialize
access to the table. At the moment we use split lock for PTE and PMD tables.
Access to higher level tables protected by mm->page_table_lock.
The lock helper interfaces in kernel
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl = pmd_lockptr(mm, pmd);
---
-> ptlock_ptr(pmd_to_page(pmd))
-> page->ptl
---
spin_lock(ptl);
return ptl;
}
/*
* No scalability reason to split PUD locks yet, but follow the same pattern
* as the PMD locks to make it easier if we decide to. The VM should not be
* considered ready to switch to split PUD locks yet; there may be places
* which need to be converted from page_table_lock.
*/
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
return &mm->page_table_lock;
}
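The scalability idea can be sketched in userspace, with pthread mutexes standing in
for the kernel spinlocks and a made-up toy_mm/toy_table model standing in for the
real page tables: one lock per leaf table, while the higher levels still fall back
to the single mm-wide lock, mirroring the shape of pmd_lock()/pud_lockptr() above.
This is only an analogy, not kernel code:
---
#include <pthread.h>

#define NR_TABLES 512

/* Toy model: one mm-wide lock plus one lock per (hypothetical) leaf table. */
struct toy_table {
	pthread_mutex_t ptl;			/* stands in for page->ptl */
	unsigned long entries[512];
};

struct toy_mm {
	pthread_mutex_t page_table_lock;	/* stands in for mm->page_table_lock */
	struct toy_table tables[NR_TABLES];
};

/* Split lock: serialize only on the table actually being touched ... */
static pthread_mutex_t *toy_pmd_lockptr(struct toy_mm *mm, int table_idx)
{
	return &mm->tables[table_idx].ptl;
}

/* ... while higher levels still use the single mm-wide lock, like pud_lockptr(). */
static pthread_mutex_t *toy_pud_lockptr(struct toy_mm *mm)
{
	return &mm->page_table_lock;
}

static void toy_set_entry(struct toy_mm *mm, int table_idx, int idx, unsigned long val)
{
	pthread_mutex_t *ptl = toy_pmd_lockptr(mm, table_idx);

	pthread_mutex_lock(ptl);	/* updates of different tables no longer contend */
	mm->tables[table_idx].entries[idx] = val;
	pthread_mutex_unlock(ptl);
}

int main(void)
{
	static struct toy_mm mm;
	int i;

	pthread_mutex_init(&mm.page_table_lock, NULL);
	for (i = 0; i < NR_TABLES; i++)
		pthread_mutex_init(&mm.tables[i].ptl, NULL);

	toy_set_entry(&mm, 0, 0, 0xdeadbeef);

	/* a higher-level update would take the mm-wide lock instead */
	pthread_mutex_lock(toy_pud_lockptr(&mm));
	pthread_mutex_unlock(toy_pud_lockptr(&mm));
	return 0;
}
---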
handle_mm_fault() works under mmap_read_lock();
do_mmap_pgoff() works under mmap_write_lock().
There are two kinds of page faults, minor and major, also called soft and hard.
They can be distinguished as follows.
If the page is already loaded in memory at the time the fault is generated,
but is not marked in the memory management unit as being loaded in memory,
then it is called a minor or soft page fault.
The operating system delays loading parts of the program from disk until
the program attempts to use them and the page fault is generated. If the
page is not loaded in memory at the time of the fault, then it is called
a major or hard page fault.
In code, it works as follows:
filemap_fault()
---
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
fpin = do_async_mmap_readahead(vmf, page);
} else if (!page) {
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
page = pagecache_get_page(mapping, offset,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
...
}
if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
goto out_retry;
---
mm_account_fault()
---
major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
if (major)
current->maj_flt++;
else
current->min_flt++;
---
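current->maj_flt and current->min_flt are what userspace sees as ru_majflt/ru_minflt
from getrusage() (and as maj_flt/min_flt in /proc/<pid>/stat). A quick sketch that
watches the counters while touching a fresh anonymous mapping:
---
#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>

static void show(const char *when)
{
	struct rusage ru;

	getrusage(RUSAGE_SELF, &ru);
	printf("%-12s minflt=%ld majflt=%ld\n", when, ru.ru_minflt, ru.ru_majflt);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 64 * page;
	size_t i;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	show("before");
	for (i = 0; i < len; i += page)
		p[i] = 1;		/* each first touch is a minor fault */
	show("after touch");		/* minflt grows by ~64; majflt would grow
					 * only if data had to come from disk */
	munmap(p, len);
	return 0;
}
---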
However, there is a confusing case here: readahead may have already allocated
the page cache page while its read IO is still ongoing. When we fault on such a
page, it is accounted as a minor fault, but we still have to wait for the read
IO to complete, which makes it behave much like a major fault.
             CPU0             CPU1             CPU2             CPU3
per-cpu      pvecs.lru_add    pvecs.lru_add    pvecs.lru_add    pvecs.lru_add

per-numa     pglist_data
             File/Anon Active/Inactive LRU lists
             +-------------+    +-------------+
             /   active    /    /  inactive   /
             +-------------+    +-------------+
The basic principle is to add pages to a local per-cpu queue first, and then
batch-add them to the global LRU list when the local count reaches a threshold.
filemap_fault()
grab_cache_page_write_begin()
-> pagecache_get_page()
-> add_to_page_cache_lru()
generic_file_buffered_read()
-> add_to_page_cache_lru()
-> lru_cache_add()
---
get_page(page);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
// the threshold is PAGEVEC_SIZE 15
if (!pagevec_add(pvec, page) || PageCompound(page))
__pagevec_lru_add(pvec);
local_unlock(&lru_pvecs.lock);
---
This per-cpu pagevec reduces the contention on pgdat->lru_lock,
which is a per-NUMA-node lock.
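The pattern itself is generic: fill a small local batch and take the shared lock
only once per batch. A userspace sketch of the same idea, with made-up toy_* names
and the batch size mirroring PAGEVEC_SIZE:
---
#include <pthread.h>
#include <string.h>

#define TOY_PAGEVEC_SIZE 15	/* mirrors PAGEVEC_SIZE */

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for pgdat->lru_lock */
static void *global_lru[1024];
static int global_nr;

struct toy_pagevec {
	int nr;
	void *pages[TOY_PAGEVEC_SIZE];
};

/* Drain the local batch into the global list under the big lock,
 * like __pagevec_lru_add(). */
static void toy_pagevec_drain(struct toy_pagevec *pvec)
{
	pthread_mutex_lock(&lru_lock);
	memcpy(&global_lru[global_nr], pvec->pages, pvec->nr * sizeof(void *));
	global_nr += pvec->nr;
	pthread_mutex_unlock(&lru_lock);
	pvec->nr = 0;
}

/* Add to the local batch first, like lru_cache_add() adding to the per-cpu
 * pvec; only touch the shared lock when the batch is full. */
static void toy_lru_cache_add(struct toy_pagevec *pvec, void *page)
{
	pvec->pages[pvec->nr++] = page;
	if (pvec->nr == TOY_PAGEVEC_SIZE)
		toy_pagevec_drain(pvec);
}

int main(void)
{
	struct toy_pagevec pvec = { 0 };
	int i;

	for (i = 0; i < 100; i++)
		toy_lru_cache_add(&pvec, (void *)(long)i);
	toy_pagevec_drain(&pvec);	/* flush the remainder */
	return 0;
}
---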
__pagevec_lru_add()
---
pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
---
// This is the infrastructure for moving pages around the LRU lists
pagevec_lru_move_fn()
---
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct pglist_data *pagepgdat = page_pgdat(page);
if (pagepgdat != pgdat) {
if (pgdat)
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
pgdat = pagepgdat;
//The pgdat represents a whole numa node !!!!
//So the lock here is very big
spin_lock_irqsave(&pgdat->lru_lock, flags);
}
// per-memcg lruvec
lruvec = mem_cgroup_page_lruvec(page, pgdat);
(*move_fn)(page, lruvec, arg);
}
if (pgdat)
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
// The reference was taken when the page was added to the per-cpu pagevec
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
---
__pagevec_lru_add_fn()
-> SetPageLRU()
// Goes to the active list only when PageActive is set
// Is a newly added page deemed active?
// Maybe not; look into mark_page_accessed()
-> lru = page_lru(page)
-> add_page_to_lru_list()
There are some other special per-cpu pagevecs.
SetPageActive() <- __activate_page() <- activate_page()        \  mark_page_accessed() [1]
                <- __lru_cache_activate_page()                  /
                <- shrink_page_list() [2]
[1] mark_page_accessed()
If the page has already been marked PageReferenced, activate it!
A page in the page cache, whether file-backed or anonymous, must be on an LRU list
or on its way there, namely in lru_pvecs.lru_add; refer to lru_cache_add().
[2] shrink_page_list()
There are many conditions that could cause shrink_page_list() to activate a page.
An interesting point is that when a file-backed page is added to the LRU for the
first time, it should be added to the inactive LRU list.
Indeed, when a page is added to the LRU initially, it goes to the inactive list.
Look at
page_lru()
---
if (PageUnevictable(page))
return LRU_UNEVICTABLE;
lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
if (PageActive(page))
lru += LRU_ACTIVE;
return lru;
---
shrink_active_list()
---
while (!list_empty(&l_hold)) {
...
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
nr_rotated += thp_nr_pages(page);
list_add(&page->lru, &l_active);
continue;
}
}
ClearPageActive(page); /* we are de-activating */
SetPageWorkingset(page);
list_add(&page->lru, &l_inactive);
}
---
end_page_writeback()
-> ClearPageReclaim(page);
-> rotate_reclaimable_page()
Writeback is about to end against a page which has been marked for immediate
reclaim. If it still appears to be reclaimable, move it to the tail of the
inactive list.
shrink_page_list()
---
if (PageDirty(page)) {
/*
* Only kswapd can writeback filesystem pages
* to avoid risk of stack overflow. But avoid
* injecting inefficient single-page IO into
* flusher writeback as much as possible[1]: only
* write pages when we've encountered many
* dirty pages, and when we've already scanned
* the rest of the LRU for clean pages and see
* the same dirty pages again (PageReclaim).
*/
if (page_is_file_lru(page) &&
(!current_is_kswapd() || !PageReclaim(page) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
SetPageReclaim(page);
goto activate_locked;
}
...
switch (pageout(page, mapping)) {
...
}
---
[1] writeback_single_inode() employs do_writepages()
We can take two points from here:
(a) the direct reclaim path cannot write dirty pages out
(b) kswapd can do that, but it must comply with a strict condition.
Yes !!!
shrink_page_list()
---
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested which affects wait_iff_congested. kswapd
* will stall and start writing pages if the tail of the LRU
* is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
stat->nr_dirty++;
if (dirty && !writeback)
stat->nr_unqueued_dirty++;
---
This would trigger two things:
(1) kick the writeback flusher threads
shrink_inactive_list()
---
/*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
* happen when memory pressure pushes dirty pages to the end of
* the LRU before the dirty limits are reached and the dirty
* data has expired. It can also happen when the proportion of
* dirty pages grows not through writes but through memory
* pressure reclaiming all the clean cache. And in some cases,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
*/
if (stat.nr_unqueued_dirty == nr_taken)
wakeup_flusher_threads(WB_REASON_VMSCAN);
---
(2) allow kswapd to start writing dirty pages out
shrink_node()
---
/* Allow kswapd to start writing pages during reclaim.*/
if (sc->nr.unqueued_dirty == sc->nr.file_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
---
As for dirty page throttling, we would think of balance_dirty_pages().
But it only works for global reclaim or cgroup v2; the legacy memcg (cgroup v1)
doesn't have it.
shrink_page_list()
---
if (PageWriteback(page)) {
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
stat->nr_immediate++;
goto activate_locked;
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!PageReclaim(page) || !may_enter_fs) {
SetPageReclaim(page);
stat->nr_writeback++;
goto activate_locked;
/* Case 3 above */
} else {
unlock_page(page);
wait_on_page_writeback(page);
/* then go back and try same page again */
list_add_tail(&page->lru, page_list);
continue;
}
}
---
Look into writeback_throttling_sane():
---
/*
* The normal page dirty throttling mechanism in balance_dirty_pages() is
* completely broken with the legacy memcg and direct stalling in
^^^^^^^^^^^^^^^^^^
* shrink_page_list() is used for throttling instead, which lacks all the
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* niceties such as fairness, adaptive pausing, bandwidth proportional
* allocation and configurability.
*/
if (!cgroup_reclaim(sc))
return true;
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return true;
return false;
---
In shrink_page_list(), a direct reclaim path under cgroup v1 will go to sleep
to wait for the page under Writeback when it meets a recycled PageReclaim page.