readahead
writeback
BH
dcache
pagecache
fs misc
(Quote from the comment in mm/readahead.c)
The fields in struct file_ra_state represent the most-recently readahead
attempt.
|<----- async_size ---------|
|------------------- size -------------------->|
|==================#===========================|
^start ^page marked with PG_readahead
To overlap application thinking time and disk I/O time, we do `readahead pipelining':
Do not wait until the application consumed all readahead pages and stalled on the
missing page at readahead_index; Instead, submit an asynchronous readahead I/O as
soon as there are only async_size pages left in the readahead window.
Normally async_size will be equal to size, for maximum pipelining.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ondemand_readahead
---
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size; //for maximum pipelining
goto readit;
}
---
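As a side note, here is a tiny userspace model of how the window above advances
and grows. It only mirrors my reading of get_next_ra_size() in mm/readahead.c
(grow fast while small, then double, capped at max); the struct and numbers are
simplified, not the kernel's.
---
#include <stdio.h>

/* simplified stand-in for struct file_ra_state */
struct ra_state { unsigned long start, size, async_size; };

static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	if (cur < max / 16)
		return 4 * cur;		/* small window: grow aggressively */
	if (cur <= max / 2)
		return 2 * cur;		/* otherwise double ... */
	return max;			/* ... until capped at max */
}

int main(void)
{
	unsigned long max = 128;	/* 128 pages = 512KB, a common default */
	struct ra_state ra = { .start = 0, .size = 4, .async_size = 4 };
	int i;

	for (i = 0; i < 6; i++) {
		printf("start=%lu size=%lu async_size=%lu PG_readahead at %lu\n",
		       ra.start, ra.size, ra.async_size,
		       ra.start + ra.size - ra.async_size);
		/* what ondemand_readahead() does when the marked page is hit */
		ra.start += ra.size;
		ra.size = next_ra_size(ra.size, max);
		ra.async_size = ra.size;	/* for maximum pipelining */
	}
	return 0;
}
---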
ra_submit
---
return __do_page_cache_readahead(mapping, filp,
ra->start, ra->size, ra->async_size);
---
__do_page_cache_readahead
---
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
if (page_offset > end_index)
break;
page = xa_load(&mapping->i_pages, page_offset);
...
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
nr_pages++;
}
if (nr_pages)
read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
---
generic_file_buffered_read
---
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
---
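By the way, userspace can also kick this readahead machinery explicitly;
posix_fadvise(POSIX_FADV_WILLNEED) and the Linux readahead(2) syscall both end
up in the page cache readahead code. A minimal sketch (the file path is only an
example):
---
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/var/log/syslog", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* hint: we will need the first 1MB soon -> start async readahead now */
	posix_fadvise(fd, 0, 1 << 20, POSIX_FADV_WILLNEED);
	/* or the Linux-specific syscall, same effect for the given range */
	if (readahead(fd, 0, 1 << 20) < 0)
		perror("readahead");
	close(fd);
	return 0;
}
---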
When is the REQ_RAHEAD flag set ?
In the block layer, REQ_RAHEAD is only a hint, not that important.
The aops.readpages callback is only invoked from read_pages.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ext4_readpages
-> ext4_mpage_readpages //is_readahead is true
---
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
fscrypt_release_ctx(ctx);
goto set_error_page;
}
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio->bi_private = ctx;
bio_set_op_attrs(bio, REQ_OP_READ,
is_readahead ? REQ_RAHEAD : 0);
---
blkdev_readpages
-> mpage_readpages
---
struct mpage_readpage_args args = {
.get_block = get_block,
.is_readahead = true,
};
unsigned page_idx;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = lru_to_page(pages);
prefetchw(&page->flags);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping,
page->index,
readahead_gfp_mask(mapping))) {
args.page = page;
args.nr_pages = nr_pages - page_idx;
args.bio = do_mpage_readpage(&args);
}
put_page(page);
}
BUG_ON(!list_empty(pages));
if (args.bio)
mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
---
All of the common filesystems should provide this aops.readpages callback.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
sb_breadahead
-> __breadahead(sb->s_bdev, block, sb->s_blocksize);
---
struct buffer_head *bh = __getblk(bdev, block, size);
if (likely(bh)) {
ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
brelse(bh);
}
---
sb_breadahead is used by ext4 to read ahead metadata:
__ext4_get_inode_loc
---
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
__u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
table = ext4_inode_table(sb, gdp);
/* s_inode_readahead_blks is always a power of 2 */
b = block & ~((ext4_fsblk_t) ra_blks - 1);
if (table > b)
b = table;
end = b + ra_blks;
num = EXT4_INODES_PER_GROUP(sb);
if (ext4_has_group_desc_csum(sb))
num -= ext4_itable_unused_count(sb, gdp);
table += num / inodes_per_block;
if (end > table)
end = table;
while (b <= end)
sb_breadahead(sb, b++);
}
---
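The b = block & ~((ext4_fsblk_t) ra_blks - 1) above is the usual power-of-two
round-down trick. A small check of the arithmetic (assuming the default
s_inode_readahead_blks of 32; the block number is made up):
---
#include <stdio.h>

int main(void)
{
	unsigned long long ra_blks = 32;	/* must be a power of 2 */
	unsigned long long block = 1234567;	/* block holding the inode */
	unsigned long long b = block & ~(ra_blks - 1);

	/* readahead window aligned to ra_blks, containing 'block' */
	printf("block %llu -> window [%llu, %llu)\n", block, b, b + ra_blks);
	return 0;
}
---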
blk_init_request_from_bio
---
if (bio->bi_opf & REQ_RAHEAD)
req->cmd_flags |= REQ_FAILFAST_MASK;
---
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
check_should_bypass
---
/*
* Flag for bypass if the IO is for read-ahead or background,
* unless the read-ahead request is for metadata
* (eg, for gfs2 or xfs).
*/
if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
!(bio->bi_opf & (REQ_META|REQ_PRIO)))
goto skip;
---
dm_make_request
---
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
dm_put_live_table(md, srcu_idx);
if (!(bio->bi_opf & REQ_RAHEAD))
queue_io(md, bio);
else
bio_io_error(bio);
return ret;
}
---
raid5_make_request
---
sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_opf & REQ_RAHEAD), 0);
^^^^^^^^^^^^^^^^^^^^^^^^^
is nonblock
---
Since fs metadata goes through the page cache of the block device, could it
use this readahead as well ?
Let's take some examples from ext4.
As we will see, fs metadata doesn't use the generic readahead directly; the fs
can implement its own readahead instead.
__ext4_get_inode_loc
---
/*
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
// sb_getblk just sets up the page in the pagecache and its bhs
bh = sb_getblk(sb, block);
...
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
...
make_io:
...
/*
* There are other valid inodes in the buffer, this inode
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
trace_ext4_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
...
}
---
ext4_read_inode_bitmap
---
desc = ext4_get_group_desc(sb, block_group, NULL);
...
bitmap_blk = ext4_inode_bitmap(sb, desc);
...
bh = sb_getblk(sb, bitmap_blk);
...
if (bitmap_uptodate(bh))
goto verify;
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
goto verify;
}
...
/*
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
...
---
ext4_readdir
---
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
...
cond_resched();
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
pgoff_t index = map.m_pblk >>
(PAGE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
&file->f_ra, file,
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
}
...
}
---
There are two points here,
1. ext4_bread
ext4_bread
---
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
if (!bh || buffer_uptodate(bh))
return bh;
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
---
2. there is indeed readahead here
page_cache_sync_readahead is invoked here and its parameters are very
interesting:
page_cache_sync_readahead(
	sb->s_bdev->bd_inode->i_mapping,
	&file->f_ra, file,
	index, 1);
the address_space belongs to the block device, while the file_ra_state belongs
to the directory file.
ext4_create
-> ext4_add_nondir
-> ext4_add_entry
---
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
bh = ext4_read_dirblock(dir, block, DIRENT);
-> __ext4_read_dirblock
-> ext4_bread
retval = add_dirent_to_buf(handle, &fname, dir, inode,
NULL, bh);
if (retval != -ENOSPC)
goto out;
if (blocks == 1 && !dx_fallback &&
ext4_has_feature_dir_index(sb)) {
retval = make_indexed_dir(handle, &fname, dir,
inode, bh);
bh = NULL; /* make_indexed_dir releases bh */
goto out;
}
brelse(bh);
}
bh = ext4_append(handle, dir, &block);
---
__ext4_find_entry
---
do {
/*
* We deal with the read-ahead logic here.
*/
if (ra_ptr >= ra_max) {
/* Refill the readahead buffer */
ra_ptr = 0;
if (block < start)
ra_max = start - block;
else
ra_max = nblocks - block;
ra_max = min(ra_max, ARRAY_SIZE(bh_use));
retval = ext4_bread_batch(dir, block, ra_max,
false /* wait */, bh_use);
...
}
if ((bh = bh_use[ra_ptr++]) == NULL)
goto next;
wait_on_buffer(bh);
...
set_buffer_verified(bh);
i = search_dirblock(bh, dir, fname,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
...
next:
if (++block >= nblocks)
block = 0;
} while (block != start);
---
ext4_bread_batch
---
for (i = 0; i < bh_count; i++)
bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
if (bhs[i] && !buffer_uptodate(bhs[i]))
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
&bhs[i]);
if (!wait)
return 0;
for (i = 0; i < bh_count; i++)
if (bhs[i])
wait_on_buffer(bhs[i]);
---
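The pattern above (fire all the reads, then wait) can be mimicked from
userspace with posix_fadvise(POSIX_FADV_WILLNEED) followed by the blocking
reads; just a sketch, the path and block count are examples:
---
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int i, fd = open("/var/log/syslog", O_RDONLY);

	if (fd < 0)
		return 1;
	/* step 1: fire the readahead for all "blocks" without waiting */
	for (i = 0; i < 8; i++)
		posix_fadvise(fd, (off_t)i * 4096, 4096, POSIX_FADV_WILLNEED);
	/* step 2: consume them; ideally these now hit the page cache */
	for (i = 0; i < 8; i++)
		if (pread(fd, buf, sizeof(buf), (off_t)i * 4096) < 0)
			perror("pread");
	close(fd);
	return 0;
}
---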
Look at the comment of balance_dirty_pages
balance_dirty_pages() must be called by processes which are generating dirty
data. It looks at the number of dirty pages in the machine and will force
^^^^^^^^^^
the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If we're over `background_thresh' then the writeback threads are woken to
perform some writeout.
In the normal case, writeback can avoid blocking the application, but what
about the case of continuous write IO ?
^^^^^^^^^^^^^^^^^^^^
pagecache
write +-------+
application -------> | Dirty |
+-------+
| Dirty |
+-------+
| Dirty | __________
+-------+ writeback / /|
| Dirty | -----------> /_________/ |
+-------+ | | /
|_________|/
With the dirty balancing mechanism here, the bandwidth of the application's
writes is actually limited to the disk bandwidth. So regarding latency, the
writeback here is not so helpful.
But combined with delayed allocation, writeback can avoid fragmentation.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Anyway, let's look at how it works.
generic_perform_write
---
do {
...
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
...
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status < 0))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
iov_iter_advance(i, copied);
...
pos += copied;
written += copied;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
---
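balance_dirty_pages_ratelimited() throttles against thresholds derived from the
vm sysctls. A quick way to inspect those knobs from userspace (just a sketch,
minimal error handling):
---
#include <stdio.h>

static void show(const char *name)
{
	char path[128], val[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/vm/%s", name);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(val, sizeof(val), f))
		printf("%-26s %s", name, val);
	fclose(f);
}

int main(void)
{
	show("dirty_background_ratio");
	show("dirty_ratio");
	show("dirty_expire_centisecs");
	show("dirty_writeback_centisecs");
	return 0;
}
---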
queue_io is used to move the expired dirty inodes from wb->b_dirty or
wb->b_dirty_time to wb->b_io.
move_expired_inodes
---
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
else if (!work->for_sync) {
expire_time = jiffies - (dirtytime_expire_interval * HZ);
older_than_this = &expire_time;
}
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
// If the dirty time is before older_than_this, it will be moved.
^^^^^^^^^^^^^^^^^^^^^^
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_io_list, &tmp);
...
}
---
So how is older_than_this set ?
The EXPIRE_DIRTY_ATIME case below is only used for b_dirty_time:
queue_io
---
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
EXPIRE_DIRTY_ATIME, work);
---
In this case,
move_expired_inodes
---
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
else if (!work->for_sync) {
expire_time = jiffies - (dirtytime_expire_interval * HZ);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
older_than_this = &expire_time;
}
---
Only the inodes that have been dirty for more than dirtytime_expire_interval
seconds (12 hours by default) are moved.
Get more from the comment of dirtytime_expire_interval.
wb_writeback
---
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
^^^^^^^^^^^^^^^^^^^^^^^^^^
30 seconds
}
---
The periodic (kupdate) writeback will write back inodes that have been dirty
for more than 30s.
wb_writeback
---
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
oldest_jif = jiffies;
^^^^^^^^^^^^^^^^^^^^^
Now !
---
The interesting question is when a background writeback is started.
balance_dirty_pages
-> wb_start_background_writeback
wb_workfn
-> wb_do_writeback
-> wb_check_background_flush
---
if (wb_over_bg_thresh(wb)) {
^^^^^^^^^^^^^^^^^^^^^^
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.for_background = 1,
.range_cyclic = 1,
.reason = WB_REASON_BACKGROUND,
};
return wb_writeback(wb, &work);
}
---
For all the other types of writeback (vmscan, sync ...), the dirty time is not
checked; the inodes are just written out directly.
mark_buffer_dirty
---
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
struct address_space *mapping = NULL;
lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
mapping = page_mapping(page);
if (mapping)
__set_page_dirty(page, mapping, 0);
^^^^^^^^^^^^^^^^
// set PAGECACHE_TAG_DIRTY on the page in the pagecache tree
}
unlock_page_memcg(page);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
^^^^^^^^^^^^^^^^^^
// hand the inode to writeback core
}
---
regular data
ext4_write_end
-> block_write_end
-> __block_commit_write
-> mark_buffer_dirty
metadata
__jbd2_journal_refile_buffer
-> __jbd2_journal_unfile_buffer
-> __jbd2_journal_temp_unlink_buffer
-> mark_buffer_dirty
__iomap_write_end
-> iomap_set_page_dirty
---
lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
if (newly_dirty)
__set_page_dirty(page, mapping, 0);
unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
---
The bdi is per-queue instead of per-fs.
#STEP 0
blk_alloc_queue_node
---
q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
if (!q->backing_dev_info)
goto fail_split;
q->stats = blk_alloc_queue_stats();
if (!q->stats)
goto fail_stats;
q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->backing_dev_info->name = "block";
q->node = node_id;
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
laptop_mode_timer_fn, 0);
---
#STEP 1
__blkdev_get
---
if (!bdev->bd_openers) {
first_open = true;
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
bdev->bd_partno = partno;
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
if (!bdev->bd_part)
goto out_clear;
ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
...
}
if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
set_init_blocksize(bdev);
}
...
} else {
struct block_device *whole;
whole = bdget_disk(disk, 0);
ret = -ENOMEM;
if (!whole)
goto out_clear;
BUG_ON(for_part);
ret = __blkdev_get(whole, mode, 1);
if (ret)
goto out_clear;
bdev->bd_contains = whole;
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
ret = -ENXIO;
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
set_init_blocksize(bdev);
}
if (bdev->bd_bdi == &noop_backing_dev_info)
bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
---
#STEP 2
mount_bdev
---
s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
bdev);
---
static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
return 0;
}
#STEP 3
balance_dirty_pages_ratelimited
-> inode_to_bdi
---
sb = inode->i_sb;
#ifdef CONFIG_BLOCK
if (sb_is_blkdev_sb(sb))
return I_BDEV(inode)->bd_bdi;
#endif
return sb->s_bdi;
---
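Since the bdi hangs off the request_queue, its knobs show up per device; for
example the readahead window set in STEP 0 (bdi->ra_pages) is visible as
read_ahead_kb in sysfs. A sketch; "sda" is only an example device name:
---
#include <stdio.h>

int main(void)
{
	char val[64];
	FILE *f = fopen("/sys/block/sda/queue/read_ahead_kb", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(val, sizeof(val), f))
		printf("read_ahead_kb = %s", val);	/* bdi->ra_pages in KB */
	fclose(f);
	return 0;
}
---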
wb_do_writeback
The main difference between these writeback methods is how many pages can be
written out.
For example,
__writeback_inodes_sb_nr // input wb_writeback_work.nr_pages
OR
sync_inodes_sb // wb_writeback_work.nr_pages is LONG_MAX
// for_sync = 1
-> bdi_split_work_to_wbs
-> wb_queue_work
Both of the above push writeback for a whole sb (one fs).
Some filesystem interfaces will invoke them with WB_REASON_FS_FREE_SPACE.
laptop_mode_timer_fn
-> wakeup_flusher_threads_bdi
-> __wakeup_flusher_threads_bdi
-> wb_start_writeback
-> set WB_start_all
-> wb_wakeup
SYSCALL sync / pm_suspend->enter_state
-> ksys_sync
OR
shrink_inactive_list // WB_REASON_VMSCAN
-> wakeup_flusher_threads //iterate bdi_list
-> __wakeup_flusher_threads_bdi //iterate all the dirty wb
-> wb_start_writeback
-> set WB_start_all
-> wb_wakeup
These start flushing on all bdis.
__mark_inode_dirty
// If this is the first dirty inode for this bdi,
// we have to wake-up the corresponding bdi thread
// to make sure background write-back happens
// later.
-> wb_wakeup_delayed
wb_workfn
---
if (!list_empty(&wb->work_list))
wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);
---
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
5s
Actually, there is no explicit trigger for background writeback.
A wb_wakeup is enough; wb_do_writeback will then check the threshold itself.
wb_do_writeback
-> wb_check_background_flush
---
if (wb_over_bg_thresh(wb)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.for_background = 1,
.range_cyclic = 1,
.reason = WB_REASON_BACKGROUND,
};
return wb_writeback(wb, &work);
}
---
balance_dirty_pages will try to trigger background writeback.
balance_dirty_pages
-> wb_start_background_writeback
-> wb_wakeup
It will try to write out the dirtied inodes that have expired.
wb_writeback
---
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
...
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
}
...
if (list_empty(&wb->b_io))
queue_io(wb, work);
-> move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
---
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
...
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_io_list, &tmp);
moved++;
...
}
---
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
30s
So the dirty expire time is 30s
For background writeback, wb_writeback updates oldest_jif every time, so all
of the dirtied inodes can be moved to wb->b_io.
The key point here is
wb_writeback
---
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
if (work->for_background && !wb_over_bg_thresh(wb))
break;
---
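Besides the background/periodic paths above, writeback can be kicked explicitly
from userspace for a file range; sync_file_range(..., SYNC_FILE_RANGE_WRITE)
starts writeback without waiting for completion. A hedged sketch (path and
length are examples):
---
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/testfile", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "dirty data\n", 11) != 11)
		perror("write");
	/* start async writeback of the first 4KB, do not wait for it */
	if (sync_file_range(fd, 0, 4096, SYNC_FILE_RANGE_WRITE) < 0)
		perror("sync_file_range");
	close(fd);
	return 0;
}
---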
Historically, a buffer_head was used to map a single block within a page, and of
course as the unit of I/O through the filesystem and block layers.
Nowadays the basic I/O unit is the bio, and buffer_heads are used for
extracting block mappings (via a get_block_t call), for tracking state within
a page (via a page_mapping) and for wrapping bio submission for backward
compatibility reasons (e.g. submit_bh).
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
set_buffer_mapped(bh);
bh->b_bdev = sb->s_bdev;
bh->b_blocknr = block;
bh->b_size = sb->s_blocksize;
}
The single block that a bh represents has the block size of the filesystem,
not of the block device.
In the block layer, sectors are still counted in 512-byte units, the
traditional sector size, but the real logical block size of the storage
hardware is variable. Look at the nvme driver code:
nvme_setup_rw
---
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
-> (sector >> (ns->lba_shift - 9))
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
---
What if the logical block size of the device is larger than 4K ?
There can be one or more bhs in one page, as we can see in
alloc_page_buffers:
---
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) { // size here is blocksize of fs
bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
bh->b_size = size;
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
}
return head;
---
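A compilable sketch of the loop above: for a given fs block size, how many bhs
hang off one page and at what offsets ("bh" here is just a count, not the
kernel structure):
---
#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	long size = 1024;		/* fs block size, e.g. mke2fs -b 1024 */
	long offset = PAGE_SIZE;
	int nr = 0;

	while ((offset -= size) >= 0) {	/* same walk as alloc_page_buffers() */
		printf("bh[%d]: page offset %ld, b_size %ld\n", nr, offset, size);
		nr++;
	}
	printf("%d buffer_heads per page\n", nr);
	return 0;
}
---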
Look at the submit_bh_wbc to know the basic steps:
---
bio = bio_alloc(GFP_NOIO, 1);
if (wbc) {
wbc_init_bio(wbc, bio);
wbc_account_io(wbc, bh->b_page, bh->b_size);
}
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
BUG_ON(bio->bi_iter.bi_size != bh->b_size);
bio->bi_end_io = end_bio_bh_io_sync;
-> bh->b_end_io(bh, !bio->bi_status);
bio->bi_private = bh;
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(op, bio);
if (buffer_meta(bh))
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
---
enum bh_state_bits defines the state bits of a bh, which are stored in
bh->b_state. There are 3 macros that define the set, clear and test operations
for these bits; they are defined in include/linux/buffer_head.h.
Let's look at how they are used in the fs.
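As background, the macros roughly follow this pattern; here is a userspace
rendering (the real BUFFER_FNS() in include/linux/buffer_head.h uses atomic
set_bit/clear_bit/test_bit on bh->b_state; this is only a model):
---
#include <stdio.h>

enum bh_state_bits { BH_Uptodate, BH_Dirty, BH_Lock, BH_Mapped };

struct buffer_head { unsigned long b_state; };

#define BUFFER_FNS(bit, name)						\
static void set_buffer_##name(struct buffer_head *bh)			\
{ bh->b_state |= 1UL << (bit); }					\
static void clear_buffer_##name(struct buffer_head *bh)		\
{ bh->b_state &= ~(1UL << (bit)); }					\
static int buffer_##name(struct buffer_head *bh)			\
{ return !!(bh->b_state & (1UL << (bit))); }

BUFFER_FNS(BH_Uptodate, uptodate)
BUFFER_FNS(BH_Dirty, dirty)

int main(void)
{
	struct buffer_head bh = { 0 };

	set_buffer_uptodate(&bh);
	set_buffer_dirty(&bh);
	printf("uptodate=%d dirty=%d\n", buffer_uptodate(&bh), buffer_dirty(&bh));
	clear_buffer_dirty(&bh);
	clear_buffer_uptodate(&bh);
	return 0;
}
---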
BH_Uptodate means the bh contains valid data.
When a read/write operation completes successfully, BH_Uptodate is set on the
bh, otherwise it is cleared.
BH_Lock: when a bh is under IO, it is locked.
A very classical scenario is __bread_slow:
---
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
return bh;
} else {
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, 0, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
}
brelse(bh);
return NULL;
---
end_buffer_read_sync
-> __end_buffer_read_notouch
---
if (uptodate) {
set_buffer_uptodate(bh);
} else {
/* This happens, due to failed read-ahead attempts. */
clear_buffer_uptodate(bh);
}
unlock_buffer(bh)
---
BH_Dirty is set in mark_buffer_dirty.
Besides setting BH_Dirty, it also:
- set page dirty
- __mark_inode_dirty(inode, I_DIRTY_PAGES)
__mark_inode_dirty will hand over this inode to writeback, then the
dirty data will be written to disk.
The BH_Dirty is usually cleared before IO.
For example:
---
lock_buffer(bh);
clear_buffer_dirty(bh);
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(REQ_OP_WRITE, 0, bh);
wait_on_buffer(bh);
---
BH_Mapped: the bh has a disk mapping; in other words, this bh corresponds to a
block on disk. It is usually set by get_block.
Look at the following combinations of Mapped and Uptodate:
Mapped Uptodate
No No "unknown" - must do get_block()
No Yes "hole" - zero-filled (no associated block on disk image)
Yes No "allocated" - allocated on disk, not read in
Yes Yes "valid" - allocated and up-to-date in memory.
BH_Async_Read/BH_Async_Write: the bh->b_end_io is end_buffer_async_read/write.
dcache, dentry cache, directory entry cache.
A dentry's core job is to represent a directory or file in the filesystem and
to cache the mapping between the file/directory name and the associated inode,
which contains the core operations of the filesystem.
The dentries encode the fs tree structure and the names of the files.
The main parts of a dentry are its name, its parent and its inode (d_name,
d_parent, d_inode).
The path walking is mainly done in link_path_walk, let's look at the skeleton of
it.
for(;;) {
...
hash_len = hash_name(nd->path.dentry, name);
hash_name calculates the length and hash of the path component and packs them
as hash_len = (len << 32) | hash (see the small sketch after this skeleton).
The hash value is calculated from the pointer of the parent dentry and the
component name.
...
nd->last.hash_len = hash_len;
nd->last.name = name;
nd->last_type = type;
nd->last is the name component we are currently walking.
link_path_walk leaves the last component of the path to do_last.
name += hashlen_len(hash_len);
if (!*name)
goto OK;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
name++;
} while (unlikely(*name == '/'));
if (unlikely(!*name)) {
...
} else {
/* not the last component */
err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
}
...
}
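Side note on the hash_len packing mentioned in the skeleton: it mirrors
hashlen_create() in include/linux/stringhash.h. The hash function below is a
toy; the real one also mixes in the parent dentry pointer.
---
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define hashlen_create(hash, len)	((uint64_t)(len) << 32 | (uint32_t)(hash))
#define hashlen_hash(hashlen)		((uint32_t)(hashlen))
#define hashlen_len(hashlen)		((uint32_t)((hashlen) >> 32))

int main(void)
{
	const char *component = "Desktop";
	uint32_t hash = 5381;			/* toy hash, not the kernel's */
	const char *p;

	for (p = component; *p; p++)
		hash = hash * 33 + (unsigned char)*p;

	uint64_t hash_len = hashlen_create(hash, strlen(component));
	printf("len=%u hash=%08x\n",
	       (unsigned)hashlen_len(hash_len),
	       (unsigned)hashlen_hash(hash_len));
	return 0;
}
---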
walk_component will mainly do 3 things:
1. try to get the dentry in the cache
lookup_fast
-> __d_lookup(&nd->path.dentry, &nd->last)
-> get hash list by d_hash(name->hash)
2. if not in cache, try to get it from fs
lookup_slow
-> __lookup_slow
-> d_alloc_parallel // allocate dentry
-> inode->i_op->lookup
// this may cause some IO to read in the filesystem metadata of the directory
// and the inode.
3. follow_managed
mountpoint will be resolved here.
Refer to Documentation/filesystems/path-lookup.txt
The dcache is used to speed up looking up the inode associated with a path
name. These lookups can come from multiple cores concurrently and frequently,
so the locking scheme is very important. Let's look into the locking of the
dentry cache next and find out how it improves performance.
Documentation/filesystems/path-lookup.txt keeps saying that we "would like to
do path walking without taking locks or reference counts of intermediate
dentries along the path." Why ?
Look into the path lookup process,
[0] [2] [4]
+---+ +---------+ +-----------+
| v | v | v
/home/will/Desktop/wangjianchao/source_code/linux-stable/Makefile
| ^ | ^ | ^
+------+ +------------+ +-----------+
[1] [3] [5]
[0] dentry of "/", "home"
[1] dentry of "home", "will"
[2] dentry of "will", "Desktop"
[3] dentry of "Desktop", "wangjianchao"
[4] dentry of "wangjianchao", "source_code"
[5] dentry of "source_code", "Makefile"
walk_component is executed for [0] ~ [5], and lookup_fast is invoked every
time. Each time, the component dentry's d_lock has to be taken to serialize
access to the dentry.
__d_lookup
---
spin_lock(&dentry->d_lock);
if (dentry->d_parent != parent)
goto next;
if (d_unhashed(dentry))
goto next;
if (!d_same_name(dentry, parent, name))
goto next;
dentry->d_lockref.count++;
found = dentry;
spin_unlock(&dentry->d_lock);
---
The contention on the d_lock of the dentries of "home", "will" and "Desktop"
can be very high. On a system with a lot of cores, the dentry cache can become
a scalability problem for workloads which perform lots of lookups.
Currently, there are two path walking modes:
ref-walk is the traditional way of performing a dcache lookup: d_lock is used
to serialize the concurrent modifications to the dentry, and a reference count
is taken on it. This d_lock and reference count are the 'storing to shared
data' complained about above.
The reference is released here:
step_into
-> path_to_nameidata
---
if (!(nd->flags & LOOKUP_RCU)) {
dput(nd->path.dentry);
if (nd->path.mnt != path->mnt)
mntput(nd->path.mnt);
}
nd->path.mnt = path->mnt;
nd->path.dentry = path->dentry;
---
rcu-walk uses seqcount based dentry lookups, and can perform lookups of
intermediate elements without any stores to shared data in the dentry or inode.
What does ref-walk store ? The d_lock serializes the modifications to the
dentry from rename and others, and the reference count prevents the dentry
from being released; in ref-walk, we need to take a reference count on every
intermediate component and put it when we step into the next one:
walk_component
-> lookup_fast
-> __d_lookup // get the reference
-> step_into
-> path_to_nameidata
-> dput(nd->path.dentry) // if not LOOKUP_RCU
rcu-walk kills both of them. Who writes dentry->d_seq ? We will see the
writers further below; first, the read side:
path_openat/path_lookupat...
-> path_init
-> rcu_read_lock // if LOOKUP_RCU
-> link_path_walk
-> walk_component
-> lookup_fast
-> __d_lookup_rcu
-> terminate_walk
-> rcu_read_unlock // if LOOKUP_RCU
So now dentry refcounts are not required for dentry persistence, because
dentry freeing employs call_rcu:
dput
-> dentry_kill
-> __dentry_kill
-> dentry_free
-> call_rcu(&dentry->d_u.d_rcu, __d_free);
The snapshot of the dentry's name, parent and inode (for child lookup) will be
protected by the per-dentry seqlock. dentry lookups recheck the sequence after
the child is found in case anything changed in the parent in the path walk.
lookup_fast
---
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
bool negative;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
---
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
unsigned seq;
seqretry:
/*
* The dentry sequence count protects us from concurrent
* renames, and thus protects parent and name fields.
*
* The caller must perform a seqcount check in order
* to do anything useful with the returned dentry.
*/
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
if (d_unhashed(dentry))
continue;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
...
} else {
if (dentry->d_name.hash_len != hashlen)
continue;
if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
continue;
}
*seqp = seq;
return dentry;
}
---
...
//This sequence count validates that the inode matches
//the dentry name information from lookup.
*inode = d_backing_inode(dentry);
negative = d_is_negative(dentry);
if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
return -ECHILD;
...
//This sequence count validates that the parent had no
//changes while we did the lookup of the dentry above.
if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
return -ECHILD;
*seqp = seq;
...
}
---
This d_seq ensures that if we get a dentry, it is the expected one.
Even if the dentry is renamed after this, it is still valid, because a rename
does not allocate a new dentry but just modifies and rehashes it.
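A minimal userspace model of the seqcount retry pattern that __d_lookup_rcu /
read_seqcount_retry rely on (no barriers or atomics here, just the control
flow; nothing kernel-specific):
---
#include <stdio.h>

struct dentry_model {
	unsigned seq;		/* even = stable, odd = writer in progress */
	const char *name;
};

static unsigned read_seq_begin(struct dentry_model *d)
{
	return d->seq & ~1u;	/* an odd snapshot can never match -> retry */
}

static int read_seq_retry(struct dentry_model *d, unsigned seq)
{
	return d->seq != seq;	/* changed (or odd) -> caller must retry */
}

int main(void)
{
	struct dentry_model d = { .seq = 2, .name = "Desktop" };
	const char *snapshot;
	unsigned seq;

	do {
		seq = read_seq_begin(&d);
		snapshot = d.name;	/* take the snapshot without any lock */
	} while (read_seq_retry(&d, seq));

	printf("looked up '%s' at seq %u\n", snapshot, seq);
	return 0;
}
---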
There are two points why a seqcount is better than a spinlock in a read-mostly
scenario. The following are the places that write dentry->d_seq.
---
raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
raw_write_seqcount_end(&dentry->d_seq);
---
The block above is dentry_unlink_inode, called from __dentry_kill and
d_delete. The next one is __d_instantiate:
---
spin_lock(&dentry->d_lock);
/*
* Decrement negative dentry count if it was in the LRU list.
*/
if (dentry->d_flags & DCACHE_LRU_LIST)
this_cpu_dec(nr_dentry_negative);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
fsnotify_update_flags(dentry);
spin_unlock(&dentry->d_lock);
---
__d_add
---
if (inode) {
unsigned add_flags = d_flags_for_inode(inode);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
fsnotify_update_flags(dentry);
}
---
__d_move
---
write_seqcount_begin(&dentry->d_seq);
write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
/* unhash both */
if (!d_unhashed(dentry))
___d_drop(dentry);
if (!d_unhashed(target))
___d_drop(target);
/* ... and switch them in the tree */
dentry->d_parent = target->d_parent;
if (!exchange) {
copy_name(dentry, target);
target->d_hash.pprev = NULL;
dentry->d_parent->d_lockref.count++;
if (dentry != old_parent) /* wasn't IS_ROOT */
WARN_ON(!--old_parent->d_lockref.count);
} else {
target->d_parent = old_parent;
swap_names(dentry, target);
list_move(&target->d_child, &target->d_parent->d_subdirs);
__d_rehash(target);
fsnotify_update_flags(target);
}
list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
__d_rehash(dentry);
fsnotify_update_flags(dentry);
fscrypt_handle_d_move(dentry);
write_seqcount_end(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
---
There are two parts of the dentry lookup, the fast path and the slow path.
Let's look at the slow path here.
Quote from https://lwn.net/Articles/685108/:
All directory operations are done with the inode mutex (i_mutex) held, which prevents anything else
from touching that directory. But the most common operation, lookup, is non-destructive, so there is
no real conceptual reason to stop it from happening in parallel.
The typical scenario could be
CPU0 CPU1 CPU2 CPU3 CPU4
T0 T1 T2 T3 T4
\ \ | / /
\ \ | / /
\ \ | / /
/var/log/
T0 T1 T2 T3 T4
If the dentries that T0 ~ T4 want happen to be not in memory, all of them have
to invoke lookup_slow.
If the lock here were a mutex, the performance would be very bad.
So a rw_semaphore was introduced to replace the mutex.
static struct dentry *lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
struct inode *inode = dir->d_inode;
struct dentry *res;
inode_lock_shared(inode);
-> down_read(&inode->i_rwsem);
res = __lookup_slow(name, dir, flags);
inode_unlock_shared(inode);
return res;
}
But there is a problem: the mutex currently protects the directory entry (dentry).
A lookup operation can cause dentries to be created, which can lead to races if two
dentries are created for the same name.
How is this handled ?
Look at d_alloc_parallel.
---
struct hlist_bl_head *b = in_lookup_hash(parent, hash);
// allocate a dentry structure here.
struct dentry *new = d_alloc(parent, name);
retry:
rcu_read_lock();
r_seq = read_seqbegin(&rename_lock);
// look up a dentry with (parent, name) in the dentry hash;
// someone else could be creating the same one concurrently.
dentry = __d_lookup_rcu(parent, name, &d_seq);
if (unlikely(dentry)) {
...
// anything changes on the dentry ?
if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
rcu_read_unlock();
dput(dentry);
goto retry;
}
rcu_read_unlock();
dput(new);
return dentry;
}
if (unlikely(read_seqretry(&rename_lock, r_seq))) {
rcu_read_unlock();
goto retry;
}
hlist_bl_lock(b);
// A spin lock on the hash bucket.
// Only one task can enter this critical section at a time, namely, only one
// of the concurrent lookups with the same (parent, name) pair can add its
// dentry to the in-lookup hash; the others have to wait. When they enter
// this critical section, a dentry with the same (parent, name) pair is
// already there.
// At that moment, there are 2 cases:
// - the dentry is still in lookup, indicating inode->i_op->lookup is ongoing;
//   we have to wait.
// - otherwise, the lookup has been completed and we can return this dentry
//   directly.
hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
if (dentry->d_name.hash != hash)
continue;
if (dentry->d_parent != parent)
continue;
if (!d_same_name(dentry, parent, name))
continue;
hlist_bl_unlock(b);
/* now we can try to grab a reference */
if (!lockref_get_not_dead(&dentry->d_lockref)) {
rcu_read_unlock();
goto retry;
}
rcu_read_unlock();
/*
* somebody is likely to be still doing lookup for it;
* wait for them to finish
*/
spin_lock(&dentry->d_lock);
d_wait_lookup(dentry);
if (unlikely(dentry->d_name.hash != hash))
goto mismatch;
if (unlikely(dentry->d_parent != parent))
goto mismatch;
if (unlikely(d_unhashed(dentry)))
goto mismatch;
if (unlikely(!d_same_name(dentry, parent, name)))
goto mismatch;
/* OK, it *is* a hashed match; return it */
spin_unlock(&dentry->d_lock);
dput(new);
return dentry;
}
rcu_read_unlock();
/* we can't take ->d_lock here; it's OK, though. */
new->d_flags |= DCACHE_PAR_LOOKUP; // dentry in-lookup is set here.
new->d_wait = wq;
hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
hlist_bl_unlock(b);
return new;
---
grow pagecache
pagecache_get_page
-> find_get_entry
-> __page_cache_alloc
-> add_to_page_cache_lru
-> __add_to_page_cache_locked
-> lru_cache_add
-> __lru_cache_add
---
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
^^^^^^^^^^^ [1]
__pagevec_lru_add(pvec);
^^^^^^^^^^^^^^^^^^^^^^^^ [2]
put_cpu_var(lru_add_pvec);
---
The interesting thing here is that the page is first added to a per-cpu
pagevec. If the per-cpu pagevec is full, __pagevec_lru_add drains all of its
pages into the lru list in one go.
__pagevec_lru_add
-> __pagevec_lru_add_fn
---
SetPageLRU(page);
smp_mb();
if (page_evictable(page)) {
lru = page_lru(page);
} else {
...
}
add_page_to_lru_list(page, lruvec, lru);
---
When draining pages into the lru, we need to select an lru list for each page.
This is done by page_lru:
static __always_inline enum lru_list page_lru(struct page *page)
{
enum lru_list lru;
if (PageUnevictable(page))
lru = LRU_UNEVICTABLE;
else {
lru = page_lru_base_type(page);
if (PageActive(page))
lru += LRU_ACTIVE;
}
return lru;
}
enum lru_list {
LRU_INACTIVE_ANON = LRU_BASE,
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
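Side note: from userspace, mincore() can tell whether a given file's pages are
currently in the page cache (i.e. sitting on these file LRU lists). A sketch
with minimal error handling; the path is only an example:
---
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/etc/hosts";	/* example file */
	int fd = open(path, O_RDONLY);
	struct stat st;
	long psz = sysconf(_SC_PAGESIZE);
	size_t pages, resident = 0, i;
	unsigned char *vec;
	void *addr;

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;
	pages = (st.st_size + psz - 1) / psz;
	vec = malloc(pages);
	addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (!vec || addr == MAP_FAILED)
		return 1;
	if (mincore(addr, st.st_size, vec) == 0) {
		for (i = 0; i < pages; i++)
			resident += vec[i] & 1;
		printf("%s: %zu of %zu pages in the page cache\n",
		       path, resident, pages);
	}
	munmap(addr, st.st_size);
	free(vec);
	close(fd);
	return 0;
}
---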
meminfo_proc_show
---
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
cached = global_node_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
if (cached < 0)
cached = 0;
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
show_val_kb(m, "MemAvailable: ", available);
show_val_kb(m, "Buffers: ", i.bufferram);
---
What needs to be noticed here is Buffers:
bufferram comes from nr_blockdev_pages
---
struct block_device *bdev;
long ret = 0;
spin_lock(&bdev_lock);
list_for_each_entry(bdev, &all_bdevs, bd_list) {
ret += bdev->bd_inode->i_mapping->nrpages;
}
spin_unlock(&bdev_lock);
return ret;
---
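A quick userspace check of that Buffers value (it is just the page cache of
the block device inodes, as shown above):
---
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "Buffers:", 8) ||
		    !strncmp(line, "Cached:", 7))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
---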
block_read_full_page() and __block_write_full_page() will both create
buffer_heads for the page.
create_page_buffers()
-> create_empty_buffers()
-> attach_page_buffers()
-> SetPagePrivate()
-> set_page_private()
The truth about the page lock and the bh lock.
In the process of read operations:
do_generic_file_read()
-> page_cache_sync_readahead() // if page is not present
-> ondemand_readahead()
-> ra_submit()
-> __do_page_cache_readahead()
-> read_pages()
-> mapping->a_ops->readpages()
ext4_mpage_readpages()
-> add_to_page_cache_lru()
-> __set_page_locked() // page is locked ------> Here
-> block_read_full_page() // if page has buffers
-> lock_buffer() // the buffer_head is locked -----> Here
-> mark_buffer_async_read()
//bh->b_end_io = end_buffer_async_read
-> submit_bh()
-> !PageUptodate() && !trylock_page() , go to page_not_up_to_date
-> lock_page_killable()
-> if PageUptodate(), unlock_page() and goto page_ok
Where are the page and buffer_head unlocked ?
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
struct page *page;
int page_uptodate = 1;
BUG_ON(!buffer_async_read(bh));
page = bh->b_page;
if (uptodate) {
set_buffer_uptodate(bh);
} else {
clear_buffer_uptodate(bh);
buffer_io_error(bh, ", async page read");
SetPageError(page);
}
/*
* Be _very_ careful from here on. Bad things can happen if
* two buffer heads end IO at almost the same time and both
* decide that the page is now completely done.
*/
first = page_buffers(page);
local_irq_save(flags);
bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
do {
if (!buffer_uptodate(tmp))
page_uptodate = 0;
if (buffer_async_read(tmp)) {
BUG_ON(!buffer_locked(tmp)); //This could prove that the buffer_head is locked during the read process
goto still_busy;
}
tmp = tmp->b_this_page;
} while (tmp != bh);
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
local_irq_restore(flags);
/*
* If none of the buffers had errors and they are all
* uptodate then we can set the page uptodate.
*/
if (page_uptodate && !PageError(page))
SetPageUptodate(page);
unlock_page(page);
return;
still_busy:
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
local_irq_restore(flags);
return;
}
If all the buffers in that page are uptodate, the page is set uptodate and
unlocked.
We can see that both the page and the buffer_head are locked during the read
process. The locks keep the page exclusive because the device needs to write
data into the page through DMA.
What about the write operations ?
The write process is divided into two parts.
1> write the user data into page cache
generic_perform_write()
-> a_ops->write_begin()
ext4_write_begin()
-> grab_cache_page_write_begin()
-> pagecache_get_page()
-> find_get_entry()
If get
-> lock_page() //page is locked
otherwise
-> add_to_page_cache_lru()
-> __set_page_locked() //page is locked
-> unlock_page()
-> ext4_journal_start() // About why does unlock_page() before the
ext4_journal_start(), please refer to the comment in ext4_write_begin()
-> lock_page() // the page is relocked
-> wait_for_stable_page()
-> iov_iter_copy_from_user_atomic()
-> a_ops->write_end()
-> block_write_end()
-> __block_commit_write()
-> set_buffer_uptodate()
-> mark_buffer_dirty()
-> SetPageUptodate() // if no partial
-> unlock_page() // page is unlocked
The page lock keeps the page exclusive from other operations while the user
data is being copied into it.
2> write back the dirty page
ext4_writepages()
-> blk_start_plug()
-> write_cache_pages() //Go here in data=journal mode because this mode
does not support delayed allocation. We use this branch to demonstrate the
page and bh locking because I couldn't easily find where lock_page is taken
in the other branch.
-> lock_page() -------> Here
-> wait_on_page_writeback() when PageWriteback() // keep the write back atomic
-> clear_page_dirty_for_io()
-> __writepage()
-> ext4_writepage()
-> ext4_bio_write_page()
-> set_page_writeback() //very important
-> set_buffer_async_write()
-> io_submit_add_bh()
-> ext4_io_submit()
-> io_submit_init_bio()
// set bi_end_io = ext4_end_bio()
-> unlock_page() ----> Here
-> blk_finish_plug()
ext4_end_bio()
-> ext4_finish_bio()
-> clear_buffer_async_write()
-> end_page_writeback() // !under_io
-> test_clear_page_writeback()
-> wake_up_page(page, PG_writeback);
The page is not locked while the data is written out, but the writeback flag
is set to ensure the atomicity of the operations on the page.