readahead
writeback
BH
dcache
pagecache
fs misc
(Quote from the comment in mm/readahead.c)
The fields in struct file_ra_state represent the most-recently readahead
attempt.
|<----- async_size ---------|
|------------------- size -------------------->|
|==================#===========================|
^start ^page marked with PG_readahead
To overlap application thinking time and disk I/O time, we do `readahead pipelining':
Do not wait until the application consumed all readahead pages and stalled on the
missing page at readahead_index; Instead, submit an asynchronous readahead I/O as
soon as there are only async_size pages left in the readahead window.
Normally async_size will be equal to size, for maximum pipelining.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ondemand_readahead
---
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size; //for maximum pipelining
goto readit;
}
---
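As a side note, here is a tiny userspace model of how the window above advances
and grows. It only mirrors my reading of get_next_ra_size() in mm/readahead.c
(grow fast while small, then double, capped at max); the struct and numbers are
simplified, not the kernel's.
---
#include <stdio.h>

/* simplified stand-in for struct file_ra_state */
struct ra_state { unsigned long start, size, async_size; };

static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	if (cur < max / 16)
		return 4 * cur;		/* small window: grow aggressively */
	if (cur <= max / 2)
		return 2 * cur;		/* otherwise double ... */
	return max;			/* ... until capped at max */
}

int main(void)
{
	unsigned long max = 128;	/* 128 pages = 512KB, a common default */
	struct ra_state ra = { .start = 0, .size = 4, .async_size = 4 };
	int i;

	for (i = 0; i < 6; i++) {
		printf("start=%lu size=%lu async_size=%lu PG_readahead at %lu\n",
		       ra.start, ra.size, ra.async_size,
		       ra.start + ra.size - ra.async_size);
		/* what ondemand_readahead() does when the marked page is hit */
		ra.start += ra.size;
		ra.size = next_ra_size(ra.size, max);
		ra.async_size = ra.size;	/* for maximum pipelining */
	}
	return 0;
}
---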
ra_submit
---
return __do_page_cache_readahead(mapping, filp,
ra->start, ra->size, ra->async_size);
---
__do_page_cache_readahead
---
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
if (page_offset > end_index)
break;
page = xa_load(&mapping->i_pages, page_offset);
...
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
nr_pages++;
}
if (nr_pages)
read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
---
generic_file_buffered_read
---
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
---
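By the way, userspace can also kick this readahead machinery explicitly;
posix_fadvise(POSIX_FADV_WILLNEED) and the Linux readahead(2) syscall both end
up in the page cache readahead code. A minimal sketch (the file path is only an
example):
---
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/var/log/syslog", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* hint: we will need the first 1MB soon -> start async readahead now */
	posix_fadvise(fd, 0, 1 << 20, POSIX_FADV_WILLNEED);
	/* or the Linux-specific syscall, same effect for the given range */
	if (readahead(fd, 0, 1 << 20) < 0)
		perror("readahead");
	close(fd);
	return 0;
}
---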
When is the REQ_RAHEAD flag set ?
In the block layer, REQ_RAHEAD is only a hint, not that important.
The aops.readpages callback is only invoked from read_pages.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ext4_readpages
-> ext4_mpage_readpages //is_readahead is true
---
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
fscrypt_release_ctx(ctx);
goto set_error_page;
}
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio->bi_private = ctx;
bio_set_op_attrs(bio, REQ_OP_READ,
is_readahead ? REQ_RAHEAD : 0);
---
blkdev_readpages
-> mpage_readpages
---
struct mpage_readpage_args args = {
.get_block = get_block,
.is_readahead = true,
};
unsigned page_idx;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = lru_to_page(pages);
prefetchw(&page->flags);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping,
page->index,
readahead_gfp_mask(mapping))) {
args.page = page;
args.nr_pages = nr_pages - page_idx;
args.bio = do_mpage_readpage(&args);
}
put_page(page);
}
BUG_ON(!list_empty(pages));
if (args.bio)
mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
---
All of the common filesystems should provide this aops.readpages callback.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
sb_breadahead
-> __breadahead(sb->s_bdev, block, sb->s_blocksize);
---
struct buffer_head *bh = __getblk(bdev, block, size);
if (likely(bh)) {
ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
brelse(bh);
}
---
sb_breadahead is used by ext4 to read ahead metadata:
__ext4_get_inode_loc
---
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
__u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
table = ext4_inode_table(sb, gdp);
/* s_inode_readahead_blks is always a power of 2 */
b = block & ~((ext4_fsblk_t) ra_blks - 1);
if (table > b)
b = table;
end = b + ra_blks;
num = EXT4_INODES_PER_GROUP(sb);
if (ext4_has_group_desc_csum(sb))
num -= ext4_itable_unused_count(sb, gdp);
table += num / inodes_per_block;
if (end > table)
end = table;
while (b <= end)
sb_breadahead(sb, b++);
}
---
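The b = block & ~((ext4_fsblk_t) ra_blks - 1) above is the usual power-of-two
round-down trick. A small check of the arithmetic (assuming the default
s_inode_readahead_blks of 32; the block number is made up):
---
#include <stdio.h>

int main(void)
{
	unsigned long long ra_blks = 32;	/* must be a power of 2 */
	unsigned long long block = 1234567;	/* block holding the inode */
	unsigned long long b = block & ~(ra_blks - 1);

	/* readahead window aligned to ra_blks, containing 'block' */
	printf("block %llu -> window [%llu, %llu)\n", block, b, b + ra_blks);
	return 0;
}
---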
blk_init_request_from_bio
---
if (bio->bi_opf & REQ_RAHEAD)
req->cmd_flags |= REQ_FAILFAST_MASK;
---
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
check_should_bypass
---
/*
* Flag for bypass if the IO is for read-ahead or background,
* unless the read-ahead request is for metadata
* (eg, for gfs2 or xfs).
*/
if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
!(bio->bi_opf & (REQ_META|REQ_PRIO)))
goto skip;
---
dm_make_request
---
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
dm_put_live_table(md, srcu_idx);
if (!(bio->bi_opf & REQ_RAHEAD))
queue_io(md, bio);
else
bio_io_error(bio);
return ret;
}
---
raid5_make_request
---
sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_opf & REQ_RAHEAD), 0);
^^^^^^^^^^^^^^^^^^^^^^^^^
is nonblock
---
Since fs metadata goes through the page cache of the block device, could it
use this readahead as well ?
Let's take some examples from ext4.
As we will see, fs metadata doesn't use the generic readahead directly; the fs
can implement its own readahead instead.
__ext4_get_inode_loc
---
/*
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
// sb_getblk just sets up the page in the pagecache and its bhs
bh = sb_getblk(sb, block);
...
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
...
make_io:
...
/*
* There are other valid inodes in the buffer, this inode
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
trace_ext4_load_inode(inode);
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
...
}
---
ext4_read_inode_bitmap
---
desc = ext4_get_group_desc(sb, block_group, NULL);
...
bitmap_blk = ext4_inode_bitmap(sb, desc);
...
bh = sb_getblk(sb, bitmap_blk);
...
if (bitmap_uptodate(bh))
goto verify;
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
goto verify;
}
...
/*
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
...
---
ext4_readdir
---
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
...
cond_resched();
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
pgoff_t index = map.m_pblk >>
(PAGE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
&file->f_ra, file,
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
}
...
}
---
There are two points here,
1. ext4_bread
ext4_bread
---
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
if (!bh || buffer_uptodate(bh))
return bh;
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
---
2. there is indeed readahead here
page_cache_sync_readahead is invoked here and its parameters are very
interesting:
page_cache_sync_readahead(
	sb->s_bdev->bd_inode->i_mapping,
	&file->f_ra, file,
	index, 1);
the address_space belongs to the block device, while the file_ra_state belongs
to the directory file.
ext4_create
-> ext4_add_nondir
-> ext4_add_entry
---
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
bh = ext4_read_dirblock(dir, block, DIRENT);
-> __ext4_read_dirblock
-> ext4_bread
retval = add_dirent_to_buf(handle, &fname, dir, inode,
NULL, bh);
if (retval != -ENOSPC)
goto out;
if (blocks == 1 && !dx_fallback &&
ext4_has_feature_dir_index(sb)) {
retval = make_indexed_dir(handle, &fname, dir,
inode, bh);
bh = NULL; /* make_indexed_dir releases bh */
goto out;
}
brelse(bh);
}
bh = ext4_append(handle, dir, &block);
---
__ext4_find_entry
---
do {
/*
* We deal with the read-ahead logic here.
*/
if (ra_ptr >= ra_max) {
/* Refill the readahead buffer */
ra_ptr = 0;
if (block < start)
ra_max = start - block;
else
ra_max = nblocks - block;
ra_max = min(ra_max, ARRAY_SIZE(bh_use));
retval = ext4_bread_batch(dir, block, ra_max,
false /* wait */, bh_use);
...
}
if ((bh = bh_use[ra_ptr++]) == NULL)
goto next;
wait_on_buffer(bh);
...
set_buffer_verified(bh);
i = search_dirblock(bh, dir, fname,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
...
next:
if (++block >= nblocks)
block = 0;
} while (block != start);
---
ext4_bread_batch
---
for (i = 0; i < bh_count; i++)
bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
if (bhs[i] && !buffer_uptodate(bhs[i]))
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
&bhs[i]);
if (!wait)
return 0;
for (i = 0; i < bh_count; i++)
if (bhs[i])
wait_on_buffer(bhs[i]);
---
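The pattern above (fire all the reads, then wait) can be mimicked from
userspace with posix_fadvise(POSIX_FADV_WILLNEED) followed by the blocking
reads; just a sketch, the path and block count are examples:
---
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int i, fd = open("/var/log/syslog", O_RDONLY);

	if (fd < 0)
		return 1;
	/* step 1: fire the readahead for all "blocks" without waiting */
	for (i = 0; i < 8; i++)
		posix_fadvise(fd, (off_t)i * 4096, 4096, POSIX_FADV_WILLNEED);
	/* step 2: consume them; ideally these now hit the page cache */
	for (i = 0; i < 8; i++)
		if (pread(fd, buf, sizeof(buf), (off_t)i * 4096) < 0)
			perror("pread");
	close(fd);
	return 0;
}
---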
Look at the comment of balance_dirty_pages
balance_dirty_pages() must be called by processes which are generating dirty
data. It looks at the number of dirty pages in the machine and will force
^^^^^^^^^^
the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If we're over `background_thresh' then the writeback threads are woken to
perform some writeout.
In the normal case, writeback can avoid blocking the application, but what
about the case of continuous write IO ?
^^^^^^^^^^^^^^^^^^^^
pagecache
write +-------+
application -------> | Dirty |
+-------+
| Dirty |
+-------+
| Dirty | __________
+-------+ writeback / /|
| Dirty | -----------> /_________/ |
+-------+ | | /
|_________|/
With the dirty balancing mechanism here, the bandwidth of the application's
writes is actually limited to the disk bandwidth. So regarding latency, the
writeback here is not so helpful.
But combined with delayed allocation, writeback can avoid fragmentation.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Anyway, let's look at how it works.
generic_perform_write
---
do {
...
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
...
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status < 0))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
iov_iter_advance(i, copied);
...
pos += copied;
written += copied;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
---
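balance_dirty_pages_ratelimited() throttles against thresholds derived from the
vm sysctls. A quick way to inspect those knobs from userspace (just a sketch,
minimal error handling):
---
#include <stdio.h>

static void show(const char *name)
{
	char path[128], val[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/vm/%s", name);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(val, sizeof(val), f))
		printf("%-26s %s", name, val);
	fclose(f);
}

int main(void)
{
	show("dirty_background_ratio");
	show("dirty_ratio");
	show("dirty_expire_centisecs");
	show("dirty_writeback_centisecs");
	return 0;
}
---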
queue_io is used to move the expired dirty inodes from wb->b_dirty or
wb->b_dirty_time to wb->b_io.
move_expired_inodes
---
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
else if (!work->for_sync) {
expire_time = jiffies - (dirtytime_expire_interval * HZ);
older_than_this = &expire_time;
}
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
// If the dirty time is before older_than_this, it will be moved.
^^^^^^^^^^^^^^^^^^^^^^
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_io_list, &tmp);
...
}
---
So how is older_than_this set ?
The EXPIRE_DIRTY_ATIME case below is only used for b_dirty_time:
queue_io
---
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
EXPIRE_DIRTY_ATIME, work);
---
In this case,
move_expired_inodes
---
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
else if (!work->for_sync) {
expire_time = jiffies - (dirtytime_expire_interval * HZ);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
older_than_this = &expire_time;
}
---
Only the inodes that have been dirty for more than dirtytime_expire_interval
seconds (12 hours by default) are moved.
Get more from the comment of dirtytime_expire_interval.
wb_writeback
---
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
^^^^^^^^^^^^^^^^^^^^^^^^^^
30 seconds
}
---
The periodic (kupdate) writeback will write back inodes that have been dirty
for more than 30s.
wb_writeback
---
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
oldest_jif = jiffies;
^^^^^^^^^^^^^^^^^^^^^
Now !
---
The interesting question is when a background writeback is started.
balance_dirty_pages
-> wb_start_background_writeback
wb_workfn
-> wb_do_writeback
-> wb_check_background_flush
---
if (wb_over_bg_thresh(wb)) {
^^^^^^^^^^^^^^^^^^^^^^
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.for_background = 1,
.range_cyclic = 1,
.reason = WB_REASON_BACKGROUND,
};
return wb_writeback(wb, &work);
}
---
For all the other types of writeback (vmscan, sync ...), the dirty time is not
checked; the inodes are just written out directly.
mark_buffer_dirty
---
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
struct address_space *mapping = NULL;
lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
mapping = page_mapping(page);
if (mapping)
__set_page_dirty(page, mapping, 0);
^^^^^^^^^^^^^^^^
// set PAGECACHE_TAG_DIRTY on the page in the pagecache tree
}
unlock_page_memcg(page);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
^^^^^^^^^^^^^^^^^^
// hand the inode to writeback core
}
---
regular data
ext4_write_end
-> block_write_end
-> __block_commit_write
-> mark_buffer_dirty
metadata
__jbd2_journal_refile_buffer
-> __jbd2_journal_unfile_buffer
-> __jbd2_journal_temp_unlink_buffer
-> mark_buffer_dirty
__iomap_write_end
-> iomap_set_page_dirty
---
lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
if (newly_dirty)
__set_page_dirty(page, mapping, 0);
unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
---
The bdi is per-queue instead of per-fs.
#STEP 0
blk_alloc_queue_node
---
q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
if (!q->backing_dev_info)
goto fail_split;
q->stats = blk_alloc_queue_stats();
if (!q->stats)
goto fail_stats;
q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->backing_dev_info->name = "block";
q->node = node_id;
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
laptop_mode_timer_fn, 0);
---
#STEP 1
__blkdev_get
---
if (!bdev->bd_openers) {
first_open = true;
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
bdev->bd_partno = partno;
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
if (!bdev->bd_part)
goto out_clear;
ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
...
}
if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
set_init_blocksize(bdev);
}
...
} else {
struct block_device *whole;
whole = bdget_disk(disk, 0);
ret = -ENOMEM;
if (!whole)
goto out_clear;
BUG_ON(for_part);
ret = __blkdev_get(whole, mode, 1);
if (ret)
goto out_clear;
bdev->bd_contains = whole;
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
ret = -ENXIO;
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
set_init_blocksize(bdev);
}
if (bdev->bd_bdi == &noop_backing_dev_info)
bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
---
#STEP 2
mount_bdev
---
s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
bdev);
---
static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
return 0;
}
#STEP 3
balance_dirty_pages_ratelimited
-> inode_to_bdi
---
sb = inode->i_sb;
#ifdef CONFIG_BLOCK
if (sb_is_blkdev_sb(sb))
return I_BDEV(inode)->bd_bdi;
#endif
return sb->s_bdi;
---
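Since the bdi hangs off the request_queue, its knobs show up per device; for
example the readahead window set in STEP 0 (bdi->ra_pages) is visible as
read_ahead_kb in sysfs. A sketch; "sda" is only an example device name:
---
#include <stdio.h>

int main(void)
{
	char val[64];
	FILE *f = fopen("/sys/block/sda/queue/read_ahead_kb", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(val, sizeof(val), f))
		printf("read_ahead_kb = %s", val);	/* bdi->ra_pages in KB */
	fclose(f);
	return 0;
}
---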
wb_do_writeback
The main difference between these writeback methods is how many pages can be
written out.
For example,
__writeback_inodes_sb_nr // input wb_writeback_work.nr_pages
OR
sync_inodes_sb // wb_writeback_work.nr_pages is LONG_MAX
// for_sync = 1
-> bdi_split_work_to_wbs
-> wb_queue_work
Both of the above push writeback for a whole sb (one fs).
Some filesystem interfaces will invoke them with WB_REASON_FS_FREE_SPACE.
laptop_mode_timer_fn
-> wakeup_flusher_threads_bdi
-> __wakeup_flusher_threads_bdi
-> wb_start_writeback
-> set WB_start_all
-> wb_wakeup
SYSCALL sync / pm_suspend->enter_state
-> ksys_sync
OR
shrink_inactive_list // WB_REASON_VMSCAN
-> wakeup_flusher_threads //iterate bdi_list
-> __wakeup_flusher_threads_bdi //iterate all the dirty wb
-> wb_start_writeback
-> set WB_start_all
-> wb_wakeup
These start flushing on all bdis.
__mark_inode_dirty
// If this is the first dirty inode for this bdi,
// we have to wake-up the corresponding bdi thread
// to make sure background write-back happens
// later.
-> wb_wakeup_delayed
wb_workfn
---
if (!list_empty(&wb->work_list))
wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);
---
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
5s
Actually, there is no explicit trigger for background writeback.
A wb_wakeup is enough; wb_do_writeback will then check the threshold itself.
wb_do_writeback
-> wb_check_background_flush
---
if (wb_over_bg_thresh(wb)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.for_background = 1,
.range_cyclic = 1,
.reason = WB_REASON_BACKGROUND,
};
return wb_writeback(wb, &work);
}
---
balance_dirty_pages will try to trigger background writeback.
balance_dirty_pages
-> wb_start_background_writeback
-> wb_wakeup
It will try to write out the dirtied inodes that have expired.
wb_writeback
---
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
...
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
}
...
if (list_empty(&wb->b_io))
queue_io(wb, work);
-> move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
---
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
...
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_io_list, &tmp);
moved++;
...
}
---
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
30s
So the dirty expire time is 30s
For background writeback, wb_writeback updates oldest_jif every time, so all
of the dirtied inodes can be moved to wb->b_io.
The key point here is
wb_writeback
---
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
if (work->for_background && !wb_over_bg_thresh(wb))
break;
---
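Besides the background/periodic paths above, writeback can be kicked explicitly
from userspace for a file range; sync_file_range(..., SYNC_FILE_RANGE_WRITE)
starts writeback without waiting for completion. A hedged sketch (path and
length are examples):
---
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/testfile", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "dirty data\n", 11) != 11)
		perror("write");
	/* start async writeback of the first 4KB, do not wait for it */
	if (sync_file_range(fd, 0, 4096, SYNC_FILE_RANGE_WRITE) < 0)
		perror("sync_file_range");
	close(fd);
	return 0;
}
---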
Historically, a buffer_head was used to map a single block within a page, and of
course as the unit of I/O through the filesystem and block layers.
Nowadays the basic I/O unit is the bio, and buffer_heads are used for
extracting block mappings (via a get_block_t call), for tracking state within
a page (via a page_mapping) and for wrapping bio submission for backward
compatibility reasons (e.g. submit_bh).
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
set_buffer_mapped(bh);
bh->b_bdev = sb->s_bdev;
bh->b_blocknr = block;
bh->b_size = sb->s_blocksize;
}
The single block that a bh represents has the block size of the filesystem,
not of the block device.
In the block layer, sectors are still counted in 512-byte units, the
traditional sector size, but the real logical block size of the storage
hardware is variable. Look at the nvme driver code:
nvme_setup_rw
---
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
-> (sector >> (ns->lba_shift - 9))
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
---
What if the logical block size of the device is larger than 4K ?
There can be one or more bhs in one page, as we can see in
alloc_page_buffers:
---
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) { // size here is blocksize of fs
bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
bh->b_size = size;
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
}
return head;
---
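A compilable sketch of the loop above: for a given fs block size, how many bhs
hang off one page and at what offsets ("bh" here is just a count, not the
kernel structure):
---
#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	long size = 1024;		/* fs block size, e.g. mke2fs -b 1024 */
	long offset = PAGE_SIZE;
	int nr = 0;

	while ((offset -= size) >= 0) {	/* same walk as alloc_page_buffers() */
		printf("bh[%d]: page offset %ld, b_size %ld\n", nr, offset, size);
		nr++;
	}
	printf("%d buffer_heads per page\n", nr);
	return 0;
}
---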
Look at the submit_bh_wbc to know the basic steps:
---
bio = bio_alloc(GFP_NOIO, 1);
if (wbc) {
wbc_init_bio(wbc, bio);
wbc_account_io(wbc, bh->b_page, bh->b_size);
}
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
BUG_ON(bio->bi_iter.bi_size != bh->b_size);
bio->bi_end_io = end_bio_bh_io_sync;
-> bh->b_end_io(bh, !bio->bi_status);
bio->bi_private = bh;
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(op, bio);
if (buffer_meta(bh))
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
---
enum bh_state_bits defines the state bits of a bh, which are stored in
bh->b_state. There are 3 macros that define the set, clear and test operations
for these bits; they are defined in include/linux/buffer_head.h.
Let's look at how they are used in the fs.
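As background, the macros roughly follow this pattern; here is a userspace
rendering (the real BUFFER_FNS() in include/linux/buffer_head.h uses atomic
set_bit/clear_bit/test_bit on bh->b_state; this is only a model):
---
#include <stdio.h>

enum bh_state_bits { BH_Uptodate, BH_Dirty, BH_Lock, BH_Mapped };

struct buffer_head { unsigned long b_state; };

#define BUFFER_FNS(bit, name)						\
static void set_buffer_##name(struct buffer_head *bh)			\
{ bh->b_state |= 1UL << (bit); }					\
static void clear_buffer_##name(struct buffer_head *bh)		\
{ bh->b_state &= ~(1UL << (bit)); }					\
static int buffer_##name(struct buffer_head *bh)			\
{ return !!(bh->b_state & (1UL << (bit))); }

BUFFER_FNS(BH_Uptodate, uptodate)
BUFFER_FNS(BH_Dirty, dirty)

int main(void)
{
	struct buffer_head bh = { 0 };

	set_buffer_uptodate(&bh);
	set_buffer_dirty(&bh);
	printf("uptodate=%d dirty=%d\n", buffer_uptodate(&bh), buffer_dirty(&bh));
	clear_buffer_dirty(&bh);
	clear_buffer_uptodate(&bh);
	return 0;
}
---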
BH_Uptodate means the bh contains valid data.
When a read/write operation completes successfully, BH_Uptodate is set on the
bh, otherwise it is cleared.
BH_Lock: when a bh is under IO, it is locked.
A very classical scenario is __bread_slow:
---
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
return bh;
} else {
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, 0, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
}
brelse(bh);
return NULL;
---
end_buffer_read_sync
-> __end_buffer_read_notouch
---
if (uptodate) {
set_buffer_uptodate(bh);
} else {
/* This happens, due to failed read-ahead attempts. */
clear_buffer_uptodate(bh);
}
unlock_buffer(bh)
---
BH_Dirty is set in mark_buffer_dirty.
Besides setting BH_Dirty, it also:
- set page dirty
- __mark_inode_dirty(inode, I_DIRTY_PAGES)
__mark_inode_dirty will hand over this inode to writeback, then the
dirty data will be written to disk.
The BH_Dirty is usually cleared before IO.
For example:
---
lock_buffer(bh);
clear_buffer_dirty(bh);
get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(REQ_OP_WRITE, 0, bh);
wait_on_buffer(bh);
---
BH_Mapped: the bh has a disk mapping; in other words, this bh corresponds to a
block on disk. It is usually set by get_block.
Look at the following combinations of Mapped and Uptodate:
Mapped Uptodate
No No "unknown" - must do get_block()
No Yes "hole" - zero-filled (no associated block on disk image)
Yes No "allocated" - allocated on disk, not read in
Yes Yes "valid" - allocated and up-to-date in memory.
BH_Async_Read/BH_Async_Write: the bh->b_end_io is end_buffer_async_read/write.
dcache, dentry cache, directory entry cache.
A dentry's core job is to represent a directory or file in the filesystem and
to cache the mapping between the file/directory name and the associated inode,
which contains the core operations of the filesystem.
The dentries encode the fs tree structure and the names of the files.
The main parts of a dentry are its name, its parent and its inode (d_name,
d_parent, d_inode).
The path walking is mainly done in link_path_walk, let's look at the skeleton of
it.
for(;;) {
...
hash_len = hash_name(nd->path.dentry, name);
hash_name calculates the length and hash of the path component and packs them
as hash_len = (len << 32) | hash (see the small sketch after this skeleton).
The hash value is calculated from the pointer of the parent dentry and the
component name.
...
nd->last.hash_len = hash_len;
nd->last.name = name;
nd->last_type = type;
nd->last is the name component we are currently walking.
link_path_walk leaves the last component of the path to do_last.
name += hashlen_len(hash_len);
if (!*name)
goto OK;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
name++;
} while (unlikely(*name == '/'));
if (unlikely(!*name)) {
...
} else {
/* not the last component */
err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
}
...
}
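Side note on the hash_len packing mentioned in the skeleton: it mirrors
hashlen_create() in include/linux/stringhash.h. The hash function below is a
toy; the real one also mixes in the parent dentry pointer.
---
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define hashlen_create(hash, len)	((uint64_t)(len) << 32 | (uint32_t)(hash))
#define hashlen_hash(hashlen)		((uint32_t)(hashlen))
#define hashlen_len(hashlen)		((uint32_t)((hashlen) >> 32))

int main(void)
{
	const char *component = "Desktop";
	uint32_t hash = 5381;			/* toy hash, not the kernel's */
	const char *p;

	for (p = component; *p; p++)
		hash = hash * 33 + (unsigned char)*p;

	uint64_t hash_len = hashlen_create(hash, strlen(component));
	printf("len=%u hash=%08x\n",
	       (unsigned)hashlen_len(hash_len),
	       (unsigned)hashlen_hash(hash_len));
	return 0;
}
---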
walk_component will mainly do 3 things:
1. try to get the dentry in the cache
lookup_fast
-> __d_lookup(&nd->path.dentry, &nd->last)
-> get hash list by d_hash(name->hash)
2. if not in cache, try to get it from fs
lookup_slow
-> __lookup_slow
-> d_alloc_parallel // allocate dentry
-> inode->i_op->lookup
// this may cause some IO to read in the filesystem metadata of the directory
// and the inode.
3. follow_managed
mountpoint will be resolved here.
Refer to Documentation/filesystems/path-lookup.txt
The dcache is used to speed up looking up the inode associated with a path
name. These lookups can come from multiple cores concurrently and frequently,
so the locking scheme is very important. Let's look into the locking of the
dentry cache next and find out how it improves performance.
Documentation/filesystems/path-lookup.txt keeps saying that we "would like to
do path walking without taking locks or reference counts of intermediate
dentries along the path." Why ?
Look into the path lookup process,
[0] [2] [4]
+---+ +---------+ +-----------+
| v | v | v
/home/will/Desktop/wangjianchao/source_code/linux-stable/Makefile
| ^ | ^ | ^
+------+ +------------+ +-----------+
[1] [3] [5]
[0] dentry of "/", "home"
[1] dentry of "home", "will"
[2] dentry of "will", "Desktop"
[3] dentry of "Desktop", "wangjianchao"
[4] dentry of "wangjianchao", "source_code"
[5] dentry of "source_code", "Makefile"
walk_component is executed for [0] ~ [5], and lookup_fast is invoked every
time. Each time, the component dentry's d_lock has to be taken to serialize
access to the dentry.
__d_lookup
---
spin_lock(&dentry->d_lock);
if (dentry->d_parent != parent)
goto next;
if (d_unhashed(dentry))
goto next;
if (!d_same_name(dentry, parent, name))
goto next;
dentry->d_lockref.count++;
found = dentry;
spin_unlock(&dentry->d_lock);
---
The contention on the d_lock of the dentries of "home", "will" and "Desktop"
can be very high. On a system with a lot of cores, the dentry cache can become
a scalability problem for workloads which perform lots of lookups.
Currently, there are two path walking modes:
ref-walk is the traditional way of performing a dcache lookup: d_lock is used
to serialize the concurrent modifications to the dentry, and a reference count
is taken on it. This d_lock and reference count are the 'storing to shared
data' complained about above.
The reference is released here:
step_into
-> path_to_nameidata
---
if (!(nd->flags & LOOKUP_RCU)) {
dput(nd->path.dentry);
if (nd->path.mnt != path->mnt)
mntput(nd->path.mnt);
}
nd->path.mnt = path->mnt;
nd->path.dentry = path->dentry;
---
rcu-walk uses seqcount based dentry lookups, and can perform lookups of
intermediate elements without any stores to shared data in the dentry or inode.
What does ref-walk store ? The d_lock serializes the modifications to the
dentry from rename and others, and the reference count prevents the dentry
from being released; in ref-walk, we need to take a reference count on every
intermediate component and put it when we step into the next one:
walk_component
-> lookup_fast
-> __d_lookup // get the reference
-> step_into
-> path_to_nameidata
-> dput(nd->path.dentry) // if not LOOKUP_RCU
rcu-walk kills both of them. Who writes dentry->d_seq ? We will see the
writers further below; first, the read side:
path_openat/path_lookupat...
-> path_init
-> rcu_read_lock // if LOOKUP_RCU
-> link_path_walk
-> walk_component
-> lookup_fast
-> __d_lookup_rcu
-> terminate_walk
-> rcu_read_unlock // if LOOKUP_RCU
So now dentry refcounts are not required for dentry persistence, because
dentry freeing employs call_rcu:
dput
-> dentry_kill
-> __dentry_kill
-> dentry_free
-> call_rcu(&dentry->d_u.d_rcu, __d_free);
The snapshot of the dentry's name, parent and inode (for child lookup) will be
protected by the per-dentry seqlock. dentry lookups recheck the sequence after
the child is found in case anything changed in the parent in the path walk.
lookup_fast
---
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
bool negative;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
---
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
unsigned seq;
seqretry:
/*
* The dentry sequence count protects us from concurrent
* renames, and thus protects parent and name fields.
*
* The caller must perform a seqcount check in order
* to do anything useful with the returned dentry.
*/
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
if (d_unhashed(dentry))
continue;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
...
} else {
if (dentry->d_name.hash_len != hashlen)
continue;
if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
continue;
}
*seqp = seq;
return dentry;
}
---
...
//This sequence count validates that the inode matches
//the dentry name information from lookup.
*inode = d_backing_inode(dentry);
negative = d_is_negative(dentry);
if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
return -ECHILD;
...
//This sequence count validates that the parent had no
//changes while we did the lookup of the dentry above.
if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
return -ECHILD;
*seqp = seq;
...
}
---
This d_seq ensures that if we get a dentry, it is the expected one.
Even if the dentry is renamed after this, it is still valid, because a rename
does not allocate a new dentry but just modifies and rehashes it.
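A minimal userspace model of the seqcount retry pattern that __d_lookup_rcu /
read_seqcount_retry rely on (no barriers or atomics here, just the control
flow; nothing kernel-specific):
---
#include <stdio.h>

struct dentry_model {
	unsigned seq;		/* even = stable, odd = writer in progress */
	const char *name;
};

static unsigned read_seq_begin(struct dentry_model *d)
{
	return d->seq & ~1u;	/* an odd snapshot can never match -> retry */
}

static int read_seq_retry(struct dentry_model *d, unsigned seq)
{
	return d->seq != seq;	/* changed (or odd) -> caller must retry */
}

int main(void)
{
	struct dentry_model d = { .seq = 2, .name = "Desktop" };
	const char *snapshot;
	unsigned seq;

	do {
		seq = read_seq_begin(&d);
		snapshot = d.name;	/* take the snapshot without any lock */
	} while (read_seq_retry(&d, seq));

	printf("looked up '%s' at seq %u\n", snapshot, seq);
	return 0;
}
---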
There are two points why a seqcount is better than a spinlock in a read-mostly
scenario. The following are the places that write dentry->d_seq.
---
raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
raw_write_seqcount_end(&dentry->d_seq);
---
The block above is dentry_unlink_inode, called from __dentry_kill and
d_delete. The next one is __d_instantiate:
---
spin_lock(&dentry->d_lock);
/*
* Decrement negative dentry count if it was in the LRU list.
*/
if (dentry->d_flags & DCACHE_LRU_LIST)
this_cpu_dec(nr_dentry_negative);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
fsnotify_update_flags(dentry);
spin_unlock(&dentry->d_lock);
---
__d_add
---
if (inode) {
unsigned add_flags = d_flags_for_inode(inode);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
fsnotify_update_flags(dentry);
}
---
__d_move
---
write_seqcount_begin(&dentry->d_seq);
write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
/* unhash both */
if (!d_unhashed(dentry))
___d_drop(dentry);
if (!d_unhashed(target))
___d_drop(target);
/* ... and switch them in the tree */
dentry->d_parent = target->d_parent;
if (!exchange) {
copy_name(dentry, target);
target->d_hash.pprev = NULL;
dentry->d_parent->d_lockref.count++;
if (dentry != old_parent) /* wasn't IS_ROOT */
WARN_ON(!--old_parent->d_lockref.count);
} else {
target->d_parent = old_parent;
swap_names(dentry, target);
list_move(&target->d_child, &target->d_parent->d_subdirs);
__d_rehash(target);
fsnotify_update_flags(target);
}
list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
__d_rehash(dentry);
fsnotify_update_flags(dentry);
fscrypt_handle_d_move(dentry);
write_seqcount_end(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
---
There are two parts of the dentry lookup, the fast path and the slow path.
Let's look at the slow path here.
Quote from https://lwn.net/Articles/685108/:
All directory operations are done with the inode mutex (i_mutex) held, which prevents anything else
from touching that directory. But the most common operation, lookup, is non-destructive, so there is
no real conceptual reason to stop it from happening in parallel.
The typical scenario could be
CPU0 CPU1 CPU2 CPU3 CPU4
T0 T1 T2 T3 T4
\ \ | / /
\ \ | / /
\ \ | / /
/var/log/
T0 T1 T2 T3 T4
If the dentries that T0 ~ T4 want happen to be not in memory, all of them have
to invoke lookup_slow.
If the lock here were a mutex, the performance would be very bad.
So a rw_semaphore was introduced to replace the mutex.
static struct dentry *lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
struct inode *inode = dir->d_inode;
struct dentry *res;
inode_lock_shared(inode);
-> down_read(&inode->i_rwsem);
res = __lookup_slow(name, dir, flags);
inode_unlock_shared(inode);
return res;
}
But there is a problem: the mutex currently protects the directory entry (dentry).
A lookup operation can cause dentries to be created, which can lead to races if two
dentries are created for the same name.
How is this handled ?
Look at d_alloc_parallel.
---
struct hlist_bl_head *b = in_lookup_hash(parent, hash);
// allocate a dentry structure here.
struct dentry *new = d_alloc(parent, name);
retry:
rcu_read_lock();
r_seq = read_seqbegin(&rename_lock);
// look up a dentry with (parent, name) in the dentry hash;
// someone else could be creating the same one concurrently.
dentry = __d_lookup_rcu(parent, name, &d_seq);
if (unlikely(dentry)) {
...
// anything changes on the dentry ?
if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
rcu_read_unlock();
dput(dentry);
goto retry;
}
rcu_read_unlock();
dput(new);
return dentry;
}
if (unlikely(read_seqretry(&rename_lock, r_seq))) {
rcu_read_unlock();
goto retry;
}
hlist_bl_lock(b);
// A spin lock on the hash bucket.
// Only one task can enter this critical section at a time, namely, only one
// of the concurrent lookups with the same (parent, name) pair can add its
// dentry to the in-lookup hash; the others have to wait. When they enter
// this critical section, a dentry with the same (parent, name) pair is
// already there.
// At that moment, there are 2 cases:
// - the dentry is still in lookup, indicating inode->i_op->lookup is ongoing;
//   we have to wait.
// - otherwise, the lookup has been completed and we can return this dentry
//   directly.
hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
if (dentry->d_name.hash != hash)
continue;
if (dentry->d_parent != parent)
continue;
if (!d_same_name(dentry, parent, name))
continue;
hlist_bl_unlock(b);
/* now we can try to grab a reference */
if (!lockref_get_not_dead(&dentry->d_lockref)) {
rcu_read_unlock();
goto retry;
}
rcu_read_unlock();
/*
* somebody is likely to be still doing lookup for it;
* wait for them to finish
*/
spin_lock(&dentry->d_lock);
d_wait_lookup(dentry);
if (unlikely(dentry->d_name.hash != hash))
goto mismatch;
if (unlikely(dentry->d_parent != parent))
goto mismatch;
if (unlikely(d_unhashed(dentry)))
goto mismatch;
if (unlikely(!d_same_name(dentry, parent, name)))
goto mismatch;
/* OK, it *is* a hashed match; return it */
spin_unlock(&dentry->d_lock);
dput(new);
return dentry;
}
rcu_read_unlock();
/* we can't take ->d_lock here; it's OK, though. */
new->d_flags |= DCACHE_PAR_LOOKUP; // dentry in-lookup is set here.
new->d_wait = wq;
hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
hlist_bl_unlock(b);
return new;
---
grow pagecache
pagecache_get_page
-> find_get_entry
-> __page_cache_alloc
-> add_to_page_cache_lru
-> __add_to_page_cache_locked
-> lru_cache_add
-> __lru_cache_add
---
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
^^^^^^^^^^^ [1]
__pagevec_lru_add(pvec);
^^^^^^^^^^^^^^^^^^^^^^^^ [2]
put_cpu_var(lru_add_pvec);
---
The interesting thing here is that the page is first added to a per-cpu
pagevec. If the per-cpu pagevec is full, __pagevec_lru_add drains all of its
pages into the lru list in one go.
__pagevec_lru_add
-> __pagevec_lru_add_fn
---
SetPageLRU(page);
smp_mb();
if (page_evictable(page)) {
lru = page_lru(page);
} else {
...
}
add_page_to_lru_list(page, lruvec, lru);
---
When draining pages into the lru, we need to select an lru list for each page.
This is done by page_lru:
static __always_inline enum lru_list page_lru(struct page *page)
{
enum lru_list lru;
if (PageUnevictable(page))
lru = LRU_UNEVICTABLE;
else {
lru = page_lru_base_type(page);
if (PageActive(page))
lru += LRU_ACTIVE;
}
return lru;
}
enum lru_list {
LRU_INACTIVE_ANON = LRU_BASE,
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
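Side note: from userspace, mincore() can tell whether a given file's pages are
currently in the page cache (i.e. sitting on these file LRU lists). A sketch
with minimal error handling; the path is only an example:
---
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/etc/hosts";	/* example file */
	int fd = open(path, O_RDONLY);
	struct stat st;
	long psz = sysconf(_SC_PAGESIZE);
	size_t pages, resident = 0, i;
	unsigned char *vec;
	void *addr;

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;
	pages = (st.st_size + psz - 1) / psz;
	vec = malloc(pages);
	addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (!vec || addr == MAP_FAILED)
		return 1;
	if (mincore(addr, st.st_size, vec) == 0) {
		for (i = 0; i < pages; i++)
			resident += vec[i] & 1;
		printf("%s: %zu of %zu pages in the page cache\n",
		       path, resident, pages);
	}
	munmap(addr, st.st_size);
	free(vec);
	close(fd);
	return 0;
}
---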
meminfo_proc_show
---
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
cached = global_node_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
if (cached < 0)
cached = 0;
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
show_val_kb(m, "MemAvailable: ", available);
show_val_kb(m, "Buffers: ", i.bufferram);
---
What needs to be noticed here is Buffers:
bufferram comes from nr_blockdev_pages
---
struct block_device *bdev;
long ret = 0;
spin_lock(&bdev_lock);
list_for_each_entry(bdev, &all_bdevs, bd_list) {
ret += bdev->bd_inode->i_mapping->nrpages;
}
spin_unlock(&bdev_lock);
return ret;
---
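A quick userspace check of that Buffers value (it is just the page cache of
the block device inodes, as shown above):
---
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "Buffers:", 8) ||
		    !strncmp(line, "Cached:", 7))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
---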
block_read_full_page() and __block_write_full_page() will both create
buffer_heads for the page.
create_page_buffers()
-> create_empty_buffers()
-> attach_page_buffers()
-> SetPagePrivate()
-> set_page_private()
The truth about the page lock and the bh lock.
In the process of read operations:
do_generic_file_read()
-> page_cache_sync_readahead() // if page is not present
-> ondemand_readahead()
-> ra_submit()
-> __do_page_cache_readahead()
-> read_pages()
-> mapping->a_ops->readpages()
ext4_mpage_readpages()
-> add_to_page_cache_lru()
-> __set_page_locked() // page is locked ------> Here
-> block_read_full_page() // if page has buffers
-> lock_buffer() // the buffer_head is locked -----> Here
-> mark_buffer_async_read()
//bh->b_end_io = end_buffer_async_read
-> submit_bh()
-> !PageUptodate() && !trylock_page() , go to page_not_up_to_date
-> lock_page_killable()
-> if PageUptodate(), unlock_page() and goto page_ok
Where are the page and buffer_head unlocked ?
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
struct page *page;
int page_uptodate = 1;
BUG_ON(!buffer_async_read(bh));
page = bh->b_page;
if (uptodate) {
set_buffer_uptodate(bh);
} else {
clear_buffer_uptodate(bh);
buffer_io_error(bh, ", async page read");
SetPageError(page);
}
/*
* Be _very_ careful from here on. Bad things can happen if
* two buffer heads end IO at almost the same time and both
* decide that the page is now completely done.
*/
first = page_buffers(page);
local_irq_save(flags);
bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
do {
if (!buffer_uptodate(tmp))
page_uptodate = 0;
if (buffer_async_read(tmp)) {
BUG_ON(!buffer_locked(tmp)); //This could prove that the buffer_head is locked during the read process
goto still_busy;
}
tmp = tmp->b_this_page;
} while (tmp != bh);
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
local_irq_restore(flags);
/*
* If none of the buffers had errors and they are all
* uptodate then we can set the page uptodate.
*/
if (page_uptodate && !PageError(page))
SetPageUptodate(page);
unlock_page(page);
return;
still_busy:
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
local_irq_restore(flags);
return;
}
If all the buffers in that page are uptodate, the page is set uptodate and
unlocked.
We can see that both the page and the buffer_head are locked during the read
process. The locks keep the page exclusive because the device needs to write
data into the page through DMA.
What about the write operations ?
The write process is divided into two parts.
1> write the user data into page cache
generic_perform_write()
-> a_ops->write_begin()
ext4_write_begin()
-> grab_cache_page_write_begin()
-> pagecache_get_page()
-> find_get_entry()
If get
-> lock_page() //page is locked
otherwise
-> add_to_page_cache_lru()
-> __set_page_locked() //page is locked
-> unlock_page()
-> ext4_journal_start() // About why does unlock_page() before the
ext4_journal_start(), please refer to the comment in ext4_write_begin()
-> lock_page() // the page is relocked
-> wait_for_stable_page()
-> iov_iter_copy_from_user_atomic()
-> a_ops->write_end()
-> block_write_end()
-> __block_commit_write()
-> set_buffer_uptodate()
-> mark_buffer_dirty()
-> SetPageUptodate() // if no partial
-> unlock_page() // page is unlocked
The page lock keeps the page exclusive from other operations while the user
data is being copied into it.
2> write back the dirty page
ext4_writepages()
-> blk_start_plug()
-> write_cache_pages() //Go here in data=journal mode because this mode
does not support delayed allocation. We use this branch to demonstrate the
page and bh locking because I couldn't easily find where lock_page is taken
in the other branch.
-> lock_page() -------> Here
-> wait_on_page_writeback() when PageWriteback() // keep the write back atomic
-> clear_page_dirty_for_io()
-> __writepage()
-> ext4_writepage()
-> ext4_bio_write_page()
-> set_page_writeback() //very important
-> set_buffer_async_write()
-> io_submit_add_bh()
-> ext4_io_submit()
-> io_submit_init_bio()
// set bi_end_io = ext4_end_bio()
-> unlock_page() ----> Here
-> blk_finish_plug()
ext4_end_bio()
-> ext4_finish_bio()
-> clear_buffer_async_write()
-> end_page_writeback() // !under_io
-> test_clear_page_writeback()
-> wake_up_page(page, PG_writeback);
The page is not locked while the data is written out, but the writeback flag
is set to ensure the atomicity of the operations on the page.