QUESTION:
If a bh has already been inserted into one transaction, can we write to it again?
A transaction can be in one of several states: running, committing, or logged (and a logged transaction may later be checkpointed).
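For reference, the jbd2 transaction states look roughly like this (a sketch of transaction_t->t_state from include/linux/jbd2.h in the 4.x era the code below is quoted from; the exact set of states and the per-state comments here are approximate):
---
/* transaction_t->t_state (include/linux/jbd2.h, approximate, 4.x era) */
enum {
	T_RUNNING,		/* accepting new handles */
	T_LOCKED,		/* locked down for commit, draining t_updates */
	T_FLUSH,		/* commit started, flushing data (ordered mode) */
	T_COMMIT,		/* writing metadata blocks into the log */
	T_COMMIT_DFLUSH,	/* waiting for data/log block IO to finish */
	T_COMMIT_JFLUSH,	/* log writes done, writing the commit record */
	T_COMMIT_CALLBACK,	/* running commit callbacks */
	T_FINISHED		/* committed and logged; may be checkpointed later */
};
---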
ext4_journal_get_write_access
-> __ext4_journal_get_write_access
-> jbd2_journal_get_write_access
-> do_get_write_access
do_get_write_access
---
//the transaction that this handle belongs to
transaction_t *transaction = handle->h_transaction;
...
jbd_lock_bh_state(bh);
...
/*
* The buffer is already part of this transaction if b_transaction or
* b_next_transaction points to it
* i.e. we can modify the same bh repeatedly within one transaction
*/
if (jh->b_transaction == transaction ||
jh->b_next_transaction == transaction)
goto done;
jh->b_modified = 0;
/*
* If the buffer is not journaled right now, we need to make sure it
* doesn't get written to disk before the caller actually commits the
* new data
*/
if (!jh->b_transaction) {
smp_wmb();
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
goto done;
}
/*
* There is one case we have to be very careful about. If the
* committing transaction is currently writing this buffer out to disk
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* and has NOT made a copy-out, then we cannot modify the buffer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* contents at all right now. The essence of copy-out is that it is
^^^^^^^^^^^^^^^^^^^^^^^^^
* the extra copy, not the primary copy, which gets journaled. If the
* primary copy is already going to disk then we cannot do copy-out
* here.
*/
if (buffer_shadow(bh)) {
jbd_unlock_bh_state(bh);
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
goto repeat;
}
...
if (jh->b_jlist == BJ_Metadata || force_copy) {
if (!frozen_buffer) {
jbd_unlock_bh_state(bh);
frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS | __GFP_NOFAIL);
// we unlocked the bh before doing the allocation, so we have to
// recheck all of the state
goto repeat;
}
jh->b_frozen_data = frozen_buffer;
frozen_buffer = NULL;
jbd2_freeze_jh_data(jh);
}
attach_next:
smp_wmb();
jh->b_next_transaction = transaction;
done:
jbd_unlock_bh_state(bh);
---
What is the handle here?
A handle is a single operation whose metadata updates must be applied atomically.
Multiple handles are batched and committed in one transaction.
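A minimal sketch of a handle's lifecycle, the way an ext4 caller typically uses it (these are the real ext4 wrappers around jbd2, with the pre-5.10 signatures matching the code quoted here; error handling trimmed):
---
/* Sketch: one atomic operation == one handle. */
static int example_update_one_block(struct inode *inode, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* reserve 1 credit: we are going to dirty a single metadata buffer */
	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* joins bh to the running transaction (do_get_write_access above) */
	err = ext4_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data here ... */
		err = ext4_handle_dirty_metadata(handle, inode, bh);
	}

	/* drops t_updates; once every handle has stopped, commit can proceed */
	ext4_journal_stop(handle);
	return err;
}
---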
Critical point of a handle:
Before starting a handle, we must guarantee there is enough space (credits) in the
current transaction to carry this new handle; the number of credits needed is supplied by the caller.
jbd2__journal_start
-> start_this_handle
-> add_transaction_credits
---
needed = atomic_add_return(total, &t->t_outstanding_credits);
if (needed > journal->j_max_transaction_buffers) {
/*
* If the current transaction is already too large,
^^^^^^^^^^^^^^^^^^^
* then start to commit it: we can then go back and
* attach this handle to a new transaction.
*/
atomic_sub(total, &t->t_outstanding_credits);
wait_transaction_locked(journal);
return 1;
}
/*
* The commit code assumes that it can get enough log space
* without forcing a checkpoint. This is *critical* for
* correctness: a checkpoint of a buffer which is also
* associated with a committing transaction creates a deadlock,
* so commit simply cannot force through checkpoints.
*/
if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
__jbd2_log_wait_for_space(journal);
write_unlock(&journal->j_state_lock);
return 1;
}
---
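If a running handle turns out to be under-reserved, the caller can try to extend its credits and fall back to restarting on a new transaction. A sketch of that pattern, assuming the 4.x-era APIs jbd2_journal_extend(handle, nblocks) and jbd2_journal_restart(handle, nblocks) and the old handle->h_buffer_credits field (newer kernels renamed these):
---
/* Sketch: make sure 'handle' has at least 'needed' buffer credits left. */
static int ensure_credits(handle_t *handle, int needed)
{
	int err;

	if (handle->h_buffer_credits >= needed)
		return 0;

	/* try to take the extra credits from the running transaction */
	err = jbd2_journal_extend(handle, needed - handle->h_buffer_credits);
	if (err <= 0)
		return err;	/* 0: extended, < 0: error */

	/*
	 * err > 0: the transaction is too full to extend. Restart the handle:
	 * this logically stops it (so the current transaction can commit) and
	 * re-attaches it to a new transaction with 'needed' credits.
	 */
	return jbd2_journal_restart(handle, needed);
}
---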
A transaction can only be committed when all of its handles have been stopped.
start_this_handle
---
//under read lock of journal->j_state_lock
atomic_inc(&transaction->t_updates);
---
jbd2_journal_commit_transaction
---
//under write lock of journal->j_state_lock
while (atomic_read(&commit_transaction->t_updates)) {
DEFINE_WAIT(wait);
prepare_to_wait(&journal->j_wait_updates, &wait,
TASK_UNINTERRUPTIBLE);
if (atomic_read(&commit_transaction->t_updates)) {
spin_unlock(&commit_transaction->t_handle_lock);
write_unlock(&journal->j_state_lock);
schedule();
write_lock(&journal->j_state_lock);
spin_lock(&commit_transaction->t_handle_lock);
}
finish_wait(&journal->j_wait_updates, &wait);
}
---
jbd2_journal_stop
---
if (atomic_dec_and_test(&transaction->t_updates)) {
wake_up(&journal->j_wait_updates);
if (journal->j_barrier_count)
wake_up(&journal->j_wait_transaction_locked);
}
---
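The journal->j_barrier_count check above exists for jbd2_journal_lock_updates(), which lets callers such as filesystem freeze stall all new handles until the journal is quiescent. A simplified usage sketch (real jbd2 APIs; their behaviour is summarized in the comments):
---
/* Sketch: quiesce the journal (ext4_freeze() does this internally). */
static void quiesce_example(journal_t *journal)
{
	/*
	 * Bumps j_barrier_count and waits on j_wait_updates until every
	 * running handle has called jbd2_journal_stop() (t_updates == 0).
	 * start_this_handle() stalls new handles while j_barrier_count != 0.
	 */
	jbd2_journal_lock_updates(journal);

	/* ... no handle is running and none can start here ... */

	/* drops j_barrier_count and wakes j_wait_transaction_locked waiters */
	jbd2_journal_unlock_updates(journal);
}
---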
ordered mode
jbd2_journal_commit_transaction
---
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
jbd_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
err = journal_submit_data_buffers(journal, commit_transaction);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...
while (commit_transaction->t_buffers) {
/* Find the next buffer to be journaled... */
jh = commit_transaction->t_buffers;
...
/* Make sure we have a descriptor block in which to
record the metadata buffer. */
if (!descriptor) {
descriptor = jbd2_journal_get_descriptor_buffer(
commit_transaction,
JBD2_DESCRIPTOR_BLOCK);
...
tagp = &descriptor->b_data[sizeof(journal_header_t)];
space_left = descriptor->b_size -
sizeof(journal_header_t);
first_tag = 1;
set_buffer_jwrite(descriptor);
set_buffer_dirty(descriptor);
wbuf[bufs++] = descriptor;
jbd2_file_log_bh(&log_bufs, descriptor);
}
/* Where is the buffer to be written? */
err = jbd2_journal_next_log_block(journal, &blocknr);
...
set_bit(BH_JWrite, &jh2bh(jh)->b_state);
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &wbuf[bufs], blocknr);
jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
tag_flag = 0;
if (flags & 1)
tag_flag |= JBD2_FLAG_ESCAPE;
if (!first_tag)
tag_flag |= JBD2_FLAG_SAME_UUID;
tag = (journal_block_tag_t *) tagp;
write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be16(tag_flag);
jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
commit_transaction->t_tid);
tagp += tag_bytes;
space_left -= tag_bytes;
bufs++;
if (first_tag) {
memcpy (tagp, journal->j_uuid, 16);
tagp += 16;
space_left -= 16;
first_tag = 0;
}
/* If there's no more to do, or if the descriptor is full,
let the IO rip! */
if (bufs == journal->j_wbufsize ||
commit_transaction->t_buffers == NULL ||
space_left < tag_bytes + 16 + csum_size) {
jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
/* Write an end-of-descriptor marker before
submitting the IOs. "tag" still points to
the last tag we set up. */
tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
if (descriptor)
jbd2_descriptor_block_csum_set(journal,
descriptor);
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
/*
* Compute checksum.
*/
if (jbd2_has_feature_checksum(journal)) {
crc32_sum =
jbd2_checksum_data(crc32_sum, bh);
}
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
}
cond_resched();
stats.run.rs_blocks_logged += bufs;
/* Force a new descriptor to be generated next
time round the loop. */
descriptor = NULL;
bufs = 0;
}
}
err = journal_finish_inode_data_buffers(journal, commit_transaction);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
---
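For reference, the descriptor block that the loop above fills has roughly this on-disk layout (sketched from include/linux/jbd2.h; the pre-csum3 tag format is shown, the csum3 format differs):
---
/*
 * Rough layout of one descriptor block and the log blocks that follow it:
 *
 *   journal_header_t    { h_magic, h_blocktype = DESCRIPTOR, h_sequence = tid }
 *   journal_block_tag_t tag0      (final fs block number of wbuf[0])
 *   16-byte journal UUID          (only after the first tag)
 *   journal_block_tag_t tag1 ...  (JBD2_FLAG_SAME_UUID)
 *   journal_block_tag_t tagN      (JBD2_FLAG_LAST_TAG)
 *   [descriptor checksum tail when the csum feature is on]
 *
 * followed in the log by the metadata block images wbuf[0..N] themselves.
 */
typedef struct journal_header_s {
	__be32 h_magic;		/* JBD2_MAGIC_NUMBER */
	__be32 h_blocktype;	/* JBD2_DESCRIPTOR_BLOCK, JBD2_COMMIT_BLOCK, ... */
	__be32 h_sequence;	/* tid of the committing transaction */
} journal_header_t;

typedef struct journal_block_tag_s {	/* pre-csum3 tag format */
	__be32 t_blocknr;	/* where the block lives in the filesystem */
	__be16 t_checksum;
	__be16 t_flags;		/* ESCAPE / SAME_UUID / DELETED / LAST_TAG */
	__be32 t_blocknr_high;	/* only used with the 64bit feature */
} journal_block_tag_t;
---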
journal_submit_data_buffers // loops over commit_transaction->t_inode_list
-> journal_submit_inode_data_buffers
---
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = mapping->nrpages * 2,
.range_start = 0,
.range_end = i_size_read(mapping->host),
};
ret = generic_writepages(mapping, &wbc);
---
journal_finish_inode_data_buffers // loops over commit_transaction->t_inode_list
-> filemap_fdatawait_keep_errors
-> __filemap_fdatawait_range
-> wait_on_page_writeback
---
The dirtied inodes are added to the transaction's ordered list (t_inode_list) in ext4_map_blocks:
---
out_sem:
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
/*
* Inodes with freshly allocated blocks where contents will be
* visible after transaction commit must be on transaction's
* ordered data list.
*/
if (map->m_flags & EXT4_MAP_NEW &&
!(map->m_flags & EXT4_MAP_UNWRITTEN) &&
!(flags & EXT4_GET_BLOCKS_ZERO) &&
!ext4_is_quota_file(inode) &&
ext4_should_order_data(inode)) {
^^^^^^^^^^^^^^^^^^^^^^^
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
ret = ext4_jbd2_inode_add_wait(handle, inode);
else
ret = ext4_jbd2_inode_add_write(handle, inode);
^^^^^^^^^^^^^^^^^^^^^^^^^^
if (ret)
return ret;
}
}
This is to guarantee that we don't expose stale data in data=ordered mode.
                                   ^^^^^^^^^^^^^^^^^
Please refer to the message of the following commit:
From 06bd3c36a733ac27962fea7d6f47168841376824 Mon Sep 17 00:00:00 2001
From: Jan Kara
XFS doesn't have such a feature: its developers hold that POSIX doesn't
guarantee the data has reached the disk when the write syscall returns, unless
the application calls fdatasync.
But XFS does have a feature that only updates the on-disk file size after the data has hit the disk.
xfs_end_ioend
---
done:
if (ioend->io_append_trans)
error = xfs_setfilesize_ioend(ioend, error);
---
Now let's move on to block allocation (mballoc), starting with ext4_mb_regular_allocator.
ext4_mb_regular_allocator
---
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
/*
* searching for the right group start
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
cond_resched();
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on first loop.
*/
if (group >= ngroups)
group = 0;
/* This now checks without needing the buddy page */
ret = ext4_mb_good_group(ac, group, cr);
if (ret <= 0) {
if (!first_err)
first_err = ret;
continue;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
ext4_lock_group(sb, group);
^^^^^^^^^^^^^^^^^^^^^^^^^
// This should be the critical point of contention when allocating space.
// It is a per-group lock.
// So if we could spread the allocation requests across multiple
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// groups, the allocations could proceed in parallel.
   ^^^^^^
/*
* We need to check again after locking the
* block group
*/
ret = ext4_mb_good_group(ac, group, cr);
if (ret <= 0) {
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
if (!first_err)
first_err = ret;
continue;
}
ac->ac_groups_scanned++;
if (cr == 0)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 && sbi->s_stripe &&
!(ac->ac_g_ex.fe_len % sbi->s_stripe))
ext4_mb_scan_aligned(ac, &e4b);
else
ext4_mb_complex_scan_group(ac, &e4b);
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
}
}
---
How are allocation requests spread across multiple groups?
ext4_mb_regular_allocator would first try the group specified by
ac->ac_g_ex.fe_group;
Where does it come from?
One of the most important places that sets it is
ext4_mb_initialize_context
---
/* start searching from the goal */
goal = ar->goal;
if (goal < le32_to_cpu(es->s_first_data_block) ||
goal >= ext4_blocks_count(es))
goal = le32_to_cpu(es->s_first_data_block);
ext4_get_group_no_and_offset(sb, goal, &group, &block);
^^^^^^
/* set up allocation goals */
ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
ac->ac_o_ex.fe_group = group;
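// (a few lines further down, the same function does ac->ac_g_ex = ac->ac_o_ex;
//  so this group also becomes ac->ac_g_ex.fe_group, the goal group used above)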
---
The ar->goal could come from
ext4_ext_find_goal
---
// if path is not NULL, we use the existing extent to get the goal.
if (path) {
int depth = path->p_depth;
struct ext4_extent *ex;
ex = path[depth].p_ext;
if (ex) {
ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
if (block > ext_block)
return ext_pblk + (block - ext_block);
else
return ext_pblk - (ext_block - block);
}
/* it looks like index is empty;
* try to find starting block from index itself */
if (path[depth].p_bh)
return path[depth].p_bh->b_blocknr;
}
/* OK. use inode's group */
return ext4_inode_to_goal_block(inode);
---
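A quick worked example of the extent-based goal (illustrative numbers): if the nearest extent ex maps logical block 100 to physical block 5000 and we are now mapping logical block 108, the goal becomes 5000 + (108 - 100) = 5008, i.e. exactly where the new block would land if the extent simply kept growing.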
ext4_inode_to_goal_block
---
block_group = ei->i_block_group;
if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
/*
* If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
* block groups per flexgroup, reserve the first block
* group for directories and special files. Regular
* files will start at the second block group. This
* tends to speed up directory access and improves
* fsck times.
*/
block_group &= ~(flex_size-1);
if (S_ISREG(inode->i_mode))
block_group++;
}
bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
/*
* If we are doing delayed allocation, we don't need take
* colour into account.
*/
if (test_opt(inode->i_sb, DELALLOC))
return bg_start;
// The colour is to avoid fragmentation when multiple threads write files
// that are placed in the same group.
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
else
colour = (current->pid % 16) * ((last_block - bg_start) / 16);
return bg_start + colour;
---
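For example (illustrative numbers), with the default 32768 blocks per group, a writer with pid % 16 == 5 gets colour = 5 * (32768 / 16) = 10240 and starts searching 10240 blocks into its goal group, while a writer with pid % 16 == 6 starts at 12288; concurrent writers therefore tend to grow their files in different regions of the group instead of interleaving their blocks.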
So a file's blocks follow its inode's block group. How is the block group selected for an inode?
__ext4_new_inode
---
if (S_ISDIR(mode))
ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
else
ret2 = find_group_other(sb, dir, &group, mode);
---
We won't go into the code here, but just quote the comments.
find_group_orlov
Orlov's allocator for directories.
We always try to spread first-level directories.
If there are blockgroups with both free inodes and free blocks counts
not worse than average we return one with smallest directory count.
Otherwise we simply return a random group.
For the rest rules look so:
It's OK to put directory into a group unless
it has too many directories already (max_dirs) or
it has too few free inodes left (min_inodes) or
it has too few free blocks left (min_blocks) or
Parent's group is preferred, if it doesn't satisfy these
conditions we search cyclically through the rest. If none
of the groups look good we just look for a group with more
free inodes than average (starting at parent's group).
find_group_other
Try to place the inode in the same flex group as its
parent. If we can't find space, use the Orlov algorithm to
find another flex group, and store that information in the
parent directory's inode information so that we can use that
flex group for future allocations.
Besides this, ext4 also has preallocation, both per-inode and per-locality-group (the latter is used for small allocations).
ext4_mb_new_blocks
---
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
if (*errp)
goto discard_and_exit;
/* as we've just preallocated more space than
* user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
*errp = ext4_mb_new_preallocation(ac);
...
}
---
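To make the preallocation idea concrete, here is a toy model (not kernel code: toy_prealloc stands in for struct ext4_prealloc_space, and find_free_chunk is a hypothetical stand-in for ext4_mb_regular_allocator): the request is first served from a remembered preallocation if possible; otherwise a normalized (larger) chunk is allocated and the unused tail is kept as the next preallocation.
---
/* Toy model of inode preallocation; not kernel code. */
struct toy_prealloc {
	unsigned long pa_pstart;   /* first unused preallocated block */
	unsigned long pa_len;      /* unused preallocated blocks left */
};

/* hypothetical stand-in for ext4_mb_regular_allocator() */
static unsigned long find_free_chunk(unsigned long len)
{
	(void)len;
	return 100000;             /* pretend this chunk is free */
}

static unsigned long toy_alloc(struct toy_prealloc *pa,
			       unsigned long want,        /* ac_o_ex.fe_len */
			       unsigned long normalized)  /* ac_g_ex.fe_len */
{
	unsigned long start;

	/* ext4_mb_use_preallocated(): serve from the remembered surplus */
	if (pa->pa_len >= want) {
		start = pa->pa_pstart;
		pa->pa_pstart += want;
		pa->pa_len -= want;
		return start;
	}

	/* otherwise allocate the normalized (larger) request in one go ... */
	start = find_free_chunk(normalized);

	/* ... and ext4_mb_new_preallocation(): keep the surplus for later */
	pa->pa_pstart = start + want;
	pa->pa_len = normalized - want;
	return start;
}
---
ext4 keeps such preallocations both per inode (for streaming writes to a single file) and per locality group (per-CPU, for small files), and ext4_mb_use_preallocated() above checks both.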