Concepts
infrastructure
buffer head
jbd2
Inode
Dcache
Quota
ext4
Tmpfs
xfs Log
xfs Inode
xfs buf
xfs bmap
The link count of a file is the total number of links the file has, which is
nothing but the number of hard links to it. This count, however, does not
include soft links.
Note: A soft link is not part of the link count because the soft link has its
own inode, different from the original file's.
When a directory is created, besides the link count taken by its dentry in the
parent directory, two extra link counts are added (see the sketch below),
(1) one on the parent directory for the ".." dentry in the child directory
(2) one on the child directory for the "." dentry in the child directory
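As a quick illustration (a minimal userspace sketch; the directory names are made up),
stat() shows exactly this: an empty directory has st_nlink == 2 (its dentry in the
parent plus its own "."), and each new subdirectory adds one more for its "..".
---
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
        struct stat st;

        mkdir("dir", 0755);
        stat("dir", &st);
        /* 2: the "dir" dentry in the parent + "." inside "dir" */
        printf("empty dir nlink: %lu\n", (unsigned long)st.st_nlink);

        mkdir("dir/sub", 0755);
        stat("dir", &st);
        /* 3: the ".." inside "dir/sub" adds one more */
        printf("with one subdir: %lu\n", (unsigned long)st.st_nlink);
        return 0;
}
---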
Quote from https://pubs.opengroup.org/onlinepubs/9699919799/functions/unlink.html#tag_16_635
When the file's link count becomes 0 and no process has the file open, the space occupied by
the file shall be freed and the file shall no longer be accessible. If one or more processes
have the file open when the last link is removed, the link shall be removed before unlink()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
returns, but the removal of the file contents shall be postponed until all references to the
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
file are closed.
^^^^^^^^^^^^^^
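A minimal userspace sketch of the behaviour underlined above (the file name is made up):
after the last link is removed the file is no longer reachable by name, but the data stays
readable through the still-open fd, and the space is only freed on close().
---
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char buf[16] = { 0 };
        int fd = open("tmpfile", O_RDWR | O_CREAT | O_TRUNC, 0644);

        write(fd, "hello", 5);
        unlink("tmpfile");              /* link count drops to 0, but the file is still open */

        lseek(fd, 0, SEEK_SET);
        read(fd, buf, sizeof(buf) - 1); /* the contents are still there */
        printf("read back: %s\n", buf);

        close(fd);                      /* only now is the space actually freed */
        return 0;
}
---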
How does xfs implement this ?
xfs_droplink()
---
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
// drop the link count in the inode
drop_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (VFS_I(ip)->i_nlink)
return 0;
return xfs_iunlink(tp, ip);
---
// link this inode on the list on agi->agi_unlinked[]
xfs_iunlink()
-> xfs_iunlink_update_bucket()
---
agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
offset = offsetof(struct xfs_agi, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket_index);
xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
---
xfs_fs_destroy_inode()
-> xfs_inactive()
---
if (VFS_I(ip)->i_nlink != 0) {
...
return;
}
if (S_ISREG(VFS_I(ip)->i_mode) &&
(ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
...
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
else if (truncate)
error = xfs_inactive_truncate(ip);
if (error)
return;
...
/*
* Free the inode.
*/
error = xfs_inactive_ifree(ip);
-> xfs_ifree()
-> xfs_iunlink_remove()
---
This is called during recovery to process any inodes which we unlinked but
not freed when the system crashed. These inodes will be on the lists in the
AGI blocks. What we do here is scan all the AGIs and fully truncate and free
any inodes found on the lists. Each inode is removed from the lists when it
has been fully truncated and is freed. The freeing of the inode and its
removal from the list must be atomic.
xlog_recover_process_iunlinks()
-> xlog_recover_process_one_iunlink()
---
ino = XFS_AGINO_TO_INO(mp, agno, agino);
error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
...
error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0);
...
agino = be32_to_cpu(dip->di_next_unlinked);
xfs_buf_relse(ibp);
xfs_irele(ip);
-> iput(VFS_I(ip));
//The reference count drops to zero and triggers xfs_fs_destroy_inode()
---
Actually, both xfs and ext4/jbd2 employ an asynchronous journal, which can
batch journal IOs to improve performance. An asynchronous journal may lose
data (an inode could also be lost) but still keeps the metadata consistent.
Let's look at how xfs and ext4/jbd2 implement the asynchronous journal.
__xfs_trans_commit()
-> xfs_log_commit_cil()
-> xlog_cil_push_background()
---
// min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
// ^^^^^^^^^^^^^^^^^^^^^^^^
// Max of the xlog buffer
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
up_read(&cil->xc_ctx_lock);
return;
}
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
up_read(&cil->xc_ctx_lock);
/*
* If we are well over the space limit, throttle the work that is being
* done until the push work on this context has begun.
*/
if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
return;
}
spin_unlock(&cil->xc_push_lock);
---
When does jbd2 start a commit?
(1) log space is not enough
start_this_handle()
-> add_transaction_credits()
---
needed = atomic_add_return(total, &t->t_outstanding_credits);
if (needed > journal->j_max_transaction_buffers) {
/*
* If the current transaction is already too large,
* then start to commit it: we can then go back and
* attach this handle to a new transaction.
*/
atomic_sub(total, &t->t_outstanding_credits);
...
wait_transaction_locked(journal);
return 1;
}
---
(2) transaction is too old
jbd2_journal_stop()
---
/*
* If the handle is marked SYNC, we need to set another commit
* going! We also want to force a commit if the transaction is too
* old now.
*/
if (handle->h_sync ||
time_after_eq(jiffies, transaction->t_expires)) {
/* This is non-blocking */
jbd2_log_start_commit(journal, tid);
}
---
(3) commit timeouts
jbd2_get_transaction()
---
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
---
commit_timeout()
---
journal_t *journal = from_timer(journal, t, j_commit_timer);
wake_up_process(journal->j_task);
---
In contrast with an asynchronous journal, a synchronous journal can guarantee that
the metadata is on disk when the syscall returns, but this can hurt performance.
Batching synchronous journal IO is meant to improve this, and jbd2 supports it.
jbd2_journal_stop()
---
/*
* Implement synchronous transaction batching.If the handle
* was synchronous, don't force a commit immediately. Let's
* yield and let another thread piggyback onto this
* transaction. Keep doing that while new threads continue to
* arrive. It doesn't cost much - we're about to run a committed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* and sleep on IO anyway. Speeds up many-threaded, many-dir
^^^^^^^^^^^^^^^^^^^^^^
The io here means the journal IO if we commit immediately.
* operations by 30x or more...
*
* We try and optimize the sleep time against what the
* underlying disk can do, instead of having a static sleep
* time. This is useful for the case where our storage is so
* fast that it is more optimal to go ahead and force a flush
* and wait for the transaction to be committed than it is to
* wait for an arbitrary amount of time for new writers to
* join the transaction. We achieve this by measuring how
* long it takes to commit a transaction, and compare it with
* how long this transaction has been running, and if run time
* < commit time then we sleep for the delta and commit. This
* greatly helps super fast disks that would see slowdowns as
* more threads started doing fsyncs.
*
* But don't do this if this process was the most recent one '
* to perform a synchronous write. We do this to detect the
* case where a single process is doing a stream of sync
* writes. No point in waiting for joiners in that case.
*
* Setting max_batch_time to 0 disables this completely.
*/
pid = current->pid;
if (handle->h_sync && journal->j_last_sync_writer != pid &&
journal->j_max_batch_time) {
u64 commit_time, trans_time;
journal->j_last_sync_writer = pid;
read_lock(&journal->j_state_lock);
commit_time = journal->j_average_commit_time;
read_unlock(&journal->j_state_lock);
trans_time = ktime_to_ns(ktime_sub(ktime_get(),
transaction->t_start_time));
commit_time = max_t(u64, commit_time,
1000*journal->j_min_batch_time);
commit_time = min_t(u64, commit_time,
1000*journal->j_max_batch_time);
if (trans_time < commit_time) {
ktime_t expires = ktime_add_ns(ktime_get(),
commit_time);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
}
}
if (handle->h_sync)
transaction->t_synchronous_commit = 1;
if (handle->h_sync ||
time_after_eq(jiffies, transaction->t_expires)) {
jbd2_log_start_commit(journal, tid);
/*
* Special case: JBD2_SYNC synchronous updates require us
* to wait for the commit to complete.
*/
if (handle->h_sync && !(current->flags & PF_MEMALLOC))
wait_for_commit = 1;
}
stop_this_handle(handle);
if (wait_for_commit)
err = jbd2_log_wait_commit(journal, tid);
---
When a process performs an operation on a file, the Linux kernel performs
the checks in the following order:
(1) User dictated access control.
This includes both classic UNIX style permission checks and POSIX Access Control Lists (ACL).
Classic UNIX checks compare the current process UID and GID against the UID and GID of the
file being accessed, with regard to which modes have been set (Read/Write/eXecute).
Access Control Lists extend the classic UNIX checks to allow more options regarding permission control.
(2) Policy based access control.
This is implemented using Linux Security Modules (LSM), which are not real modules anymore (they
used to be, but that was dropped). They enable additional checks based on models other than the
classical UNIX style security checks. All of those models are based on a policy describing what
kind of operations are allowed for which process in which context.
The classical UNIX style permission check is based on the following two pieces of information,
# ll test.c
-rw-r--r-- 1 will will 71 Mar 25 09:37 test.c
# id will
uid=1000(will) gid=1000(will) groups=1000(will)
The check is done in two steps:
(1) Based on the accessing process's uid/gid, decide who you are: the owner, a member
of the same group, or other.
(2) Based on the role decided in step one, check the permission bits, which are composed
of the following components,
#define S_IRUSR 00400
#define S_IWUSR 00200
#define S_IXUSR 00100
#define S_IRGRP 00040
#define S_IWGRP 00020
#define S_IXGRP 00010
#define S_IROTH 00004
#define S_IWOTH 00002
#define S_IXOTH 00001
Note !!! They are Octal !!!
The kernel code of the permission check is as follows,
do_last()
-> may_open()
-> inode_permission()
-> sb_permission()
---
/* Nobody gets write access to a read-only fs. */
if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
return -EROFS;
---
-> do_inode_permission()
-> generic_permission()
-> acl_permission_check()
acl_permission_check()
---
if (likely(uid_eq(current_fsuid(), inode->i_uid)))
mode >>= 6;
else {
if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
int error = check_acl(inode, mask);
if (error != -EAGAIN)
return error;
}
if (in_group_p(inode->i_gid))
mode >>= 3;
}
if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
return 0;
return -EACCES;
---
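To make the mode >>= 6 / mode >>= 3 trick above concrete, here is a rough userspace
sketch (not kernel code; the helper name may_access() is made up, ACLs and supplementary
groups are ignored) that applies the same logic to a stat() result. Note that
MAY_READ/MAY_WRITE/MAY_EXEC have the same values as the "other" bits S_IROTH/S_IWOTH/S_IXOTH.
---
#include <stdio.h>
#include <unistd.h>
#include <sys/stat.h>

/* rough userspace equivalent of acl_permission_check() */
static int may_access(const struct stat *st, mode_t mask)
{
        mode_t mode = st->st_mode;

        if (geteuid() == st->st_uid)
                mode >>= 6;             /* use the "user" rwx bits */
        else if (getegid() == st->st_gid)
                mode >>= 3;             /* use the "group" rwx bits */
                                        /* otherwise: the "other" rwx bits */
        return (mask & ~mode & (S_IROTH | S_IWOTH | S_IXOTH)) == 0;
}

int main(void)
{
        struct stat st;

        if (stat("test.c", &st) == 0)
                printf("read: %d write: %d exec: %d\n",
                       may_access(&st, S_IROTH),
                       may_access(&st, S_IWOTH),
                       may_access(&st, S_IXOTH));
        return 0;
}
---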
When we read /proc/PID/io, we get something like the following,
rchar: 2814484553
wchar: 2326047
syscr: 689899
syscw: 7616
read_bytes: 507052032
write_bytes: 1105920
cancelled_write_bytes: 393216
What do these fields stand for ? Let's look at the code.
proc_tid_io_accounting/proc_tgid_io_accounting()
-> do_io_accounting()
---
struct task_io_accounting acct = task->ioac;
...
if (whole && lock_task_sighand(task, &flags)) {
struct task_struct *t = task;
task_io_accounting_add(&acct, &task->signal->ioac);
while_each_thread(task, t)
task_io_accounting_add(&acct, &t->ioac);
unlock_task_sighand(task, &flags);
}
seq_printf(m,
"rchar: %llu\n"
"wchar: %llu\n"
"syscr: %llu\n"
"syscw: %llu\n"
"read_bytes: %llu\n"
"write_bytes: %llu\n"
"cancelled_write_bytes: %llu\n",
(unsigned long long)acct.rchar,
(unsigned long long)acct.wchar,
(unsigned long long)acct.syscr,
(unsigned long long)acct.syscw,
(unsigned long long)acct.read_bytes,
(unsigned long long)acct.write_bytes,
(unsigned long long)acct.cancelled_write_bytes);
---
Let's look into the fields one by one.
add_rchar() <- vfs_read()
<- do_readv()
<- do_preadv()
<- compat_readv()
<- do_sendfile()
<- vfs_copy_file_range() >
rchar records the bytes read through read/readv/preadv, etc.;
the data can come not only from a regular file but also from a socket
add_wchar() <- vfs_write()
<- do_writev()
<- do_pwritev()
<- compat_writev()
<- do_sendfile()
<- vfs_copy_file_range() >
wchar is almost the same as rchar, but for writes
inc_syscr/syscw() are hooked along with add_rchar/wchar()
task_io_account_read() <- submit_bio()
<- nfs_file_direct_read()
<- read_cache_pages() <-nfs_readpages()
<-fuse_readpages() >
Reads are always __synchronous__, no matter whether buffered or direct IO, so submit_bio() is always involved,
except for special filesystems, such as network filesystems.
task_io_account_write() <- __blkdev_direct_IO_simple()
<- __blkdev_direct_IO()
<- submit_page_section() <- dio_zero_block()
<- do_direct_IO()
<- iomap_dio_bio_actor()
<- nfs_file_direct_write()
<- account_page_dirtied() <- __set_page_dirty()
<- __set_page_dirty_nobuffers() >
Buffered write is asynchronous; the final writeback is done by the writeback workers, so we cannot
account it in submit_bio(). Instead, it is accounted when the page is dirtied (account_page_dirtied()).
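A small userspace sketch to observe this (the file name and sizes are arbitrary): the
buffered write below bumps wchar/syscw right away, the dirtied pages are charged to
write_bytes when they are dirtied, and truncating them before writeback should show up
in cancelled_write_bytes.
---
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

static void dump_io(const char *when)
{
        char buf[512];
        int fd = open("/proc/self/io", O_RDONLY);
        ssize_t n = read(fd, buf, sizeof(buf) - 1);

        if (n > 0) {
                buf[n] = '\0';
                printf("== %s ==\n%s", when, buf);
        }
        close(fd);
}

int main(void)
{
        char data[65536];
        int fd = open("io_demo.dat", O_RDWR | O_CREAT | O_TRUNC, 0644);

        memset(data, 'x', sizeof(data));
        dump_io("before");
        write(fd, data, sizeof(data));  /* wchar/syscw and (via dirtying) write_bytes grow */
        dump_io("after buffered write");
        ftruncate(fd, 0);               /* dropping dirty pages shows up in cancelled_write_bytes */
        dump_io("after truncate");
        close(fd);
        return 0;
}
---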
What's iov_iter ?
struct iov_iter {
unsigned int type; Type, IOVEC/KVEC/BVEC
size_t iov_offset; current offset, like bio.bi_iter.bi_sector
size_t count; residual count like bio.bi_iter.bi_size
union {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
...
};
See the bio_vec; basically, iov and kvec have similar functions.
The bio.bi_io_vec always points to the head of the bvec array.
Here, iov/kvec/bvec always point to the current vector, like bio.bi_io_vec[bio->bi_iter.bi_idx]
...
};
Just refer to iterate_and_advance() to know how the iov_iter works.
Even though we compare the bio with the iov_iter, they are not the same thing:
a bio maps between the block device and buffers in memory,
while an iov_iter only describes the buffers, which makes it more similar to an sglist.
Next, we will figure out what IOVEC/KVEC/BVEC mean and who uses them.
The modules that receive an iov_iter can use the interfaces shown later to handle it.
struct iovec
{
void __user *iov_base;
__kernel_size_t iov_len;
};
There are two cases here,
(1) the iov_iter is constructed in the kernel, but the addresses come from userland
vfs_read()
-> new_sync_read() //read_iter
---
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
// We have said that iov_iter is just buffer, the io is described
// by kiocb.ki_filp and kiocb.ki_pos
kiocb.ki_pos = (ppos ? *ppos : 0);
iov_iter_init(&iter, READ, &iov, 1, len);
ret = call_read_iter(filp, &kiocb, &iter);
---
(2) iov array is from userland
static ssize_t vfs_readv(
struct file *file,
const struct iovec __user *vec,
unsigned long vlen,
loff_t *pos,
rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t ret;
ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
// __import_iovec()->iovec_from_user() would copy the iovec array from userland
if (ret >= 0) {
ret = do_iter_read(file, &iter, pos, flags);
kfree(iov);
}
return ret;
}
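For reference, a minimal userland sketch of what case (2) looks like from the other side
(the file name and buffer sizes are arbitrary): readv() takes an array of struct iovec,
which import_iovec() copies into the kernel as shown above.
---
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/uio.h>

int main(void)
{
        char hdr[16], body[64];
        struct iovec iov[2] = {
                { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
                { .iov_base = body, .iov_len = sizeof(body) },
        };
        int fd = open("test.c", O_RDONLY);
        /* the kernel fills hdr first, then body, advancing one iov_iter */
        ssize_t n = readv(fd, iov, 2);

        printf("readv returned %zd bytes\n", n);
        close(fd);
        return 0;
}
---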
struct kvec {
void *iov_base;
size_t iov_len;
};
struct bio_vec {
struct page *bv_page;
unsigned int bv_len;
unsigned int bv_offset;
};
nbd_send_cmd() uses both of them,
nbd_send_cmd()
---
struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
struct iov_iter from;
//1st kvec is used to send nbd command
iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
...
cmd->index = index;
cmd->cookie = nsock->cookie;
cmd->retries = 0;
request.type = htonl(type | nbd_cmd_flags);
if (type != NBD_CMD_FLUSH) {
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
request.len = htonl(size);
}
handle = nbd_cmd_handle(cmd);
memcpy(request.handle, &handle, sizeof(handle));
//the iov_iter 'from' carry the nbd command
result = sock_xmit(nbd, index, 1, &from,
(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
send_pages:
if (type != NBD_CMD_WRITE)
goto out;
//Then let's send out the request with bvec iov_iter_bvec
bio = req->bio;
while (bio) {
struct bio *next = bio->bi_next;
struct bvec_iter iter;
struct bio_vec bvec;
bio_for_each_segment(bvec, bio, iter) {
bool is_last = !next && bio_iter_last(bvec, iter);
int flags = is_last ? 0 : MSG_MORE;
//Setup a iov_iter for every bvec
iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
...
result = sock_xmit(nbd, index, 1, &from, flags, &sent);
...
if (is_last)
break;
}
bio = next;
}
---
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
if (unlikely(!page_copy_sane(page, offset, bytes)))
return 0;
if (i->type & (ITER_BVEC|ITER_KVEC)) {
void *kaddr = kmap_atomic(page);
size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
kunmap_atomic(kaddr);
return wanted;
} else if (unlikely(iov_iter_is_discard(i)))
return bytes;
else if (likely(!iov_iter_is_pipe(i)))
return copy_page_to_iter_iovec(page, offset, bytes, i);
else
return copy_page_to_iter_pipe(page, offset, bytes, i);
}
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
const char *from = addr;
if (unlikely(iov_iter_is_pipe(i)))
return copy_pipe_to_iter(addr, bytes, i);
if (iter_is_iovec(i))
might_fault();
iterate_and_advance(i, bytes, v,
//IOVEC, userland buffers, the 'I' in iterate_and_advance
copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
//BVEC, pages, the 'B' in iterate_and_advance
memcpy_to_page(v.bv_page, v.bv_offset,
(from += v.bv_len) - v.bv_len, v.bv_len),
//KVEC, kernel buffers, the 'K' in iterate_and_advance
memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
)
return bytes;
}
We cannot take the inode lock (inode.i_rwsem) in the page fault path.
For example,
ext4_dax_vm_ops.ext4_dax_huge_fault()
---
if (write) {
...
} else {
down_read(&EXT4_I(inode)->i_mmap_sem);
}
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
if (write) {
...
} else {
up_read(&EXT4_I(inode)->i_mmap_sem);
}
---
xfs_file_vm_ops.xfs_filemap_fault()
---
// XFS_MMAPLOCK_SHARED -> xfs_inode_t.i_mmaplock
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
pfn_t pfn;
ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
(write_fault && !vmf->cow_page) ?
&xfs_direct_write_iomap_ops :
&xfs_read_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
} else {
if (write_fault)
ret = iomap_page_mkwrite(vmf,
&xfs_buffered_write_iomap_ops);
else
ret = filemap_fault(vmf);
}
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
---
Why ?
Refer to link, https://lwn.net/Articles/548098
The problem has to do with the order in which locks are acquired.
For normal filesystem operations, the filesystem code will obtain any locks it requires;
the memory management subsystem will then grab mmap_sem should that be required — to bring a read or write buffer into RAM, for example.
When a page fault happens, though, the lock ordering is reversed:
first mmap_sem is taken, then the filesystem code is invoked to bring the needed page into RAM.
Let's look at the code that could show us the lock ordering,
ext4_dax_read_iter()
-> inode_lock_shared()
-> dax_iomap_rw()
-> iomap_apply()
-> dax_iomap_actor()
-> dax_copy_to_iter()
-> _copy_mc_to_iter()
// when we write to userland buffer, page fault would happen, and then
// do_user_addr_fault() would come
do_user_addr_fault()
-> mmap_read_lock(mm)
-> handle_mm_fault()
-> __handle_mm_fault()
-> handle_pte_fault()
-> do_fault()
-> do_read_fault()
-> __do_fault()
-> vma->vm_ops->fault(vmf);
ext4_dax_fault()
xfs_filemap_fault()
In the previous section, we learned that the whole page fault path runs
under mm->mmap_sem. In the page fault path we can do a lot of things,
including read or write IO, and this can cause problems, such as:
Holding the mmap_sem while doing IO is problematic because it can cause
system-wide priority inversions. Consider some large company that does a
lot of web traffic. This large company has load balancing logic in it's
core web server, cause some engineer thought this was a brilliant plan.
This load balancing logic gets statistics from /proc about the system,
which trip over processes mmap_sem for various reasons. Now the web
server application is in a protected cgroup, but these other processes may
not be, and if they are being throttled while their mmap_sem is held we'll
stall, and cause this nice death spiral.
Upstream solves this problem as follows,
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
struct file *fpin)
{
int flags = vmf->flags;
if (fpin)
return fpin;
if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
fpin = get_file(vmf->vma->vm_file);
mmap_read_unlock(vmf->vma->vm_mm);
}
return fpin;
}
(1) Get the reference of the file under mmap_sem
(2) Unlock the mmap_sem
To understand this solution, we need to know what the mmap_sem protects.
The common case is that the vma is backed by a regular file.
The mmap_sem protects the mm's vma rbtree and list: the rbtree is used to quickly
find the VMA associated with a given address, or to find a gap in the address space
that is large enough to hold a new VMA; the list makes it possible to walk through
the entire address space.
It also protects the vma itself, which includes the page fault policy behind this
vma and the thing backing it, a file or just some pages.
vm_mmap_pgoff()
-> mmap_write_lock_killable()
-> do_mmap()
-> mmap_region()
---
vma = vm_area_alloc(mm);
...
if (file) {
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
...
}
---
A reference to the file is grabbed here to keep it alive while the mapping exists.
__do_munmap()
-> unmap_region()
-> unmap_vmas()
---
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
// Page table is removed here
---
-> remove_vma_list()
-> remove_vma()
---
if (vma->vm_file)
fput(vma->vm_file)
---
After maybe_unlock_mmap_for_io() pins the file and unlocks the mmap_sem,
it guarantees the file won't go away in the meantime, and the IO looks like
a normal IO through a read/write syscall. When the IO is completed, we can
retry the page fault path, and by then the data is ready.
do_user_addr_fault()
---
if (unlikely(!mmap_read_trylock(mm))) {
...
retry:
mmap_read_lock(mm);
} else {
might_sleep();
}
vma = find_vma(mm, address);
...
fault = handle_mm_fault(vma, address, flags, regs);
...
/*
* If we need to retry the mmap_lock has already been released,
* and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first.
*/
if (unlikely((fault & VM_FAULT_RETRY) &&
(flags & FAULT_FLAG_ALLOW_RETRY))) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
---
do_shared_fault() handles a write page fault on a shared file mapping.
The main steps of do_shared_fault() are as follows,
Prepare the page in page cache and fill it.
ext4 follows the basic fashion of linux page fault. It invokes
filemap_fault()
---
//Try to find the page, if not exist, create a new one
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
fpin = do_async_mmap_readahead(vmf, page);
} else if (!page) {
...
retry_find:
page = pagecache_get_page(mapping, offset, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask);
}
...
page_not_uptodate:
ClearPageError(page);
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
// If not uptodate, read it in
error = mapping->a_ops->readpage(file, page);
if (!error) {
wait_on_page_locked(page);
if (!PageUptodate(page))
error = -EIO;
}
---
What do we need to do to make a page writable ?
Look into ext4_page_mkwrite()
---
do {
err = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
} while (err == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
The most important thing here is to reserve the space for the page (delay
allocation case)
---
finish_fault()
---
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
ret = 0;
/* Re-check under ptl */
if (likely(pte_none(*vmf->pte)))
do_set_pte(vmf, page, vmf->address);
else
ret = VM_FAULT_NOPAGE;
update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
---
sendfile is one of the features implementing zero-copy
read write sendfile
|uuuuuu| ---\
^ \ ---/
-------/--------\--------------------------------------
/ v
|ssssss| |dddddd| |ssssss| -> |dddddd|
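A minimal userland sketch of the picture above (the file names are made up): the data
moves from the source file's page cache into the pipe and then to the destination file
without ever being copied into a user buffer.
---
#include <fcntl.h>
#include <unistd.h>
#include <sys/sendfile.h>
#include <sys/stat.h>

int main(void)
{
        int in = open("src.dat", O_RDONLY);
        int out = open("dst.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        struct stat st;
        off_t off = 0;

        fstat(in, &st);
        /* no userspace buffer involved; the kernel splices page cache pages */
        while (off < st.st_size)
                if (sendfile(out, in, &off, st.st_size - off) <= 0)
                        break;

        close(in);
        close(out);
        return 0;
}
---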
do_sendfile()
-> splice_direct_to_actor()
-> do_splice_to() //read from src into pipe
-> f_op->splice_read()
-> do_splice_from() //write from pipe into dest
-> f_op->splice_write()
Let's see how the pipe dances here.
generic_file_splice_read()
-> ext4_file_read_iter()
-> generic_file_read_iter()
-> copy_page_to_iter()
-> copy_page_to_iter_pipe()
---
off = i->iov_offset;
buf = &pipe->bufs[i_head & p_mask];
if (off) {
if (offset == off && buf->page == page) {
/* merge with the last one */
buf->len += bytes;
i->iov_offset += bytes;
goto out;
}
i_head++;
buf = &pipe->bufs[i_head & p_mask];
}
if (pipe_full(i_head, p_tail, pipe->max_usage))
return 0;
buf->ops = &page_cache_pipe_buf_ops;
get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = bytes;
pipe->head = i_head + 1;
i->iov_offset = offset + bytes;
i->head = i_head;
out:
i->count -= bytes;
---
The pipe here looks like another kind of bvec array.
iter_file_splice_write()
---
pipe_lock(pipe);
splice_from_pipe_begin(&sd);
while (sd.total_len) {
struct iov_iter from;
unsigned int head, tail, mask;
size_t left;
int n;
ret = splice_from_pipe_next(pipe, &sd);
if (ret <= 0)
break;
...
head = pipe->head;
tail = pipe->tail;
mask = pipe->ring_size - 1;
/* build the vector */
left = sd.total_len;
for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t this_len = buf->len;
if (this_len > left)
this_len = left;
...
array[n].bv_page = buf->page;
array[n].bv_len = this_len;
array[n].bv_offset = buf->offset;
left -= this_len;
}
// construct a bvec iov_iter from the pipe
iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
ret = vfs_iter_write(out, &from, &sd.pos, 0);
if (ret <= 0)
break;
sd.num_spliced += ret;
sd.total_len -= ret;
*ppos = sd.pos;
/* dismiss the fully eaten buffers, adjust the partial one */
tail = pipe->tail;
while (ret) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
if (ret >= buf->len) {
ret -= buf->len;
buf->len = 0;
pipe_buf_release(pipe, buf);
tail++;
pipe->tail = tail;
if (pipe->files)
sd.need_wakeup = true;
} else {
buf->offset += ret;
buf->len -= ret;
ret = 0;
}
}
}
done:
kfree(array);
splice_from_pipe_end(pipe, &sd);
pipe_unlock(pipe);
---
When you open or close a file, you get or put references on many objects.
Please refer to the following comment from https://lwn.net/Articles/494158/
The management of file structure reference counts is done with calls to fget() and fput().
A file structure, which represents an open file, can depend on a lot of resources:
as long as a file is open, the kernel must maintain its underlying storage device,
filesystem, network protocol information, security-related information, user-space
notification requests, and more. An fget() call will ensure that all of those resources
stay around as long as they are needed. A call to fput(), instead, might result in the
destruction of any of those resources. For example, closing the last file on an unmounted
filesystem will cause that filesystem to truly go away.
Next, let's try to figure out what they are.
We can get some hints from __fput().
__fput() is deferred to task work context, which is invoked before the
task returns to userland. Refer to the following comment to get the point:
----
What all this means is that a call to fput() can do a lot of work, and that
work may require the acquisition of a number of locks. The problem is that
fput() can also be called from any number of contexts; there are a few hundred
fput() and fput_light() calls in the kernel. Each of those call sites has its
own locking environment and, usually, no knowledge of what code in other
subsystems may be called from fput(). So the potential for problems like
locking-order violations is real.
----
void fput_many(struct file *file, unsigned int refs)
{
if (atomic_long_sub_and_test(refs, &file->f_count)) {
struct task_struct *task = current;
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
init_task_work(&file->f_u.fu_rcuhead, ____fput);
if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
return;
}
if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
schedule_delayed_work(&delayed_fput_work, 1);
}
}
exit_to_user_mode_loop()
-> tracehook_notify_resume()
-> task_work_run()
Let's focus on the dput() and mntput() in __fput().
The mnt and dentry associated with the file are assigned in vfs_open().
mntput()
-> mntput_no_expire()
---
...
lock_mount_hash();
smp_mb();
mnt_add_count(mnt, -1);
count = mnt_get_count(mnt);
if (count != 0) {
WARN_ON(count < 0);
rcu_read_unlock();
unlock_mount_hash();
return;
}
...
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
struct task_struct *task = current;
if (likely(!(task->flags & PF_KTHREAD))) {
init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
return;
}
if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
schedule_delayed_work(&delayed_mntput_work, 1);
return;
}
cleanup_mnt(mnt);
---
If the mnt's reference count is exhausted, cleanup_mnt() will be invoked.
cleanup_mnt()
---
...
//put the dentry of the mount point
dput(mnt->mnt.mnt_root);
//This is the most important part, the real work of umount would be done
//here
deactivate_super(mnt->mnt.mnt_sb);
mnt_free_id(mnt);
call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
---
deactivate_super()
-> deactivate_locked_super()
-> fs->kill_sb()
kill_block_super()
---
generic_shutdown_super(sb);
sync_blockdev(bdev);
WARN_ON_ONCE(!(mode & FMODE_EXCL));
blkdev_put(bdev, mode | FMODE_EXCL);
---
generic_shutdown_super()
---
// shrink the dcache to release all of the reference to inode cache
shrink_dcache_for_umount(sb);
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
/* evict all inodes with zero refcount */
// Can we ensure that all of the inodes are evicted ?
// It should be YES, because every opened file holds a reference to the mount.
// When we reach here, all of them should be closed.
evict_inodes(sb);
...
if (!list_empty(&sb->s_inodes)) {
printk("VFS: Busy inodes after unmount of %s. "
"Self-destruct in 5 seconds. Have a nice day...\n",
sb->s_id);
}
---
Every opened file holds a reference to the dentry and every dentry holds a
reference to inode.
path_openat()
-> link_path_walk()
-> open_last_lookups()
-> do_open()
-> vfs_open()
int vfs_open(const struct path *path, struct file *file)
{
file->f_path = *path;
return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}
Let's first look at what will be done during a mount
do_new_mount()
-> fs_context_for_mount()
-> alloc_fs_context()
-> fc->fs_type->init_fs_context()
legacy_init_fs_context()
fc->ops = &legacy_fs_context_ops
vfs_get_tree()
-> fc->ops->get_tree()
legacy_get_tree()
-> fc->fs_type->mount()
ext4_mount()
-> mount_bdev() with ext4_fill_super()
mount_bdev()
---
bdev = blkdev_get_by_path(dev_name, mode, fs_type);
...
mutex_lock(&bdev->bd_fsfreeze_mutex);
...
s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
bdev);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (s->s_root) {
...
} else {
s->s_mode = mode;
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
---
root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
...
sb->s_root = d_make_root(root);
---
...
s->s_flags |= SB_ACTIVE;
bdev->bd_super = s;
}
return dget(s->s_root);
---
When a file is mapped, how does the kernel know the file is written ?
The answer is writenotify
do_mmap()
-> mmap_region()
-> vma_set_page_prot()
---
unsigned long vm_flags = vma->vm_flags;
pgprot_t vm_page_prot;
vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
if (vma_wants_writenotify(vma, vm_page_prot)) {
vm_flags &= ~VM_SHARED;
vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
}
// Note, vma->vm_flags is not modified
/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
---
vma_wants_writenotify()
---
/* If it was private or non-writable, the write bit is already clear */
if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
return 0;
/* The backer wishes to know when pages are first written to? */
if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
^^^^^^^^^^^^ ^^^^^^^^^^^
return 1;
---
The vma->vm_page_prot will be used when the pte is installed (e.g. by vmf_insert_mixed_mkwrite()).
Finally, the mapped page doesn't get write permission in its pte.
handle_pte_fault()
---
if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
// Set the pte access flags
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
vmf->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
}
---
do_wp_page()
-> wp_page_shared()
-> do_page_mkwrite()
The page will be marked dirty during this.
clear_page_dirty_for_io() is invoked during page writeback
What's synchronous page fault for ?
https://lwn.net/Articles/731706/
Normally, filesystems are in control of all I/O to the underlying storage
^^^^^^^
media; they use that control to ensure that the filesystem structure is
consistent at all times. Even when a file on a traditional storage device
is mapped into a process's virtual address space, the filesystem manages
the writeback of modified pages from the page cache to persistent storage.
Directly mapped persistent memory bypasses the filesystem, though, leading
to a number of potential problems including inconsistent metadata or data
corruption and loss if the filesystem relocates the file being modified.
For example, in ext4,
ext4_dax_huge_fault()
-> dax_iomap_fault()
-> dax_iomap_pte_fault()
-> ext4_iomap_begin()
-> ext4_iomap_alloc()
-> ext4_journal_start()
-> ext4_map_blocks()
-> ext4_journal_stop()
At this moment, the metadata may not be on persistent storage yet.
-> vmf_insert_mixed_mkwrite() //install the pte and finish the page fault
After the page fault returns, the metadata may not be on persistent storage yet,
due to the asynchronous journal employed by both xfs and ext4. The userland process
(usually a storage engine) can write data through the mapping. But if the system crashes
before the fs metadata gets flushed to the persistent storage medium, the written data
could be lost.
MAP_SYNC is a flag that tells the kernel to do the equivalent of an fsync before the page fault
returns.
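A minimal userland sketch of how an application asks for this (it only works on a
DAX-capable filesystem, and the path below is made up). MAP_SYNC has to be combined
with MAP_SHARED_VALIDATE so that a kernel or filesystem that cannot honor it fails the
mmap() instead of silently ignoring the flag.
---
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
        int fd = open("/mnt/pmem/file", O_RDWR);
        char *p;

        if (fd < 0)
                return 1;
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                 MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
        if (p == MAP_FAILED) {  /* e.g. not a DAX mapping */
                perror("mmap");
                return 1;
        }
        /* The synchronous page fault already persisted the file metadata,
         * so no fsync()/msync() is needed for it; the data itself still has
         * to be flushed from the CPU caches (e.g. via libpmem). */
        strcpy(p, "hello pmem");
        munmap(p, 4096);
        close(fd);
        return 0;
}
---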
ext4_dax_huge_fault()
-> dax_iomap_fault()
-> dax_iomap_pte_fault()
-> ext4_iomap_begin()
---
/*
* If we are doing synchronous page fault and inode needs fsync,
* we can insert PTE into page tables only after that happens.
* Skip insertion for now and return the pfn so that caller can
* insert it after fsync is done.
*/
if (sync) {
*pfnp = pfn;
ret = VM_FAULT_NEEDDSYNC | major;
goto finish_iomap;
}
---
ext4_dax_huge_fault()
--
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
if (write) {
ext4_journal_stop(handle);
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
//guarantee the metadata gets flushed to the persistent storage medium
-> vfs_fsync_range()
-> file_write_and_wait_range()
-> ext4_fsync_journal()
-> blkdev_issue_flush()
// insert page table
-> dax_insert_pfn_mkwrite()
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
}
--
bdi, namely backing_dev_info, represents the underlying device (maybe a virtual
device for nfs or fuse). It mainly works for writeback, but also carries ra_pages,
which is used for readahead.
By default, the super_block has noop_backing_dev_info,
alloc_super()
---
s->s_bdi = &noop_backing_dev_info;
---
Then s_bdi will be set to the bdi of the underlying device or a virtual one.
set_bdev_super() <- mount_bdev() <- ext2_mount()
<- ext4_mount()
<- get_tree_bdev() <- xfs_fs_get_tree()
super_setup_bdi_name() <- ceph_setup_bdi()
<- nfs_get_tree_common()
<- super_setup_bdi() <- btrfs_fill_super() >
If the s_bdi is noop_backing_dev_info, there won't be any writeback activity on this filesystem.
In the past, only direct IO and buffered read IO could be controlled by cgroup
blkio. The reason is shown in the following diagram.
a cgroup VFS LAYER root cgroup
Task A writeback kworker
| |
| dirtying pages | flush dirty inodes
v v
[D] [D] [D] [D] -------> writepages
[D] [D] [D] [D] |
------------------------------------^--------------------
BLOCK LAYER |
v
submit_bio
| blkio qos/scheduler
v
[R] [R] [R] [R]
---------------------------------------------------------
SCSI/NVME/....
The writeback kworker does the real IO, but it belongs to the root cgroup.
For more details, refer to "Writeback and control groups".
Let's look into the code.
bdi_writeback is in charge of the dirty page writeback activity to the block device
behind the backing_dev_info. Looking into it, we can find b_dirty/b_dirty_time,
which are used to hang dirty inodes on.
The role of backing_dev_info is weakened; it becomes the bridge between bdi_writeback
and the block device.
Attaching the inode to a bdi_writeback is done by inode_attach_wb(), in the context of the task that's dirtying the pages
inode_attach_wb <- __mark_inode_dirty()
<- wbc_attach_fdatawrite_inode() <- __filemap_fdatawrite_range()
<- __set_page_dirty_nobuffers()[M]
<- account_page_dirtied() <- __set_page_dirty() <- __set_page_dirty_buffers()[M]
<- mark_buffer_dirty()[M]
<- iomap_set_page_dirty()[M]
[M] : there is __mark_inode_dirty invoked in it
Associating the bio with the dirtying cgroup is done by wbc_init_bio() in .writepages
wbc_init_bio()
---
/*
* pageout() path doesn't attach @wbc to the inode being written
* out. This is intentional as we don't want the function to block
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* behind a slow cgroup. Ultimately, we want pageout() to kick off
^^^^^^^^^^^^^^^^^^^^
* regular writeback instead of writing things out itself.
*/
if (wbc->wb)
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
---
Take ext4 as example,
io_submit_init_bio()
---
bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
wbc_init_bio(io->io_wbc, bio);
---
Now we know that the bios queued by the writeback kworker carry the information
of the cgroup that dirtied the pages.
We still need to throttle the task that's dirtying the pages so that the dirtying
speed matches the cleaning speed. This is done in balance_dirty_pages().
balance_dirty_pages()
---
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL;
//mdtc is for memcg dirty throttle control
if (mdtc) {
unsigned long filepages, headroom, writeback;
/*
* If @wb belongs to !root memcg, repeat the same
* basic calculations for the memcg domain.
*/
mem_cgroup_wb_stats(wb, &filepages, &headroom,
&mdtc->dirty, &writeback);
mdtc->dirty += writeback;
mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc);
}
...
if (mdtc) {
...
dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
((mdtc->dirty > mdtc->thresh) || strictlimit);
wb_position_ratio(mdtc);
if (mdtc->pos_ratio < gdtc->pos_ratio)
sdtc = mdtc;
}
---
domain_dirty_limits() calculates the per-memcg dirty limits
---
/* gdtc is !NULL iff @dtc is for memcg domain */
if (gdtc) {
unsigned long global_avail = gdtc->avail;
/*
* The byte settings can't be applied directly to memcg
* domains. Convert them to ratios by scaling against
* globally available memory. As the ratios are in
* per-PAGE_SIZE, they can be obtained by dividing bytes by
* number of pages.
*/
if (bytes)
ratio = min(DIV_ROUND_UP(bytes, global_avail),
PAGE_SIZE);
if (bg_bytes)
bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
PAGE_SIZE);
bytes = bg_bytes = 0;
}
---
Whether this feature is enabled is controlled by wb->memcg_css->parent,
namely, whether this bdi_writeback belongs to a non-root memcg.
Refer to MDTC_INIT() and mdtc_valid().
And which bdi_writeback the task belongs to is decided in
balance_dirty_pages_ratelimited()
---
if (inode_cgwb_enabled(inode))
wb = wb_get_create_current(bdi, GFP_KERNEL);
if (!wb)
wb = &bdi->wb;
---
Partial Write
/------------------------------------------------------------------/
__block_write_begin_int()
---
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
...
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++=bh;
}
}
---
Pages will be read in before being written.
When allocating page cache, what gfp_mask is used ?
inode_init_always()
---
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
---
#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
/ \
GFP_USER | __GFP_HIGHMEM
/ \
__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL
In other words, page cache allocation normally allows __GFP_FS (like GFP_KERNEL). However, xfs does not do this.
xfs_setup_inode()
---
/*
* Ensure all page cache allocations are done from GFP_NOFS context to
* prevent direct reclaim recursion back into the filesystem and blowing
* stacks or deadlocking.
*/
gfp_mask = mapping_gfp_mask(inode->i_mapping);
mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
---
And this can influence the action in shrink_page_list,
if (PageWriteback(page)) {
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
stat->nr_immediate++;
goto activate_locked;
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!PageReclaim(page) || !may_enter_fs) {
SetPageReclaim(page);
stat->nr_writeback++;
goto activate_locked;
/* Case 3 above */
} else {
unlock_page(page);
wait_on_page_writeback(page);
/* then go back and try same page again */
list_add_tail(&page->lru, page_list);
continue;
}
}
This means xfs is more prone to OOM with buffered IO under memcg pressure, while ext4 is not; ext4 can end up being throttled like this:
[<0>] io_schedule+0x12/0x40
[<0>] wait_on_page_bit+0x137/0x230
[<0>] shrink_page_list+0xbab/0xc50
[<0>] shrink_inactive_list+0x254/0x580
[<0>] shrink_node_memcg+0x1fa/0x720
[<0>] shrink_node+0xce/0x440
[<0>] do_try_to_free_pages+0xc3/0x360
[<0>] try_to_free_mem_cgroup_pages+0xf9/0x210
[<0>] try_charge+0x192/0x780
[<0>] mem_cgroup_try_charge+0x8b/0x1a0
[<0>] __add_to_page_cache_locked+0x64/0x240
[<0>] add_to_page_cache_lru+0x64/0x100
[<0>] pagecache_get_page+0xf2/0x2c0
[<0>] grab_cache_page_write_begin+0x1f/0x40
[<0>] ext4_da_write_begin+0xce/0x470 [ext4]
[<0>] generic_perform_write+0xf4/0x1b0
[<0>] __generic_file_write_iter+0xfe/0x1c0
[<0>] ext4_file_write_iter+0xc6/0x3b0 [ext4]
[<0>] new_sync_write+0x124/0x170
[<0>] vfs_write+0xa5/0x1a0
[<0>] ksys_write+0x4f/0xb0
[<0>] do_syscall_64+0x5b/0x1b0
[<0>] entry_SYSCALL_64_after_hwframe+0x65/0xca
[<0>] 0xffffffffffffffff
When we mount a filesystem, there are mainly 3 things to be set up.
A umount mainly does the following things.
All of the information about the filesystem is in the super block.
The root inode is the entry of the filesystem.
bh is a legacy of older kernels. Nowadays, it mainly works with the following
components,
In the Linux kernel a page is 4K, but the filesystem block can be smaller than that.
In ext2/ext4, a bh is used to represent a filesystem block.
page->private -> bh0 -> bh1 -> bh2 -> bh3
When does the block device know the filesystem block size ?
It is
sb_set_blocksize()
-> set_blocksize()
---
/* Don't change the size if it is same as current */
if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
sync_blockdev(bdev);
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
return 0;
---
The sb->s_blocksize would influence how many bhs are allocated for a page,
sb_getblk()
-> __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
-> __getblk_slow()
-> grow_buffers()
-> grow_dev_page()
-> alloc_page_buffers()
---
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
bh->b_size = size;
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
}
---
Currently, there are mainly two places that employ bh
(1) store bmap info
ext4 is an example
__block_write_begin_int()
---
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
if (get_block) {
err = get_block(inode, block, bh, 1);
if (err)
break;
}
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++=bh;
}
}
/*
* If we issued read requests - let them complete.
*/
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
---
io_submit_add_bh()
-> io_submit_init_bio()
---
bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
wbc_init_bio(io->io_wbc, bio);
---
(2) fs metadata
__ext4_get_inode_loc()
---
/*
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
bh = sb_getblk(sb, block);
...
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
...
make_io:
blk_start_plug(&plug);
ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
blk_finish_plug(&plug);
wait_on_buffer(bh);
}
---
bio is the unit of IO in both the fs layer (such as xfs) and the block layer. The kernel also
provides wrapper interfaces for backward compatibility with the legacy bh.
submit_bh_wbc()
---
bio = bio_alloc(GFP_NOIO, 1);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
if (buffer_meta(bh))
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
---
Yes !!! jbd2 still uses bh.
        b_bh
  jh ----------> bh
     <----------
      b_private
- commit to jbd2
ext4_do_update_inode()
-> ext4_handle_dirty_metadata(handle, NULL, bh)
- after log is committed
__jbd2_journal_temp_unlink_buffer()
---
jh->b_jlist = BJ_None;
if (transaction && is_journal_aborted(transaction->t_journal))
clear_buffer_jbddirty(bh);
else if (test_clear_buffer_jbddirty(bh))
mark_buffer_dirty(bh); /* Expose it to the VM */
---
The modifications in a transaction are either all done or none of them are.
This is achieved by flushing the modifications to disk only after their log records
have been persisted to disk.
There are TWO points here,
One, ensure integrity of the log record on disk
Descriptor
journal_header_s
JBD2_DESCRIPTOR_BLOCK chksum of descriptor
| |
v v
|header+tag+tag+tag+tag+tag+tag+chsum--|
^ ^
| |
journal_block_tag_s tag with JBD2_FLAG_LAST_TAG
---
__be32 t_blocknr;
__be16 t_checksum;
__be16 t_flags;
__be32 t_blocknr_high;
---
commit_header (chksum of whole transaction)
|
v
|commit-------------------------------|
IOs
Wait the IO of Descriptor and LOG completed
v
[Descriptor] [LOG] [LOG] [LOG] [LOG] [LOG] [Commit]
\___________________ ____________________/ \___ ___/
v v
REQ_OP_WRITE + REQ_SYNC REQ_OP_WRITE + REQ_SYNC
+ REQ_PREFLUSH + REQ_FUA
There is an async commit feature which commits without waiting for the descriptor and log IO to complete.
It depends on the checksum to ensure the integrity of the logged transaction.
Two, write out the modifications only after the log has been persisted to disk
After commit, the bh of metadata is handed over to writeback by
__jbd2_journal_refile_buffer()
-> __jbd2_journal_unfile_buffer()//jh->b_next_transaction == NULL
-> __jbd2_journal_temp_unlink_buffer()
---
else if (test_clear_buffer_jbddirty(bh))
mark_buffer_dirty(bh); /* Expose it to the VM */
---
The interesting thing is that ext4 never invokes mark_inode_dirty but ext4_mark_inode_dirty
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
__ext4_mark_inode_dirty()
-> ext4_mark_iloc_dirty()
-> ext4_do_update_inode()
---
//Update ext4_inode on buffer_head from in-core inode
err = ext4_handle_dirty_metadata(handle, NULL, bh);
---
How many transactions can exist in the kernel ?
There are 3 kinds of transactions,
- running
- committing
- checkpointing
There can be only one running and one committing transaction at the same time,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
start_this_handle()
-> add_transaction_credits()
---
needed = atomic_add_return(total, &t->t_outstanding_credits);
if (needed > journal->j_max_transaction_buffers) {
atomic_sub(total, &t->t_outstanding_credits);
/*
* Wait until running transaction passes to T_FLUSH state and new transaction
* can thus be started. Also starts the commit if needed. The function expects
* running transaction to exist and releases j_state_lock.
*/
wait_transaction_locked(journal);
return 1;
}
---
The T_FLUSH state means the transaction has been completely closed (no open handles any more)
and has become journal->j_committing_transaction
jbd2_journal_commit_transaction()
---
//Drain handles on commit_transaction->t_updates
...
write_lock(&journal->j_state_lock);
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
--
And there can be multiple checkpointing transactions,
jbd2_journal_commit_transaction()
---
spin_lock(&journal->j_list_lock);
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
commit_transaction->t_cpnext = commit_transaction;
commit_transaction->t_cpprev = commit_transaction;
} else {
commit_transaction->t_cpnext =
journal->j_checkpoint_transactions;
commit_transaction->t_cpprev =
commit_transaction->t_cpnext->t_cpprev;
commit_transaction->t_cpnext->t_cpprev =
commit_transaction;
commit_transaction->t_cpprev->t_cpnext =
commit_transaction;
}
spin_unlock(&journal->j_list_lock);
---
Max size of a transaction is
static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
}
1/4 of the total journal size, excluding the fast commit area.
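For example, assuming a 128 MiB journal with 4 KiB blocks and fast commit disabled
(j_fc_wbufsize == 0), j_total_len is 32768 blocks, so a single transaction can hold at
most 32768 / 4 = 8192 buffers, i.e. about 32 MiB of metadata.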
The actual transaction size is decided by whatever triggers the transaction commit.
Refer to Commit
The journal space is treated as a ring
grow up ->
j_tail j_head
| |
v v
|----------drrrrrrcdrrrrrrrrc------------|
0 \___ __/\____ ___/ j_total_len
v v
tx t(x+1)
d: Descriptor
r: Log Record
c: Commit
-: free space
Log space allocation:
jbd2_journal_next_log_block()
---
write_lock(&journal->j_state_lock);
blocknr = journal->j_head;
journal->j_head++;
journal->j_free--;
if (journal->j_head == journal->j_last)
journal->j_head = journal->j_first;
write_unlock(&journal->j_state_lock);
return jbd2_journal_bmap(journal, blocknr, retp);
---
Log space freeing:
j_tail
|
v
|-------cpcpcpcpccccccccrrrrrrrrr--------|
\__ ___/\___ __/\___ ___/
v v v
checkpointing committing running
After checkpoint
j_tail
|
v
|---------------ccccccccrrrrrrrrr--------|
\__ __/ \___ __/\___ ___/
v v v
freed committing running
There are 3 places to free log space,
(1) __jbd2_log_wait_for_space()
-> jbd2_log_do_checkpoint()
(2) jbd2_journal_commit_transaction()
-> jbd2_update_log_tail()
The transaction can be committed at the following opportunities:
fsync and other places where you want files to be synced to disk
(when the file's metadata is in the journal)
ocfs2_sync_file()
-> jbd2_complete_transaction()
//Ext4 employs fast commit, and doesn't use jbd2_complete_transaction() in ext4_sync_file() now.
And ext4_sync_fs() is also such a case.
Running transaction exceeds its max capacity
Refer to add_transaction_credits()
Timeout, namely the transaction is too old
start_this_handle()
-> jbd2_get_transaction()
---
transaction->t_expires = jiffies + journal->j_commit_interval;
...
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
---
commit_timeout()
---
journal_t *journal = from_timer(journal, t, j_commit_timer);
wake_up_process(journal->j_task);
---
This j_task is kjournald2
The expire time is 5 seconds by default
#define JBD2_DEFAULT_MAX_COMMIT_AGE 5
The 'order' here means ext4's ordered mode (data=ordered)
In data=ordered mode, ext4 only officially journals metadata, but it logically
groups metadata information related to data changes with the data blocks into a
single unit called a transaction. When it's time to write the new metadata
out to disk, the associated data blocks are written first.
What's this mode for ?
After a system crash or a power failure, files that were written right before the
system went down could contain previously written data or other garbage.
With Ordered Mode, journal commits are deferred until the data blocks get written
to disk. This guarantees that any blocks in the file will be data written by the
application, avoiding a possibility of a security breach, which is especially
problematic on a multi-user system.
Ext3 Ordered vs Writeback mode
This is not an issue right now, because of the unwritten extent state.
Both xfs and ext4 support this,
ext4_map_blocks()
---
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
map->m_flags |= ext4_es_is_written(&es) ?
EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
...
}
---
xfs_vm_readpage()
-> iomap_readpage()
-> iomap_apply()
-> iomap_readpage_actor()
---
if (iomap_block_needs_zeroing(inode, iomap, pos)) {
zero_user(page, poff, plen);
iomap_set_range_uptodate(page, poff, plen);
goto done;
}
---
But why does ext4 still use 'ordered mode' as the default ?
In addition, there are some applications which depend on data=ordered
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
to automatically force data blocks to be written to disk soon after the
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
file is written. Using Writeback Mode extends the time from when a file
^^^^^^^^^^^^^^^
is written to when it is pushed out to disk to 30 seconds. This can be
surprising for some users; however, it should be noted that such problems
can still be an issue with Ordered Mode (although they are much rarer).
Again, a careful application or library should always use fsync() at points
where the application is at a stable commit point.
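A minimal sketch of that "careful application" pattern (the file names are made up):
write the new contents to a temporary file, fsync() it, then rename() it over the old
one, so that after a crash you see either the old or the new version, never garbage.
---
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
        int fd = open("config.tmp", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (fd < 0)
                return 1;
        write(fd, "new contents\n", 13);
        fsync(fd);                      /* the stable commit point: data is on disk */
        close(fd);
        rename("config.tmp", "config"); /* atomically replace the old version */
        return 0;
}
---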
Let's look at how it is implemented.
ext4_jbd2_inode_add_write() <- ext4_map_blocks()
<- __ext4_journalled_writepage()
<- __ext4_block_zero_page_range()
<- ext4_page_mkwrite()
<- move_extent_per_page()
We only need to order the page writing when the extent changes
But ocfs2 is a bit different,
ocfs2_jbd2_inode_add_write() <- ocfs2_write_failure()
<- ocfs2_map_and_dirty_page()
<- ocfs2_write_end_nolock()
<- ocfs2_zero_start_ordered_transaction()
ext4_fill_super()
-> ext4_load_journal()
-> jbd2_journal_load()
-> jbd2_journal_recover()
-> ext4_iget()
-> ext4_orphan_cleanup()
Decide the range of logs that needs to be replayed, i.e. decide the end transaction ID
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
start end
| |
v v
|------dlllllcdrlllllcdlll----|
\__ __/\___ __/\_ _/
t t+1 t+2
d: JBD2_DESCRIPTOR_BLOCK, journal_header_t + n * journal_block_tag_t
l: copy of the block that need to be logged
c: JBD2_COMMIT_BLOCK, commit_header (a journal_header_t embedded)
r: JBD2_REVOKE_BLOCK, jbd2_journal_revoke_header_t (a journal_header_t embedded)
Start
Comes from the journal superblock, updated by jbd2_update_log_tail()
End
(1) check magic, rule out 'l' or empty blocks
(2) check sequence, rule out stale logs (same mkfs)
(There is time stamp check to avoid previous mkfs's log)
The following rules apply when scanning the log:
(1) when meeting 'd', skip over the following log blocks
next_log_block += count_tags(journal, bh);
(2) when meeting 'c', if it is valid, increase next_commit_ID
This indicates the commit id of the last complete transaction;
this commit id will be used in the following passes to decide when to stop
(3) a revoke or descriptor block checksum error causes recovery failure,
but a commit block checksum error does not
PASS_REVOKE: only scan the JBD2_REVOKE_BLOCKs and record the blocks that need to be
revoked during replay.
Refer to scan_revoke_records()
PASS_REPLAY: pick up the JBD2_DESCRIPTOR_BLOCK and copy the following log blocks to their
real positions, based on the array of journal_block_tag_t
do_one_pass()
---
while ((tagp - bh->b_data + tag_bytes) <= journal->j_blocksize - descr_csum_size) {
unsigned long io_block;
tag = (journal_block_tag_t *) tagp;
flags = be16_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
//Read in log block
err = jread(&obh, journal, io_block);
if (err) {
} else {
//get the real position on disk
blocknr = read_tag_block(journal, tag);
/* If the block has been revoked, then we're all done here. */
if (jbd2_journal_test_revoke(journal, blocknr, next_commit_ID)) {
brelse(obh);
++info->nr_revoke_hits;
goto skip_write;
}
/* Look for block corruption */
if (!jbd2_block_tag_csum_verify(journal, tag, obh->b_data, be32_to_cpu(tmp->h_sequence))) {
brelse(obh);
block_error = 1;
goto skip_write;
}
nbh = __getblk(journal->j_fs_dev, blocknr, journal->j_blocksize);
lock_buffer(nbh);
memcpy(nbh->b_data, obh->b_data, journal->j_blocksize);
set_buffer_uptodate(nbh);
//Don't issue the IO here, just mark the bh dirty
mark_buffer_dirty(nbh);
++info->nr_replays;
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
}
skip_write:
tagp += tag_bytes;
if (!(flags & JBD2_FLAG_SAME_UUID))
tagp += 16;
if (flags & JBD2_FLAG_LAST_TAG)
break;
}
---
In the REPLAY pass, do_one_pass() doesn't issue the IO but just marks the bh dirty;
the IO is issued here,
jbd2_journal_recover()
---
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
if (!err)
err = do_one_pass(journal, &info, PASS_REPLAY);
journal->j_transaction_sequence = ++info.end_transaction;
jbd2_journal_clear_revoke(journal);
err2 = sync_blockdev(journal->j_fs_dev);
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
err2 = blkdev_issue_flush(journal->j_fs_dev);
}
---
log grow ->
m f da+w
|------+----+----+----+----+----+----+--------|
\________ __________/\_ _/\_ _/
v v v
cp c r
m : modification to metadata
f : free the metadata block
da: data block allocation
w : data block writing
If we replay the log, the metadata modification would be replayed onto a block
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
that now contains data. This is a terrible case because data that has already
^^^^^^^^^^^^^^^^^^^^^^^
been synced to disk gets corrupted.
Track the code that invokes ext4_free_blocks() with EXT4_FREE_BLOCKS_FORGET.
m f+r ma+w
|------+----+----+----+----+----+----+--------|
\________ __________/\_ _/\_ _/
v v v
cp c r
m : modification to metadata
f : free the metadata block
r : revoke record
ma: metadata block allocation
w : data block writing
When replaying the log, the revoke record, which is in the same transaction
as the freeing, stops the replay of the 'm' log record. This protects
the subsequent data allocation and writing. But if the block is allocated
as metadata again, the revoke record won't prevent the replay of 'ma+w' (or of the 'w' only).
jbd2_journal_commit_transaction()
---
/*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
err = journal_submit_data_buffers(journal, commit_transaction);
...
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
jbd_debug(3, "JBD2: commit phase 2b\n");
---
write_one_revoke_record()
---
if (descriptor) {
if (offset + sz > journal->j_blocksize - csum_size) {
flush_descriptor(journal, descriptor, offset);
---
write_dirty_buffer() is invoked here to submit_bh
---
descriptor = NULL;
}
}
if (!descriptor) {
descriptor = jbd2_journal_get_descriptor_buffer(transaction,
JBD2_REVOKE_BLOCK);
jbd2_file_log_bh(log_bufs, descriptor);
offset = sizeof(jbd2_journal_revoke_header_t);
*descriptorp = descriptor;
}
if (jbd2_has_feature_64bit(journal))
* ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
else
* ((__be32 *)(&descriptor->b_data[offset])) =
cpu_to_be32(record->blocknr);
---
The format of revoke record block is,
journal_header_t
|
v
|---|---|-------------------|
^ \_________ _________/
| v
| array of block numbers that need to be revoked
jbd2_journal_revoke_header_t
Revoke records will be scanned before replay pass
do_one_pass()
---
case JBD2_REVOKE_BLOCK:
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
brelse(bh);
continue;
}
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
brelse(bh);
continue;
---
do_one_pass()
---
/* If the block has been
* revoked, then we're all done
* here. */
if (jbd2_journal_test_revoke(journal, blocknr, next_commit_ID)) {
brelse(obh);
++info->nr_revoke_hits;
goto skip_write;
}
--
bh and jh reference each other.
Refer to
jbd2_journal_add_journal_head()
---
if (!buffer_jbd(bh))
new_jh = journal_alloc_journal_head();
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
jh = bh2jh(bh);
} else {
jh = new_jh;
new_jh = NULL; /* We consumed it */
set_buffer_jbd(bh);
bh->b_private = jh;
jh->b_bh = bh;
get_bh(bh);
}
jh->b_jcount++;
jbd_unlock_bh_journal_head(bh);
---
Note, the referencing above happens under the bh's journal_head lock (jbd_lock_bh_journal_head())
Firstly, we must know that we cannot modify a bh while its previous transaction has
not been committed, because the modifications of the bhs in one transaction compose
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
an atomic filesystem operation.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
How does jbd2 handle this case ?
Let's look into
jbd2_journal_commit_transaction()
-> jbd2_journal_write_metadata_buffer()
---
spin_lock(&jh_in->b_state_lock);
repeat:
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
*/
if (jh_in->b_frozen_data) {
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
new_offset = offset_in_page(jh_in->b_frozen_data);
} else {
new_page = jh2bh(jh_in)->b_page;
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
}
...
set_bh_page(new_bh, new_page, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
new_bh->b_private = bh_in;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);
*bh_out = new_bh;
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
spin_unlock(&jh_in->b_state_lock);
---
In a word, if there is b_frozen_data, use it; otherwise use the buffer of the bh
that needs to be logged.
The most critical point is jh->b_state_lock, which serializes everything.
do_get_write_access()
---
lock_buffer(bh);
spin_lock(&jh->b_state_lock);
/*
* The buffer is already part of this transaction if b_transaction or
* b_next_transaction points to it
* This also tells us that the extra copy only happens when a bh needs to join
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* the next transaction while the previous one has not been committed. The extra
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* copy is the data that composes the atomic filesystem operation of the previous
* transaction.
*/
if (jh->b_transaction == transaction ||
jh->b_next_transaction == transaction)
goto done;
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
*/
if (jh->b_frozen_data) {
goto attach_next;
}
/*
* There is one case we have to be very careful about. If the
* committing transaction is currently writing this buffer out to disk
* and has NOT made a copy-out, then we cannot modify the buffer
* contents at all right now. The essence of copy-out is that it is
* the extra copy, not the primary copy, which gets journaled. If the
* primary copy is already going to disk then we cannot do copy-out
* here.
*/
if (buffer_shadow(bh)) {
spin_unlock(&jh->b_state_lock);
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
goto repeat;
}
/*
* Only do the copy if the currently-owning transaction still needs it.
* If buffer isn't on BJ_Metadata list, the committing transaction is
* past that stage (here we use the fact that BH_Shadow is set under
* bh_state lock together with refiling to BJ_Shadow list and at this
* point we know the buffer doesn't have BH_Shadow set).
*
* Subtle point, though: if this is a get_undo_access, then we will be
* relying on the frozen_data to contain the new value of the
* committed_data record after the transaction, so we HAVE to force the
* frozen_data copy in that case.
*/
if (jh->b_jlist == BJ_Metadata || force_copy) {
if (!frozen_buffer) {
spin_unlock(&jh->b_state_lock);
frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS | __GFP_NOFAIL);
goto repeat;
}
jh->b_frozen_data = frozen_buffer;
frozen_buffer = NULL;
jbd2_freeze_jh_data(jh);
}
attach_next:
smp_wmb();
jh->b_next_transaction = transaction;
done:
spin_unlock(&jh->b_state_lock);
---
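The copy-out can be reduced to a toy model: if the buffer still belongs to a
committing transaction and no private copy exists yet, snapshot it before the
new transaction modifies the primary copy. This is only a sketch of the idea
(it ignores locking and the BH_Shadow wait), not the jbd2 API:
---
#include <stdlib.h>
#include <string.h>

#define TOY_BLOCK_SIZE 4096

struct toy_buffer {
	char data[TOY_BLOCK_SIZE];   /* primary copy, like bh->b_data */
	char *frozen_data;           /* version owned by the committing transaction */
	int in_committing_tx;        /* buffer still on the committing transaction */
};

/* A new transaction wants write access to the buffer. */
void toy_get_write_access(struct toy_buffer *b)
{
	if (b->in_committing_tx && !b->frozen_data) {
		/* preserve the contents that belong to the committing
		 * transaction; the journal will write this copy */
		b->frozen_data = malloc(TOY_BLOCK_SIZE);
		memcpy(b->frozen_data, b->data, TOY_BLOCK_SIZE);
	}
	/* from here on, b->data can be modified by the new transaction */
}

/* The commit of the old transaction has finished. */
void toy_commit_done(struct toy_buffer *b)
{
	free(b->frozen_data);
	b->frozen_data = NULL;
	b->in_committing_tx = 0;
}
---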
There are two kinds of references to an inode,
(1) the on-disk link count (i_nlink), i.e. the number of directory entries that
point to the inode; it decides the lifecycle of the inode on disk,
(2) the in-memory reference count (i_count); it decides the lifecycle of the
inode in memory.
In this section, we look into the second kind of reference.
The main user of the inode is the dcache.
void iput(struct inode *inode)
{
...
if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
...
iput_final(inode);
}
}
When the last reference of an inode is dropped, we may have two choices,
Look at the code,
iput_final()
---
if (op->drop_inode)
drop = op->drop_inode(inode);
else
drop = generic_drop_inode(inode);
// (1) retain inode in cache if fs is active
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
(sb->s_flags & SB_ACTIVE)) {
inode_add_lru(inode);
spin_unlock(&inode->i_lock);
return;
}
...
WRITE_ONCE(inode->i_state, state | I_FREEING);
if (!list_empty(&inode->i_lru))
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
// (2) evict the inode
evict(inode);
---
We usually take a reference on the inode when looking it up in the inode cache,
for example,
iget_locked()
---
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
---
hlist_for_each_entry(inode, head, i_hash) {
if (inode->i_ino != ino)
continue;
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
}
...
__iget(inode);
spin_unlock(&inode->i_lock);
return inode;
}
---
spin_unlock(&inode_hash_lock);
---
The inode reference is taken under inode->i_lock, which protects it against a
concurrent iput_final().
fd user
------------------------------
file kernel
v file->f_path.dentry
dcache
v dentry->d_inode
inode
__fput()
---
dput(dentry);
...
mntput(mnt);
---
dput()
-> dentry_kill()
-> __dentry_kill()
-> dentry_unlink_inode()
-> iput()
The relevant commit (which introduced the lazytime handling discussed below) is
commit 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8
Author: Theodore Ts'o
Firstly, let's look at the 3 time fields in POSIX: atime (access time), mtime (data
modification time) and ctime (inode change time). A quick way to observe them from
userspace is sketched below.
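A minimal sketch with stat(2), nothing filesystem specific:
---
#include <stdio.h>
#include <sys/stat.h>
#include <time.h>

int main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2 || stat(argv[1], &st) != 0)
		return 1;
	/* atime: last access, mtime: last data modification,
	 * ctime: last inode (metadata) change */
	printf("atime: %s", ctime(&st.st_atime));
	printf("mtime: %s", ctime(&st.st_mtime));
	printf("ctime: %s", ctime(&st.st_ctime));
	return 0;
}
---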
In the kernel, they are modified in the following code paths,
The time of inode is updated in generic_update_time,
file_accessed()
-> touch_atime()
-> update_time() //S_ATIME
file_accessed() could be invoked by
- generic_file_read_iter() // in direct IO path
- generic_file_buffered_read()
- generic_file_mmap()
- ext4_dax_read_iter()
...
file_update_time()
-> update_time() // S_MTIME or S_CTIME
filemap_page_mkwrite()
-> file_update_time()
__generic_file_write_iter() //cover both buffer and direct IO path
-> file_update_time()
generic_update_time()
-> __mark_inode_dirty()
-> sb->s_op->dirty_inode()
ext4_dirty_inode()
---
// If flags only contain I_DIRTY_TIME, just return and leave the
// modified times fields in memory
if (flags == I_DIRTY_TIME)
return;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
...
ext4_mark_inode_dirty(handle, inode);
-> __ext4_mark_inode_dirty()
-> ext4_mark_iloc_dirty()
-> ext4_do_update_inode() // synchronize the inode in-memory to the one on-disk
ext4_journal_stop(handle);
---
If we look into __mark_inode_dirty(),
we can find out that I_DIRTY_TIME is set on inode->i_state and the inode is inserted into wb->b_dirty_time.
There is no modification of the in-memory copy of the on-disk inode buffer yet, so what does the wb flush ?
Well, one normal path that flushes out the time fields of the inode is iput(); but first, look at the writeback path,
wb_writeback()
-> queue_io()
---
if (!work->for_sync)
time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
time_expire_jif);
---
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
See it !?, it is 12 hours
__writeback_single_inode()
---
ret = do_writepages(mapping, wbc);
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
* I/O completion. We don't do it for sync(2) writeback because it has a
* separate, external IO completion path and ->sync_fs for guaranteeing
* inode metadata is written back correctly.
*/
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
}
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
if ((inode->i_state & I_DIRTY_TIME) &&
((dirty & I_DIRTY_INODE) ||
wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
time_after(jiffies, inode->dirtied_time_when +
dirtytime_expire_interval * HZ))) {
dirty |= I_DIRTY_TIME;
trace_writeback_lazytime(inode);
}
inode->i_state &= ~dirty;
...
smp_mb();
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
spin_unlock(&inode->i_lock);
//See it ? I_DIRTY_SYNC will be set here, so ext4_dirty_inode() will actually
//update the on-disk inode this time
if (dirty & I_DIRTY_TIME)
mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & ~I_DIRTY_PAGES) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
}
---
void iput(struct inode *inode)
{
if (!inode)
return;
BUG_ON(inode->i_state & I_CLEAR);
retry:
if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
atomic_inc(&inode->i_count);
spin_unlock(&inode->i_lock);
trace_writeback_lazytime_iput(inode);
mark_inode_dirty_sync(inode);
goto retry;
}
iput_final(inode);
}
}
Where is the inode cache ?
inode_hashtable =
alloc_large_system_hash("Inode-cache",
sizeof(struct hlist_head),
ihash_entries,
14,
HASH_ZERO,
&i_hash_shift,
&i_hash_mask,
0
The inode would be inserted into this hash table to be looked up quickly.
Some filesystems, such as xfs, maintain an inode cache of their own.
insert_inode_locked() <- __ext4_new_inode()
// Look up in cache, create a new one if not exist(I_NEW).
iget_locked() <- __ext4_iget()
Search for the inode specified by @hashval and @data in the inode cache,
and if present return it with an increased reference count. This is
a generalized version of iget_locked() for file systems where the inode
number is not sufficient for unique identification of an inode.
inode_insert5 <- iget5_locked()
<- insert_inode_locked4()
__insert_inode_hash() <- insert_inode_hash()
A very common code path should be,
ext4_lookup()
---
//Try to lookup the file in directory entry
bh = ext4_lookup_entry(dir, dentry, &de);
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
brelse(bh);
...
// Get a valid inode id, try to get the inode for it
inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);
-> __ext4_iget()
---
inode = iget_locked(sb, ino);
...
// It is cached in inode cache
if (!(inode->i_state & I_NEW))
return inode;
ei = EXT4_I(inode);
iloc.bh = NULL;
// Get the inode on disk
ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
...
raw_inode = ext4_raw_inode(&iloc);
---
---
xfs_lookup()
-> xfs_dir_lookup()
-> xfs_iget()
---
// See it ? this is a fs private inode cache and it is more scalable
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
agino = XFS_INO_TO_AGINO(mp, ino);
again:
error = 0;
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
if (ip) {
error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
...
} else {
rcu_read_unlock();
...
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
flags, lock_flags);
...
---
// Allocate xfs_inode where a vfs inode is embedded in
ip = xfs_inode_alloc(mp, ino);
...
} else {
// get the xfs_buf for this inode
error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
// fill the inode with data on disk
error = xfs_inode_from_disk(ip, dip);
xfs_trans_brelse(tp, bp);
}
...
iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
d_mark_dontcache(VFS_I(ip));
xfs_iflags_set(ip, iflags);
/* insert the new inode */
spin_lock(&pag->pag_ici_lock);
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
spin_unlock(&pag->pag_ici_lock);
---
}
xfs_perag_put(pag);
---
Another thing that needs to be discussed is the way the inode cache is reclaimed.
For xfs
iput()
-> iput_final()
---
if (op->drop_inode)
drop = op->drop_inode(inode);
else
drop = generic_drop_inode(inode);
// In generic_drop_inode(), there are 3 conditions,
// - !inode->i_nlink, means the file (all of its hard links) has been unlinked
// - inode_unhashed(), means this inode is not in the inode hash table;
//   for xfs this is true, because it uses an inode cache of its own
// - I_DONTCACHE
if (!drop && (sb->s_flags & SB_ACTIVE)) {
inode_add_lru(inode);
spin_unlock(&inode->i_lock);
return;
}
...
WRITE_ONCE(inode->i_state, state | I_FREEING);
if (!list_empty(&inode->i_lru))
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
evict(inode);
---
evict()
-> destroy_inode()
-> sb ops->destroy_inode()
xfs_fs_destroy_inode()
-> xfs_inactive() // Won't do more because i_nlink is not zero
-> xfs_inode_set_reclaim_tag()
--
xfs_perag_set_reclaim_tag(pag);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
--
This XFS_IRECLAIMABLE will be handled by
xfs_iget_cache_hit()
---
/*
* If IRECLAIMABLE is set, we've torn down the VFS inode already.
* Need to carefully get it back into useable state.
*/
if (ip->i_flags & XFS_IRECLAIMABLE) {
...
}
---
For the normal (generic) case
prune_icache_sb()
-> inode_lru_isolate()
---
/*
* Referenced or dirty inodes are still in use. Give them another pass
* through the LRU as we cannot reclaim them now.
*/
if (atomic_read(&inode->i_count) ||
(inode->i_state & ~I_REFERENCED)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
}
---
prune_dcache_sb()
---
freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
dentry_lru_isolate, &dispose);
shrink_dentry_list(&dispose);
---
dentry_lru_isolate()
---
if (!spin_trylock(&dentry->d_lock))
return LRU_SKIP;
/*
* Referenced dentries are still in use. If they have active
* counts, just remove them from the LRU. Otherwise give them
* another pass through the LRU.
*/
if (dentry->d_lockref.count) {
d_lru_isolate(lru, dentry);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
}
if (dentry->d_flags & DCACHE_REFERENCED) {
dentry->d_flags &= ~DCACHE_REFERENCED;
spin_unlock(&dentry->d_lock);
return LRU_ROTATE;
}
d_lru_shrink_move(lru, dentry, freeable);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
---
shrink_dentry_list()
-> __dentry_kill()
-> __d_drop()
-> ___d_drop() //remove the dentry from the hashtable
-> dentry_free()
-> call_rcu(&dentry->d_u.d_rcu, __d_free); // Nothing will be freed under rcu_read_lock
dget()
---
if (dentry)
lockref_get(&dentry->d_lockref);
return dentry;
---
dput()
---
rcu_read_lock();
if (likely(fast_dput(dentry))) {
rcu_read_unlock();
return;
}
/* Slow case: now with the dentry lock held */
rcu_read_unlock();
if (likely(retain_dentry(dentry))) {
spin_unlock(&dentry->d_lock);
return;
}
dentry = dentry_kill(dentry);
---
retain_dentry()
---
//vfs_unlink() -> __d_drop() could cause this.
if (unlikely(d_unhashed(dentry)))
return false;
...
// Two rounds in lru
dentry->d_lockref.count--;
if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
d_lru_add(dentry);
else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
dentry->d_flags |= DCACHE_REFERENCED;
return true;
---
To prevent dentry from being reclaimed,
(1) hold a reference with dget() (d_lru_isolate() will remove it from lru list in dentry_lru_isolate)
(2) use it and then DCACHE_REFERENCED will always be set
do_filp_open()
---
// Try to use LOOKUP_RCU first, if failed with ECHILD,
// try again w/o LOOKUP_RCU
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(&nd, op, flags)
---
path_openat()
---
const char *s = path_init(nd, flags); // rcu_read_lock is taken here
/home/jianchwa/linux-stable/fs/dcache.c
\____________ _______________/ \__ __/
v v
link_path_walk() open_last_lookups()
while (!(error = link_path_walk(s, nd)) &&
(s = open_last_lookups(nd, file, op)) != NULL)
;
if (!error)
error = do_open(nd, file, op); // do the finally open
terminate_walk(nd);
---
There are 3 points here regarding the fast (RCU) lookup
(1) it only accelerates the dentry cache hit cases;
if not hit, the lookup falls back to non-rcu mode
(2) the dentry won't be freed during this, but it may be killed
(3) a sequence count is used to protect the lookup against rename
Regarding point 3,
vfs_rename()
-> d_move()
-> __d_move()
---
spin_lock_nested(&dentry->d_lock, 2);
spin_lock_nested(&target->d_lock, 3);
...
write_seqcount_begin(&dentry->d_seq);
write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
/* unhash both */
if (!d_unhashed(dentry))
___d_drop(dentry);
if (!d_unhashed(target))
___d_drop(target);
/* ... and switch them in the tree */
dentry->d_parent = target->d_parent;
if (!exchange) {
copy_name(dentry, target);
target->d_hash.pprev = NULL;
dentry->d_parent->d_lockref.count++;
if (dentry != old_parent) /* wasn't IS_ROOT */
WARN_ON(!--old_parent->d_lockref.count);
}
...
list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
__d_rehash(dentry);
...
write_seqcount_end(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
---
__d_lookup_rcu()
---
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
unsigned seq;
seqretry:
/*
* The dentry sequence count protects us from concurrent
* renames, and thus protects parent and name fields.
* Otherwise, we may get a wrong entry, for example,
* during rename /home/will/aaaa to /home/will/bbbb,
* we could get baaa, bbaa, bbba, and if these files
* do exist, look up is screwed up.
*/
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
if (d_unhashed(dentry))
continue;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
...
} else {
if (dentry->d_name.hash_len != hashlen)
continue;
if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
continue;
}
*seqp = seq;
return dentry;
}
---
lookup_fast()
---
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
...
*inode = d_backing_inode(dentry);
if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
return ERR_PTR(-ECHILD);
/*
* This sequence count validates that the parent had no
* changes while we did the lookup of the dentry above.
*/
Why do we need to care about the parent ?
renaming of the parent won't influence the children, right ?
if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
return ERR_PTR(-ECHILD);
*seqp = seq;
status = d_revalidate(dentry, nd->flags);
if (likely(status > 0))
return dentry;
}
---
Before the final open, the sequence count is rechecked,
do_open()
-> complete_walk()
-> unlazy_walk()
-> legitimize_path()
---
if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
path->dentry = NULL;
return false;
}
return !read_seqcount_retry(&path->dentry->d_seq, seq);
---
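The d_seq protection is the classic seqcount retry pattern. A simplified
standalone sketch of the idea is below; it glosses over the data-race
formalities that the kernel handles with its seqcount primitives and barriers:
---
#include <stdatomic.h>
#include <string.h>

/* A single-writer sequence counter protecting a name, mimicking how d_seq
 * guards a dentry's name/parent against a concurrent rename. Readers never
 * block; they retry if a writer was active. */
struct seq_name {
	atomic_uint seq;        /* even: stable, odd: write in progress */
	char name[32];
};

void rename_name(struct seq_name *s, const char *new_name)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* -> odd */
	strncpy(s->name, new_name, sizeof(s->name) - 1);
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* -> even */
}

void lookup_name(struct seq_name *s, char out[32])
{
	unsigned int begin;

	do {
		begin = atomic_load_explicit(&s->seq, memory_order_acquire);
		if (begin & 1)
			continue;               /* writer active, retry */
		memcpy(out, s->name, 32);
		/* retry if a writer started or completed meanwhile */
	} while (atomic_load_explicit(&s->seq, memory_order_acquire) != begin);
}
---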
The main points where the dcache interacts with the filesystem are,
link_path_walk()
-> walk_component()
-> lookup_slow()
-> inode_lock_shared()
-> __lookup_slow()
-> d_alloc_parallel() //allocate a dentry structure
-> inode->i_op->lookup()
look up the dentry in the on-disk metadata;
do d_add() if found, which adds the dentry into the dcache hashtable
open_last_lookups()
-> lookup_open()
-> d_lookup() // do the lookup under the rename read seqlock
-> dir_inode->i_op->create() //do create if needed
When a dentry is in the dcache hash table, namely dentry_hashtable,
__d_lookup() can find it and d_unhashed() returns false.
When is the dentry inserted into the dentry_hashtable ?
When the file does not exist,
lookup_open()
// allocate dentry structure with d_alloc()
-> d_alloc_parallel()
-> dir_inode->i_op->lookup()
ext2_lookup()
-> ext2_inode_by_name() //not found at this moment
-> d_splice_alias() //inode is NULL at this moment
-> dir_inode->i_op->create()
ext2_create()
-> ext2_add_nondir()
-> d_instantiate_new()
-> d_instantiate() //fill inode information for a dentry
When the file exists,
lookup_slow()
-> ext2_lookup()
-> d_splice_alias()
-> __d_add()
-> __d_set_inode_and_type() // install the inode into the dentry
-> __d_rehash()
There are two basic types of disk quotas: limits on disk space (blocks) and
limits on the number of inodes.
The disk quotas can be set for 3 roles: user, group and project.
There are two limits: a soft limit and a hard limit.
The quota data lives in memory at runtime and needs to be persisted on disk.
The on-disk quota entry format is,
struct v2r1_disk_dqblk {
__le32 dqb_id; /* id this quota applies to */
__le32 dqb_pad;
__le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
__le64 dqb_isoftlimit; /* preferred inode limit */
__le64 dqb_curinodes; /* current # allocated inodes */
__le64 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
__le64 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
__le64 dqb_curspace; /* current space occupied (in bytes) */
__le64 dqb_btime; /* time limit for excessive disk use */
__le64 dqb_itime; /* time limit for excessive inode use */
};
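The same numbers can be queried from userspace with quotactl(2). A hedged
sketch, assuming the usual <sys/quota.h> definitions; "/dev/sda1" is an example
device:
---
#include <stdio.h>
#include <sys/quota.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	struct dqblk dq;
	uid_t uid = getuid();

	/* Q_GETQUOTA: read usage and limits of one id on one filesystem */
	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sda1", uid,
		     (caddr_t)&dq) != 0) {
		perror("quotactl");
		return 1;
	}
	/* block limits are in 1KiB quota blocks, curspace is in bytes,
	 * matching the on-disk v2r1_disk_dqblk fields above */
	printf("space used: %llu bytes, block soft/hard: %llu/%llu\n",
	       (unsigned long long)dq.dqb_curspace,
	       (unsigned long long)dq.dqb_bsoftlimit,
	       (unsigned long long)dq.dqb_bhardlimit);
	printf("inodes used: %llu, inode soft/hard: %llu/%llu\n",
	       (unsigned long long)dq.dqb_curinodes,
	       (unsigned long long)dq.dqb_isoftlimit,
	       (unsigned long long)dq.dqb_ihardlimit);
	return 0;
}
---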
dqget()
-> find_dquot()// under dq_list_lock
---
hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash)
if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid))
return dquot;
---
The disk quota data is persisted in reserved inodes of the filesystem.
Look at the ext4,
ext4_enable_quotas()
-> ext4_quota_enable()
---
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
};
qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
iput(qf_inode);
---
The dquot file is formatted as a tree indexed by the quota id (qtree),
|.|.|
/ \
|.|.| |.|.|
/ \ / \
|.|.| ... |.|.|
/ \ / \
|.|.| |.|.|...|.|.| |.|.|
|
|e|e|e|...|
- The block size is 1024
- Every tree block contains 256 Indexes
- The tree depth is fixed at 4 levels, refer to qtree_depth()
- dquot entries are laid out flat in the leaf blocks.
When a leaf block becomes empty, it is linked into a free list.
To sync the in-memory dquot cache to disk,
ext4_sync_fs()
-> dquot_writeback_dquots()
ext4_write_dquot()
-> ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
-> dquot_commit()
-> v2_write_dquot()
->qtree_write_dquot()
->ext4_quota_write
---
handle_t *handle = journal_current_handle();
do {
bh = ext4_bread(handle, inode, blk,
EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
} while (PTR_ERR(bh) == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
err = ext4_journal_get_write_access(handle, bh);
lock_buffer(bh);
memcpy(bh->b_data+offset, data, len);
flush_dcache_page(bh->b_page);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
---
ext4_file_open()
-> dquot_file_open()
-> __dquot_initialize()
---
/* First get references to structures we might need. */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
...
switch (cnt) {
case USRQUOTA:
qid = make_kqid_uid(inode->i_uid);
break;
case GRPQUOTA:
qid = make_kqid_gid(inode->i_gid);
break;
case PRJQUOTA:
rc = inode->i_sb->dq_op->get_projid(inode, &projid);
qid = make_kqid_projid(projid);
break;
}
dquot = dqget(sb, qid);
got[cnt] = dquot;
}
---
ext4_mb_new_blocks()
-> dquot_alloc_block()
-> dquot_alloc_space()
-> dquot_alloc_space_nodirty()
-> __dquot_alloc_space()
-> dquot_add_space()
-> __inode_add_bytes()
-> mark_all_dquot_dirty()
-> mark_dquot_dirty()
ext4_mark_dquot_dirty()
---
if (ext4_is_quota_journalled(sb)) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
} else {
return dquot_mark_dquot_dirty(dquot);
}
---
__ext4_new_inode()
This is invoked after inode is created
-> dquot_alloc_inode()
For a filesystem block size of 4K, the 1st block group is laid out as follows,
|rrrrssss....|ggggggggg|rgrgrgrgrg|dd|ii|tttt|....|
rrrr: reserved 1024 bytes for x86 boot sectors
ssss: in the same fs block with rrrr, filesystem super block
gggg: block group descriptors in a contiguous space
rgrg: reserved group descriptors table for filesystem resize,
dd : data block bitmap
ii : inode block bitmap
tttt: inode table
All of the positions above can be obtained from the block group descriptor.
During .fill_super, all of the buffer_head's of block group descriptors are loaded into memory,
ext4_fill_super()
---
/* Pre-read the descriptors into the buffer cache */
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
ext4_sb_breadahead_unmovable(sb, block);
}
for (i = 0; i < db_count; i++) {
struct buffer_head *bh;
block = descriptor_loc(sb, logical_sb_block, i);
bh = ext4_sb_bread_unmovable(sb, block);
rcu_read_lock();
rcu_dereference(sbi->s_group_desc)[i] = bh;
rcu_read_unlock();
}
sbi->s_gdb_count = db_count;
---
descriptor_loc() is used to calculate the position of block group descriptors,
---
//When meta block group is not enabled, it is very simple
if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
return logical_sb_block + nr + 1;
---
ext4 uses the block group descriptor to get the positions of the block bitmap, inode bitmap
and inode table. For example,
__ext4_get_inode_loc()
---
iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
/*
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
bh = sb_getblk(sb, block);
---
There are mainly 3 data structures here,
struct ext4_extent_header {
__le16 eh_magic; /* probably will support different formats */
__le16 eh_entries; /* number of valid entries */
__le16 eh_max; /* capacity of store in entries*/
__le16 eh_depth; /* has tree real underlying blocks? */
__le32 eh_generation; /* generation of the tree */
};
struct ext4_extent_idx {
__le32 ei_block; /* index covers logical blocks from 'block' */
__le32 ei_leaf_lo; /* pointer to the physical block of the next level. leaf or next index could be there */
__le16 ei_leaf_hi; /* high 16 bits of physical block */
__u16 ei_unused;
};
struct ext4_extent {
__le32 ee_block; /* first logical block extent covers */
__le16 ee_len; /* number of blocks covered by extent */
__le16 ee_start_hi; /* high 16 bits of physical block */
__le32 ee_start_lo; /* low 32 bits of physical block */
};
Note:
- ext4_extent.ee_block is 32bits, so the max file size of ext4 is 2^32 * 2^12 (fs blk) = 2^44 (16TiB)
- ee_start_hi/ee_start_lo is 48bits, so the max block device size of ext4 is 2^48 * 2^12 = 2^60 (1EiB)
- the highest bit of ee_len indicates the unwritten state, refer to ext4_ext_mark_unwritten()
- if eh_depth > 0, ext4_extent_idx follows it, otherwise ext4_extent
- max level of extent btree is 5, (4 * (((2^12 - 12)/12) ^ n) >= 2^32, n = 5)
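The extent mapping built from these structures can be inspected from userspace
with the FIEMAP ioctl; a minimal sketch (up to 32 extents, error handling
trimmed):
---
#include <fcntl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;           /* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC; /* flush delayed allocation first */
	fm->fm_extent_count = 32;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) != 0)
		return 1;
	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu len %llu flags 0x%x\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length, fe->fe_flags);
	}
	close(fd);
	return 0;
}
---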
inode->i_block (inode inlined)
|
[hiiii] eh.eh_max == 4
/ \
[hiiii] [hiiii] 4K - sizeof(eh) / sizeof(ei)
/ \ \
[heeee] [heeee] [heeee]
ei and ee in the nodes are sorted by logical block
[hiii-]
/ \
[hiii-] [hiiii]
/ \ \ ^
[hee-e] [heee-] [heeee] | walk from down to up
ext4_ext_create_new_leaf()
---
/* walk up to the tree and look for where need new index entry */
curp = path + depth;
while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
i--;
curp--;
}
---
[hiiii]______
/ \ \
[hiii-] [hiii] [hii]
/ \ \ \
[hee-e] [heee-] [hee] [hee]
Split entries from the full ee and ei's and then build a new sub tree above.
Then insert it to tree.
Note: the new node [hee] is always 1st entry of new [hii]
Let's look at the code,
ext4_ext_split()
---
//leaf node has been setup in similar way
while (k--) {
oldblock = newblock;
newblock = ablocks[--a];
bh = sb_getblk(inode->i_sb, newblock);
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
neh = ext_block_hdr(bh);
//setup the new node's header
neh->eh_entries = cpu_to_le16(1);
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
neh->eh_generation = 0;
fidx = EXT_FIRST_INDEX(neh);
//insert the 1st one, namely, the newly split child
//logical block is same across all new ei and ee
fidx->ei_block = border;
ext4_idx_store_pblock(fidx, oldblock);
/* start copy indexes */
m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
if (m) {
memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m);
le16_add_cpu(&neh->eh_entries, m);
}
//zero out unused area in the extent block
//Is it necessary ? eh->eh_entries should be enough
ext_size = sizeof(struct ext4_extent_header) + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, inode, bh);
//correct old index
//See it ? only update eh->eh_entries here
if (m) {
err = ext4_ext_get_access(handle, inode, path + i);
le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
err = ext4_ext_dirty(handle, inode, path + i);
}
i--;
}
/* insert new index */
err = ext4_ext_insert_index(handle, inode, path + at,
le32_to_cpu(border), newblock);
---
All of the ei and ee are full, including the top one that's in inode->i_block
[hiiii] copy inode->i_block into new block
/ \ [hiiii]
[hiiii] [hiiii] --\ / \
/ \ \ --/ [hiiii] [hiiii]
[heeee] [heeee] [heeee] / \ \
[heeee] [heeee] [heeee]
inode->i_block [hi] ||
| \/
[hiiii] link the new block to inode->i_block
/ \
[hiiii [hiiii]
/ \ \
[heeee] [heeee] [heeee]
Look at the code,
ext4_ext_grow_indepth()
---
bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
ext_size = sizeof(EXT4_I(inode)->i_data);
/* move top-level index/leaf into new block */
memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
/* zero out unused area in the extent block */
memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
/* set size of new block */
neh = ext_block_hdr(bh);
//if ext_depth(inode) is zero, inode->i_block contain ee.
//then new node carries the initial 4 ee's.
if (ext_depth(inode))
neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
else
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
err = ext4_handle_dirty_metadata(handle, inode, bh);
/* Update top-level index: num,max,pointer */
neh = ext_inode_hdr(inode);
neh->eh_entries = cpu_to_le16(1);
ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
if (neh->eh_depth == 0) {
/* Root extent block becomes index block */
neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block;
}
le16_add_cpu(&neh->eh_depth, 1);
err = ext4_mark_inode_dirty(handle, inode);
---
When do we need to remove blocks ?
ext4_ext_truncate()
ext4_collapse_range()
ext4_punch_hole()
[hii] 3
[hii] [hii] 2
[hii] [hii] [hii] [hii] 1
[hee] [hee] [hee] [hee] [hee] [hee] [hee] [hee] 0
<--- rm direction
||
\/
[hii] 3
[hii] [hii] 2
[hii] [hi] [h] [h] 1
[hee] [hee] [hee] 0
||
\/
[hii] 3
[hii] [h] 2
[hii] [hi] 1
[hee] [hee] [hee] 0
||
\/
[hi] 3
[hii] 2
[hii] [hi] 1
[hee] [hee] [hee] 0
The code is very complicated, so we don't show all of it here, but just some
important part.
(1) continue or next level
[hii] 3
[hii] [hii] 2
[hii] [hii] [hii] [hii] 1
[hee] [hee] [hee] [hee] [hee] [hee] [hee] [hee] 0
ext4_ext_rm_leaf() can only handle one node, namely one [hee]; when do we move to
the next one (the one to its left, following the rm direction) ?
ext4_ext_remove_space()
---
while (i >= 0 && err == 0) {
if (i == depth) {
/* this is leaf block */
err = ext4_ext_rm_leaf(handle, inode, path,
&partial, start, end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
i--;
continue;
}
if (!path[i].p_idx) {
} else {
/* we were already here, see at next index */
path[i].p_idx--;
}
// determine based on path->p_idx ('now') and EXT_FIRST_INDEX(path->p_hdr) ('first'):
// if 'now' still points at or after 'first', the previous entry still needs
// to be removed; once 'now' has moved before 'first', there is nothing more
// to remove at this level
if (ext4_ext_more_to_rm(path + i)) {
struct buffer_head *bh;
//Read in the ee or ei block
bh = read_extent_tree_block(inode, ext4_idx_pblock(path[i].p_idx), depth - i - 1, EXT4_EX_NOCACHE);
i++;
} else {
}
}
---
(2) revoke ?
When freeing a metadata block, we need to revoke it,
ext4_remove_blocks()
---
flags = get_default_free_blocks_flags(inode);
---
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
---
...
flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
---
ext4_ext_rm_idx()
---
ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
---
Delayed allocation, where the filesystem defers the allocation of blocks on disk for data
written by applications until that data is actually written to disk. The idea is to wait
until the application finishes its operations on the file, then allocate the actual number
of data blocks needed on the disk at once. This optimization limits unneeded operations
related to short-lived, small files, batches large writes, and helps ensure that data space
is allocated contiguously.
generic_perform_write()
---
do {
...
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
// 'i' here is iov_iter which contains the user buffer
// write_begin callback return one page every time.
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
...
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
---
ext4_da_write_begin()
-> __block_write_begin(page, pos, len, ext4_da_get_block_prep);
-> ext4_da_get_block_prep()
-> ext4_da_map_blocks()
-> ext4_insert_delayed_block() //ext4_ext_map_blocks() returns 0
-> ext4_da_reserve_space() //Reserve space in memory
-> ext4_es_insert_delayed_block()
extent status rb tree maintains extent status including,
//These flags live in the high bits of extent_status.es_pblk
enum {
ES_WRITTEN_B,
ES_UNWRITTEN_B,
ES_DELAYED_B,
ES_HOLE_B,
ES_REFERENCED_B,
ES_FLAGS
};
In ext4_writepages(), we first need to get the extent in which pages have same
state,
- Delayed, pages with delayed allocation,
- Unwritten, fallocate
- Mapped, pages with Written state
- Dirty, has dirty data in cache
This is done in mpage_prepare_extent_to_map(): it collects the pages on
demand, checks their state and gathers contiguous pages with the same state.
mpage_prepare_extent_to_map()
-> mpage_process_page_bufs()
-> mpage_add_bh_to_extent()
---
if (map->m_len == 0) {
/* We cannot map unless handle is started... */
if (!mpd->do_map)
return false;
map->m_lblk = lblk;
map->m_len = 1;
// BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))
map->m_flags = bh->b_state & BH_FLAGS;
return true;
}
/* Don't go larger than mballoc is willing to allocate */
// MAX_WRITEPAGES_EXTENT_LEN is 2048 (2K * 4K = 8M)
if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
return false;
/* Can we merge the block to our big extent? */
if (lblk == map->m_lblk + map->m_len &&
(bh->b_state & BH_FLAGS) == map->m_flags) {
map->m_len++;
return true;
}
return false;
---
ext4_writepages()
-> mpage_map_and_submit_extent()
-> mpage_map_one_extent()
-> ext4_map_blocks() // do the real allocation here
-> mpage_map_and_submit_buffers()
-> mpage_process_page() // update the bhs' state
-> mpage_submit_page() // submit the pages for IO
ext4_truncate()
-> ext4_ext_truncate()
-> ext4_es_remove_extent()
---
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end, &reserved);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_da_release_space(inode, reserved);
---
There seems to be no special optimization related to delayed allocation here.
In Linux, the ext2, ext3, ext4, JFS, Squashfs, Yaffs2, ReiserFS, Reiser4, XFS, Btrfs,
OrangeFS, Lustre, OCFS2 1.6, ZFS, and F2FS[11] filesystems support extended attributes
(abbreviated xattr) when enabled in the kernel configuration. Any regular file or directory
may have extended attributes consisting of a name and associated data. The name must be a
null-terminated string prefixed by a namespace identifier and a dot character. Currently,
four namespaces exist: user, trusted, security and system. The user namespace has no
restrictions with regard to naming or contents. The system namespace is primarily used by
the kernel for access control lists. The security namespace is used by SELinux, for example.
The Linux kernel allows extended attribute to have names of up to 255 bytes and values of
up to 64 KiB,[15] as do XFS and ReiserFS, but ext2/3/4 and btrfs impose much smaller limits,
requiring all the attributes (names and values) of one file to fit in one "filesystem block"
(usually 4 KiB). Per POSIX.1e,[citation needed] the names are required to start with one of
security, system, trusted, and user plus a period. This defines the four namespaces of
extended attributes.[16]
Extended attributes can be accessed and modified using the getfattr and setfattr commands
from the attr package on most distributions.[17] The APIs are called getxattr and setxattr.
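A minimal round trip through those syscalls ("user.comment" is just an example
attribute name):
---
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char value[256];
	ssize_t len;

	if (argc < 2)
		return 1;
	/* names are "<namespace>.<name>"; user.* has no restrictions */
	if (setxattr(argv[1], "user.comment", "hello", 5, 0) != 0) {
		perror("setxattr");
		return 1;
	}
	len = getxattr(argv[1], "user.comment", value, sizeof(value));
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	printf("user.comment = %.*s\n", (int)len, value);
	return 0;
}
---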
Where is the xattr of ext4 stored ? There are 2 places.
Ext4 has a 256-byte on-disk inode size by default (configurable at mkfs time), but the
ext4_inode structure only uses 160 bytes. The remaining space is used for storing xattrs.
|iiiiiiixxxxx|iiiiiiixxxxx|iiiiiiixxxxx|iiiiiiixxxxx|
i: inode on disk
x: xattrs
ext4_xattr_ibody_get()
---
error = ext4_get_inode_loc(inode, &iloc);
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
// IHDR expands to ((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE (= 128) + EXT4_I(inode)->i_extra_isize)
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
entry = IFIRST(header);
error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
---
ext4_xattr_block_get()
---
//one bh is enough as its size is one filesystem block
bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
entry = BFIRST(bh);
end = bh->b_data + bh->b_size;
error = xattr_find_entry(inode, &entry, end, name_index, name, 1);
---
If value of an attribute is greater than 2048 bytes the value is not
saved in the external EA block, instead it is saved in an inode. The EA
entry saves the inode number in e_value_inum field (earlier this was
e_value_block that was unused). The maximum size of the EA is limited to
64K due to VFS limitations as can be seen in linux/limits.h. A new
EXT4_FEATURE_INCOMPAT_EA_INODE feature has been added for this.
ext4_xattr_entry.e_value_inum
The inode where the value is stored. Zero indicates the value is in the
same block as this entry. This field is only used if the INCOMPAT_EA_INODE
feature is enabled.
Note ! this kind of xattr entry can be stored in both inode body and external block.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ext4_xattr_ibody_get()/ext4_xattr_block_get()
---
if (entry->e_value_inum) {
error = ext4_xattr_inode_get(inode, entry, buffer,
size);
} else {
u16 offset = le16_to_cpu(entry->e_value_offs);
void *p = bh->b_data + offset;
memcpy(buffer, p, size);
}
---
First let's look at some basic data structures.
The directory file can be formatted as follows,
struct ext4_dir_entry_2 {
__le32 inode; /* Inode number */
__le16 rec_len; /* Directory entry length */
__u8 name_len; /* Name length */
__u8 file_type; /* See file type macros EXT4_FT_* below */
char name[EXT4_NAME_LEN]; /* File name */
};
EXT4_NAME_LEN is 255.
The reason why we still need rec_len when we have name_len,
foo doo eoo aoo
|-----|--------|-----|------|
after unlink eoo
foo doo aoo
|-----|--------------|------|
\______ _______/
v
doo.rec_len
The rec_len is used to jump to the next dentry.
Refer to the code of ext4_search_dir()
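A toy walk over such a directory block, with a simplified copy of
ext4_dir_entry_2, shows how rec_len chains the entries (an illustration only,
not the kernel's parser; it ignores endianness and checksum tails):
---
#include <stdint.h>
#include <stdio.h>

/* simplified ext4_dir_entry_2; on disk the fields are little-endian */
struct toy_dirent {
	uint32_t inode;     /* 0 means the slot is unused (e.g. after unlink) */
	uint16_t rec_len;   /* distance to the next entry */
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

void walk_dir_block(const char *block, unsigned int block_size)
{
	unsigned int off = 0;

	while (off < block_size) {
		const struct toy_dirent *de =
			(const struct toy_dirent *)(block + off);

		if (de->rec_len == 0)   /* corrupted block, stop */
			break;
		if (de->inode != 0)
			printf("ino %u name %.*s\n",
			       de->inode, de->name_len, de->name);
		off += de->rec_len;     /* rec_len also jumps over deleted entries */
	}
}
---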
By default, the directory file is a list of ext4_dir_entry_2 linked by rec_len.
The lookup method is to search for a dentry in this list, which is very inefficient.
To improve this, the hashed directory index (htree) is introduced.
The index blocks use struct dx_root and struct dx_node,
struct dx_root
{
struct fake_dirent dot; // for '.', rec_len = 12
char dot_name[4];
struct fake_dirent dotdot; // for '..', block_size - 12
char dotdot_name[4];
struct dx_root_info
{
__le32 reserved_zero;
u8 hash_version;
u8 info_length; /* 8 */
u8 indirect_levels;
u8 unused_flags;
}
info;
struct dx_entry entries[];
};
struct dx_node
{
struct fake_dirent fake; //rec_len = block size
struct dx_entry entries[];
};
For backwards read-only compatibility with ext2, this tree is actually
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
hidden inside the directory file; the fake_dirent masquerades the index blocks as
"empty" directory data blocks, so ext2 can still read the leaf blocks.
struct dx_entry
{
__le32 hash;
__le32 block;
};
The first dx_entry of both root and node is special,
__le16 limit Maximum number of dx_entries that can follow this header,
plus 1 for the header itself.
__le16 count Actual number of dx_entries that follow this header,
plus 1 for the header itself.
__le32 block The block number (within the directory file) that goes
with the lowest hash value of this block. This value is
stored in the parent block. For the root, this hash value
is 0.
R: dx_root N: dx_node E: dx_entry D: ext4_dir_entry_2
{D/D/D/} {D/D/D/D} {D/D/D/D} {D/D/D/D}
or
{R E/E/E/E/E } \
... / \ ... |
{N E/E/E/E} {N E/E/E/E} > index
/ \ ... |
{N E/E/E/E} {N E/E/E/E} /
/ \ ...
{D/D/D/D} {D/D/D/D} } dentry
(1) The E in index is sorted by hash
(2) The D in a dentry block form a list linked by rec_len
Obviously, the index tree is introduced to improve lookup performance.
Regarding the dentry hash tree, there are the following points,
Look for the dx_entry of the leaf (a block filled by dentry)
dx_probe()
---
entries = (struct dx_entry *)(((char *)&root->info) +
root->info.info_length);
while (1) {
count = dx_get_count(entries);
p = entries + 1;
q = entries + count - 1;
while (p <= q) {
m = p + (q - p) / 2;
if (dx_get_hash(m) > hash)
q = m - 1;
else
p = m + 1;
}
at = p - 1;
frame->entries = entries;
frame->at = at;
if (!indirect--)
return frame;
frame++;
frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
entries = ((struct dx_node *) frame->bh->b_data)->entries;
}
---
When a block is full, we need to split it into two
[] -\ []
/ -/ / \
[xxxxxx] [xxx] [xxx]
- index dx_node: split the dx_entries in half, as a dx_entry's size is fixed,
refer to the code of ext4_dx_add_entry()
- leaf block: split by the accumulated size of the dentries, as dentry sizes differ,
refer to the code of do_split()
Note, before splitting a leaf block, we need to check whether the index dx_node has
enough space; if not, split the index first.
When there is no space left in the index tree, we have to add a level
[yyyyyy]1 [z]1 [z]
/ -\ / -\ / \
[xxx] [xxx] -/ [yyyyyy]2 -/ [yyyy] [yyy]
/ / \
[xxx] [xxx] [xxx] [xxx]
The ext4 index tree has at most 2 levels, or 3 when the largedir feature is enabled.
Note, when adding a new level, the dx_entries in block 1, "yyyyyy", are copied to the new
block 2, and then a dx_entry is added into block 1 to point to block 2.
This policy also works when the level is already 2, because the root
block must stay at logical blk 0 of the directory file.
ext4_inode.i_block[EXT4_N_BLOCKS] //__le32
#define EXT4_NDIR_BLOCKS 12
#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS
#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1)
#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1)
#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1)
60 bytes per inode
__ext4_new_inode()
---
ei->i_inline_off = 0;
if (ext4_has_feature_inline_data(sb) &&
(!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
---
ext4_init_new_dir()
---
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
err = ext4_try_create_inline_dir(handle, dir, inode);
...
}
---
inode inline block index
+---+ +----+ +----+
| | -\ |dddd| -\ | |
|ddd| } dentry -/ |dddd| -/ +----+
+---+ [1] +----+ [2] / \
+----+ +----+
|dddd| |dddd|
+----+ +----+
[1] inline to block
ext4_try_add_inline_entry()
-> ext4_convert_inline_data_nolock()
---
map.m_lblk = 0;
map.m_len = 1;
map.m_flags = 0;
error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
...
data_bh = sb_getblk(inode->i_sb, map.m_pblk);
...
lock_buffer(data_bh);
error = ext4_journal_get_create_access(handle, data_bh);
memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
if (!S_ISDIR(inode->i_mode)) {
// The interesting thing is both the inode and data block need
// to be recorded in log. Because they need to be completed in
// one transaction
memcpy(data_bh->b_data, buf, inline_size);
set_buffer_uptodate(data_bh);
error = ext4_handle_dirty_metadata(handle,
inode, data_bh);
} else {
// There are 3 steps here
// (1) create '.' and '..' dentry
// (2) copy the inlined dentries
// (3) set the tail dentry of which rec_len covers the whole block
error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
buf, inline_size);
}
unlock_buffer(data_bh);
---
[2] block to index
ext4_add_entry()
-> make_indexed_dir()
---
/* The 0th block becomes the root, move the dirents out */
fde = &root->dotdot;
de = (struct ext4_dir_entry_2 *)((char *)fde +
ext4_rec_len_from_disk(fde->rec_len, blocksize));
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
bh2 = ext4_append(handle, dir, &block);
ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data2 = bh2->b_data;
memcpy(data2, de, len);
memset(de, 0, len); /* wipe old data */
//Get the last dentry and make its rec_len cover the whole block
de = (struct ext4_dir_entry_2 *) data2;
top = data2 + len;
while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
de = de2;
de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
(char *) de, blocksize);
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
de->rec_len = ext4_rec_len_to_disk(
blocksize - ext4_dir_rec_len(2, NULL), blocksize);
memset (&root->info, 0, sizeof(root->info));
root->info.info_length = sizeof(root->info);
if (ext4_hash_in_dirent(dir))
root->info.hash_version = DX_HASH_SIPHASH;
else
root->info.hash_version =
EXT4_SB(dir->i_sb)->s_def_hash_version;
entries = root->entries;
// The first leaf block's offset is 1
dx_set_block(entries, 1);
dx_set_count(entries, 1);
dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
...
retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
// Split the leaf block as it is filled up
de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2);
---
Ext4 mballoc maintains a buddy system for every block group in memory.
Every block group needs two blocks (4K each), whose format is as follows,
O0 O1 O2 O3
|--------------------------|-------------|------|--|..|
\____________ ____________/ \____________ ____________/
v v
block 0 block 1
bd_bitmap bd_buddy
O0: bitmap for order 0
O1: bitmap for order 1
...
A clear bit at position 'bit' within the region of a given order (starting at offset 'off') represents the free extent < (bit - off) << order, 1 << order >
The region for every order is defined in ext4_sb_info.s_mb_offsets/s_mb_maxs,
which are calculated in ext4_mb_init().
mb_find_buddy() helps to find the bitmap address and range (max)
---
/* at order 0 we see each particular block */
if (order == 0) {
*max = 1 << (e4b->bd_blkbits + 3);
return e4b->bd_bitmap;
}
bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
---
How does this buddy system work ?
What information is stored in the buddy (apart from the order-0 bitmap)?
The free and contiguous block ranges of every order.
For example, if the whole 128M block group is free, then one bit in the highest
order bitmap is cleared and all of the others are set.
If you have free chunks of 64M, 32M, 16M and 8M, then there will be a single bit
cleared in the bitmap of each of those orders; all of the others are set.
The buddy cache is initialized when load buddy,
ext4_mb_load_buddy_gfp()
-> ext4_mb_init_group()
-> ext4_mb_init_cache()
-> memset(data, 0xff, blocksize);//See it ? default state is set
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-> ext4_mb_generate_buddy()
---
i = mb_find_next_zero_bit(bitmap, max, 0);
grp->bb_first_free = i;
while (i < max) {
fragments++;
first = i;
i = mb_find_next_bit(bitmap, max, i);
len = i - first;
free += len;
if (len > 1)
ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
else
grp->bb_counters[0]++;
if (i < max)
i = mb_find_next_zero_bit(bitmap, max, i);
}
---
Find the freed and contiguous blocks and mark them free in the buddy
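The way a free extent lands in the per-order bitmaps can be modelled in a few
lines: split [first, first+len) into the largest aligned power-of-two chunks,
one bit per chunk. This mirrors the idea of ext4_mb_mark_free_simple(), not its
exact code:
---
#include <stdio.h>

/* Print which (order, bit) slots a free extent occupies in a buddy
 * bitmap. A bit at index i in the order-N bitmap stands for blocks
 * [i << N, (i + 1) << N). */
static void mark_free(unsigned int first, unsigned int len,
		      unsigned int max_order)
{
	while (len) {
		unsigned int order = 0;

		/* grow the order while it stays aligned at 'first' and
		 * fully contained in the remaining length */
		while (order < max_order &&
		       (first & ((1u << (order + 1)) - 1)) == 0 &&
		       (1u << (order + 1)) <= len)
			order++;

		printf("order %u, bit %u (blocks %u..%u)\n",
		       order, first >> order,
		       first, first + (1u << order) - 1);
		first += 1u << order;
		len -= 1u << order;
	}
}

int main(void)
{
	mark_free(6, 13, 13);   /* a 13-block free extent starting at block 6 */
	return 0;
}
---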
Find by goal
By searching order 0, we can know whether the block has been allocated.
By searching order 1 and larger ones, we can know the max contiguous blocks that
contains the goal block.
A general function we need to know is,
mb_find_order_for_block()
---
while (order <= e4b->bd_blkbits + 1) {
bb = mb_find_buddy(e4b, order, &max);
if (!mb_test_bit(block >> order, bb)) {
/* this block is part of buddy of order 'order' */
return order;
}
//check in ascending order
order++;
}
return 0;
---
mb_find_extent() returns the range that contains the goal block and required
length. It can carry multiple buddies of different size,
goal required len
| ______^_________
v/ \
|[x][x][x][x][x][x][x][-][-][-][-][-][-][-][-][-]|
\___ ____/ \_ __/ | | \___________ __________/
v v v v v
O2 O1 O0 O0 O3
[x] : allocated
[-] : freed
This is done by mb_mark_used(), every order of buddies that's involved will be set,
mb_mark_used()
---
/* let's maintain buddy itself */
while (len) {
//Note, it return the order that has free blocks
ord = mb_find_order_for_block(e4b, start);
if (((start >> ord) << ord) == start && len >= (1 << ord)) {
/* the whole chunk may be allocated at once! */
mlen = 1 << ord;
buddy = mb_find_buddy(e4b, ord, &max);
mb_set_bit(start >> ord, buddy);
start += mlen;
len -= mlen;
continue;
}
/* we have to split large buddy */
// split by descending order
// mb_find_order_for_block() in next loop won't return this
// order any more, as it has been set
buddy = mb_find_buddy(e4b, ord, &max);
mb_set_bit(start >> ord, buddy);
e4b->bd_info->bb_counters[ord]--;
ord--;
cur = (start >> ord) & ~1U;
buddy = mb_find_buddy(e4b, ord, &max);
mb_clear_bit(cur, buddy);
mb_clear_bit(cur + 1, buddy);
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
//Set the bits on the bitmap of order 0
ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
---
The code works as following,
(1)
off len
| ______^_________
v/ \
|[x][x][x][x][x][x][x][-][-][-][-][-][-][-][-][-]|
\___ ____/ \_ __/ | | \___________ __________/
v v v v v
O2 O1 O0 O0 O3
mb_find_order_for_block() returns order 0, no need to split, the offset is pushed forward
(2)
off len
| ____^_____
v/ \
|[x][x][x][x][x][x][x][x][-][-][-][-][-][-][-][-]|
\___ ____/ \_ __/ | | \___________ __________/
v v v v v
O2 O1 O0 O0 O3
mb_find_order_for_block() returns order 3, we need to split.
The bit of off in order 3 is set, so in the next loop
mb_find_order_for_block() can only return order 2
(3)
off len
| ____^_____
v/ \
|[x][x][x][x][x][x][x][x][-][-][-][-][-][-][-][-]|
\___ ____/ \_ __/ | | \____ ____/ \____ ____/
v v v v v v
O2 O1 O0 O0 O2 O2
mb_find_order_for_block() returns order 2, no need to split, off is pushed forward
(4)
off
|
v
|[x][x][x][x][x][x][x][x][x][x][x][x][-][-][-][-]|
\___ ____/ \_ __/ | | \____ ____/ \____ ____/
v v v v v v
O2 O1 O0 O0 O2 O2
mb_find_order_for_block() returns order 2, we need to split.
The bit of off in order 2 is set, so mb_find_order_for_block() will return
order 1 in the next loop.
(5)
off
|
v
|[x][x][x][x][x][x][x][x][x][x][x][x][-][-][-][-]|
\___ ____/ \_ __/ | | \____ ____/ \_ _/ \_ _/
v v v v v v v
O2 O1 O0 O0 O2 O1 O1
mb_find_order_for_block() returns order 1, we need to split again.
The bit of off in order 1 is set, so mb_find_order_for_block() will return
order 0 in the next loop.
(6)
off
|
v
|[x][x][x][x][x][x][x][x][x][x][x][x][-][-][-][-]|
\___ ____/ \_ __/ | | \____ ____/ | | \_ _/
v v v v v v v v
O2 O1 O0 O0 O2 O0 O0 O1
mb_find_order_for_block() returns order 0, no need to split, off is pushed forward
Too lazy to read the code here. To be continued later
Allocate
ext4_mb_new_blocks()
---
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
...
// do allocation in mb
*errp = ext4_mb_regular_allocator(ac);
...
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
}
---
Free
ext4_free_blocks()
---
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
err = ext4_journal_get_write_access(handle, bitmap_bh);
err = ext4_journal_get_write_access(handle, gd_bh);
...
err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (ext4_handle_valid(handle) &&
((flags & EXT4_FREE_BLOCKS_METADATA) ||
!ext4_should_writeback_data(inode))) {
...
ext4_lock_group(sb, block_group);
//Set in the bitmap on disk
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
}
ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
ext4_free_group_clusters_set(sb, gdp, ret);
ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
...
ext4_mb_unload_buddy(&e4b);
...
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
---
What is ext4 preallocation for ?
The main goal is to provide better allocation for small and large files.
This is achieved by using a different strategy for different allocation
requests. For a relatively small allocation request, Ext4 tries to allocate
from a per-CPU locality group, which is shared by all allocations under
the same CPU, in order to try to keep these small files close to each other.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A large allocation request is allocated from per-file preallocation first.
There are two steps,
(1) decide the goal allocation length
ext4_mb_normalize_request()
---
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
ext4_mb_normalize_group_request(ac);
return ;
}
// The code that decides the length of the per-file preallocation is a bit long,
// so it is not shared here. The basic principle is to take the first size below
// that is larger than what you want:
// 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M
---
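As a rough illustration of that principle only (the real ext4_mb_normalize_request()
also normalizes the start offset and handles plenty of corner cases), a user-space
sketch of picking the first preallocation size that is large enough:
---
#include <stdio.h>

static const unsigned long prealloc_kb[] = {
        16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192
};

static unsigned long normalize_kb(unsigned long want_kb)
{
        for (size_t i = 0; i < sizeof(prealloc_kb) / sizeof(prealloc_kb[0]); i++)
                if (want_kb <= prealloc_kb[i])
                        return prealloc_kb[i];
        return want_kb;                  /* larger than 8M: no rounding in this sketch */
}

int main(void)
{
        unsigned long want[] = { 4, 20, 100, 5000 };

        for (int i = 0; i < 4; i++)
                printf("want %lukB -> goal %lukB\n", want[i], normalize_kb(want[i]));
        return 0;
}
---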
(2) allocate from mb
ext4_mb_regular_allocator()
---
/*
* ac->ac_2order is set only if the fe_len is a power of 2
* if ac->ac_2order is set we also set criteria to 0 so that we
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* try exact allocation using buddy.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
*/
i = fls(ac->ac_g_ex.fe_len);
ac->ac_2order = 0;
/*
* We search using buddy data only if the order of the request
* is greater than equal to the sbi_s_mb_order2_reqs
* You can tune it via /sys/fs/ext4/
*/
...
---
If EXT4_MB_HINT_GROUP_ALLOC is set, the per-CPU locality group allocation is employed; the decision is made in
ext4_mb_group_or_file()
---
size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
if ((size == isize) && !ext4_fs_is_busy(sbi) &&
!inode_is_open_for_write(ac->ac_inode)) {
ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
// /sys/fs/ext4/xxx/mb_group_prealloc
if (sbi->s_mb_group_prealloc <= 0) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
}
/* don't use group allocation for large files */
// The block allocation happens when .writepages. Writing has been
// aggregated through delayed-allocation.
size = max(size, isize);
if (size > sbi->s_mb_stream_request) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
}
/*
* locality group prealloc space are per cpu. The reason for having
* per cpu locality group is to reduce the contention between block
* request from multiple CPUs.
*/
ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
/* we're going to use group allocation */
ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
/* serialize all allocations in the group */
mutex_lock(&ac->ac_lg->lg_mutex);
---
Preallocation is only in-core, not on-disk.
When the allocation completes, only the space actually used by the file is flushed to disk,
ext4_mb_new_blocks()
-> ext4_mb_mark_diskspace_used()
---
ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
---
Don't worry, this is not the raw extent found by ext4_mb_use_best_found();
it has already been adapted to the preallocated range,
ext4_mb_use_best_found()
-> ext4_mb_new_preallocation()
-> ext4_mb_new_inode_pa()
-> ext4_mb_use_inode_pa()
---
/* found preallocated blocks, use them */
start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
len = EXT4_NUM_B2C(sbi, end - start);
ac->ac_b_ex.fe_len = len;
ac->ac_status = AC_STATUS_FOUND;
ac->ac_pa = pa;
---
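To make the arithmetic above concrete, here is a worked example with made-up numbers
and the cluster conversions (EXT4_C2B/EXT4_NUM_B2C) dropped: a preallocation maps the
logical range [pa_lstart, pa_lstart + pa_len) onto the physical range starting at
pa_pstart, and the request is simply carved out of it.
---
#include <stdio.h>

int main(void)
{
        unsigned long pa_pstart = 100000;  /* physical start of the PA (made up) */
        unsigned long pa_lstart = 2048;    /* logical  start of the PA           */
        unsigned long pa_len    = 512;     /* PA length in blocks                */

        unsigned long req_logical = 2100;  /* stands in for ac_o_ex.fe_logical   */
        unsigned long req_len     = 64;    /* stands in for ac_o_ex.fe_len       */

        unsigned long start = pa_pstart + (req_logical - pa_lstart);
        unsigned long end   = pa_pstart + pa_len;

        if (end > start + req_len)
                end = start + req_len;

        printf("physical start %lu, len %lu\n", start, end - start);
        /* -> physical start 100052, len 64 */
        return 0;
}
---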
Everything in tmpfs is temporary in the sense that no files will be
created on your hard drive. The files live in memory and swap
space. If you unmount a tmpfs instance, everything stored therein is
lost.
A tmpfs file is just an in-core inode with its own set of operations;
see shmem_get_inode().
An important thing is that the dentry of this inode will be pinned in memory,
shmem_mknod()
---
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
...
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
}
---
The dentry for the inode will be preserved in the dcache until it is deleted.
An in-core dentry points to the linked inode
shmem_link()
---
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
---
Let's take some relatively simple examples,
iclog ring and state machine
.-->
/ ---
/ \
| iclog |
\ /
--- /
<--'' -->
|------------------------------------------------|
physical log space
The iclog has two parameters,
l_iclog_bufs 8
l_iclog_size 32K (max 256K)
iclog is allocated in xlog_alloc_log()
---
for (i = 0; i < log->l_iclog_bufs; i++) {
int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
sizeof(struct bio_vec);
iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
...
iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
KM_MAYFAIL | KM_ZERO);
...
iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
iclog->ic_state = XLOG_STATE_ACTIVE;
...
// link all of the iclog in a ring
iclogp = &iclog->ic_next;
}
---
The iclogs in the ring are employed one by one,
and every iclog has a state machine that works as follows,
XLOG_STATE_ACTIVE Be able to receive log
- xlog_alloc_log()
- xlog_state_do_callback()
-> xlog_state_clean_iclog()
-> xlog_state_activate_iclogs()
-> xlog_state_activate_iclog()
XLOG_STATE_WANT_SYNC
- xlog_state_switch_iclogs()
XLOG_STATE_SYNCING
- xlog_state_release_iclog()
-> __xlog_state_release_iclog()
-> xlog_sync()
XLOG_STATE_DONE_SYNC
- xlog_ioend_work()
-> xlog_state_done_syncing()
XLOG_STATE_CALLBACK
- xlog_state_done_syncing()
-> xlog_state_do_callback()
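A toy model of the ring may help picture this (purely illustrative: the structures,
sizes and the way records are split across iclogs are all simplified): records are
copied into the current ACTIVE iclog, and when it fills up it is switched to
WANT_SYNC while the next iclog in the ring takes over.
---
#include <stdio.h>

enum state { ACTIVE, WANT_SYNC, SYNCING, DONE_SYNC, CALLBACK };

struct iclog {
        enum state state;
        int space;                        /* bytes left, stands in for ic_size */
};

int main(void)
{
        struct iclog ring[8];
        for (int i = 0; i < 8; i++)
                ring[i] = (struct iclog){ ACTIVE, 32 * 1024 };

        int cur = 0;
        int records[] = { 20000, 20000, 9000, 40000 };   /* made-up record sizes */

        for (int r = 0; r < 4; r++) {
                int len = records[r];
                while (len > 0) {
                        struct iclog *ic = &ring[cur];
                        int copy = len < ic->space ? len : ic->space;

                        ic->space -= copy;
                        len -= copy;
                        if (ic->space == 0) {
                                /* like xlog_state_switch_iclogs(): hand off to next;
                                 * SYNCING/DONE_SYNC/CALLBACK/ACTIVE would follow at
                                 * IO submit/complete time */
                                ic->state = WANT_SYNC;
                                printf("iclog %d full -> WANT_SYNC\n", cur);
                                cur = (cur + 1) % 8;
                        }
                }
        }
        printf("current iclog: %d, space left: %d\n", cur, ring[cur].space);
        return 0;
}
---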
XLOG_REG_TYPE_TRANSHDR XLOG_REG_TYPE_COMMIT
| |
< oph >< oph >< reg0 >< oph >< reg1 >< oph >...< regn >
| |
XLOG_START_TRANS XLOG_REG_TYPE_ICORE
The transaction above is carried by an iclog when it is sent out to disk.
Note !!! The iclog is only used when issuing IO; the log item data is carried in another place (the CIL shadow buffers, see below).
xlog_write()
---
if (copy_len > 0) {
// The reg here is a xfs_log_iovec
// It will copy the data from xfs_log_iovec to iclog buffer
memcpy(ptr, reg->i_addr + copy_off, copy_len);
xlog_write_adv_cnt(&ptr, &len, &log_offset,
copy_len);
}
---
The buffer that carries the log data is allocated here,
Refer to xlog_cil_alloc_shadow_bufs() / xlog_cil_iovec_space()
The buffer of an iclog contains a header and a payload; the header's layout is
xlog_rec_header_t.
Every iclog carries a crc,
xlog_sync()
---
/* calculcate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
iclog->ic_datap, size)
---
In theory, once the commit record IO has completed, xlog guarantees that the whole
transaction is on disk. How is this implemented ?
There are two points here:
Quote from https://www.infradead.org/~mchehab/kernel_docs/filesystems/xfs-delayed-logging-design.html
XFS allows multiple separate modifications to a single object to be carried in the log at any given time.
This allows the log to avoid needing to flush each change to disk before recording a new change to the object.
XFS does this via a method called “re-logging”. Conceptually, this is quite simple - all it requires is
that any new change to the object is recorded with a new copy of all the existing changes in the new
transaction that is written to the log.
Regarding the underlined comment, we can refer to the implementation of jbd2.
jbd2 can be regarded as a block-level WAL: before flushing dirty blocks to their real
position on disk, jbd2 records them in the journal first. In the common case, jbd2
shadows the original buffer_head to do the journal IO.
jbd2_journal_write_metadata_buffer()
---
spin_lock(&jh_in->b_state_lock);
repeat:
if (jh_in->b_frozen_data) {
...
} else {
new_page = jh2bh(jh_in)->b_page;
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
}
...
set_bh_page(new_bh, new_page, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
new_bh->b_private = bh_in;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);
*bh_out = new_bh;
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
spin_unlock(&jh_in->b_state_lock); //Protect this journal buffer head
---
do_get_write_access()
---
spin_lock(&jh->b_state_lock);
...
if (buffer_shadow(bh)) {
spin_unlock(&jh->b_state_lock);
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
goto repeat;
}
...
---
A very important thing to note is that the modification has already been made in the
buffer_head, which is the in-memory cache of the on-disk block.
Deferred Operations is a very bad name and could mislead readers.
IMO, it should be called Big Transaction. Deferred operations,
cooperating with intent logs, can split a complicated transaction into
multiple small transactions while still keeping the atomicity of the original
transaction. Look at the following example:
To complete T, we need multiple operations
T_A, T_B, T_C and T_D, and each of them needs 3 sub-operations.
Deferred operations complete this work as follows,
Intent log for T_A \
Intent log for T_B \ t0
Intent log for T_C /
Intent log for T_D /
-------------------
Done log for T_A \
Real log for T_A0 \ t1
Real log for T_A1 /
Real log for T_A2 /
-------------------
Done log for T_B \
Real log for T_B0 \ t2
Real log for T_B1 /
Real log for T_B2 /
-------------------
Done log for T_C \
Real log for T_C0 \ t4
Real log for T_C1 /
Real log for T_C2 /
-------------------
Done log for T_D \
Real log for T_D0 \ t5
Real log for T_D1 /
Real log for T_D2 /
-------------------
The intent log guarantees the whole big transaction's atomicity: during recovery, an intent record that has no matching done record is replayed.
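A toy model of why this works (not the xfs_defer/xlog_recover code, just the idea):
recovery scans the log, and any intent that has no matching done record gets
re-executed, so either the whole chain eventually completes or none of it did.
---
#include <stdio.h>
#include <string.h>

enum kind { INTENT, DONE };

struct rec {
        enum kind kind;
        const char *op;                  /* e.g. "T_A" */
};

int main(void)
{
        /* say the crash happened after t1: T_A finished, T_B/T_C/T_D did not */
        struct rec log[] = {
                { INTENT, "T_A" }, { INTENT, "T_B" },
                { INTENT, "T_C" }, { INTENT, "T_D" },
                { DONE,   "T_A" },
        };
        int n = sizeof(log) / sizeof(log[0]);

        for (int i = 0; i < n; i++) {
                if (log[i].kind != INTENT)
                        continue;
                int done = 0;
                for (int j = 0; j < n; j++)
                        if (log[j].kind == DONE && !strcmp(log[j].op, log[i].op))
                                done = 1;
                printf("%s: %s\n", log[i].op,
                       done ? "already finished" : "replay during recovery");
        }
        return 0;
}
---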
xfs_defer_finish_noroll() carries out the work,
xfs_defer_finish_noroll()
---
/* Until we run out of pending work to finish... */
while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
//Create intent log for every deferred_operations
xfs_defer_create_intents(*tp);
list_splice_init(&(*tp)->t_dfops, &dop_pending);
//Roll the transaction
error = xfs_defer_trans_roll(tp);
...
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
//Pick a defered operation and finish it
// - create done
// - finish_item do the real work
error = xfs_defer_finish_one(*tp, dfp);
}
---
tail head
cycle=100 cycle=100
| |
v v
|-------xxxxxxxxxxxx------------|
head tail
cycle=101 cycle=100
| |
v v
|xxxxxxx-----------------xxxxxxx|
This is related to XFS_TRANS_PERM_LOG_RES:
this kind of transaction can be rolled multiple times,
such as
#define XFS_SYMLINK_LOG_COUNT 3
#define XFS_REMOVE_LOG_COUNT 2
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT_REFLINK 8
A reflink write transaction can be split into up to 8 sub-transactions.
If one sub-transaction needs T bytes of log space, we need to
(1) reserve 8 * T log space on l_reserve_head up front, and
(2) reserve another T of log space each time the transaction is rolled.
Refer to the code in xfs_trans_reserve()
---
// rolled transaction
if (tp->t_ticket != NULL) {
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
error = xfs_log_regrant(mp, tp->t_ticket);
} else {
error = xfs_log_reserve(mp,
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
permanent);
}
---
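A trivial worked example of that arithmetic, with a made-up per-roll requirement T
(the real numbers come from struct xfs_trans_res: tr_logres and tr_logcount):
---
#include <stdio.h>

int main(void)
{
        int T = 4096;                    /* hypothetical bytes per sub-transaction */
        int logcount = 8;                /* e.g. XFS_WRITE_LOG_COUNT_REFLINK       */

        /* initial xfs_log_reserve(): the whole logcount * logres up front */
        printf("initial reservation: %d bytes\n", logcount * T);

        /* each later xfs_trans_roll() -> xfs_log_regrant() only adds one unit */
        for (int roll = 1; roll < logcount; roll++)
                printf("roll %d: regrant %d bytes\n", roll, T);
        return 0;
}
---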
The log space reserved here is always more than actually needed.
There are two values in the ticket, t_curr_res and t_unit_res.
xlog_cil_alloc_shadow_bufs() adjusts t_curr_res based on the real usage,
and the surplus is given back in
xfs_log_commit_cil()
-> xfs_log_ticket_regrant/ungrant()
The reserved log space itself is given back after the AIL applies the items and pushes the tail lsn forward.
In software,
xfs_cil_ctx represents a transaction.
xfs_log_vec represents a log item and its space in memory. (li_lv and li_lv_shadow)
lsn represents (cycle << 32) | log block number
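A small stand-alone sketch of this lsn layout and its comparison rule (roughly what
the kernel's CYCLE_LSN()/BLOCK_LSN()/XFS_LSN_CMP() helpers do, rewritten here for
illustration): the cycle in the high 32 bits decides first, the block number in the
low 32 bits breaks ties.
---
#include <stdio.h>
#include <stdint.h>

typedef int64_t xfs_lsn_t;

static xfs_lsn_t assign_lsn(uint32_t cycle, uint32_t block)
{
        return ((xfs_lsn_t)cycle << 32) | block;
}

static uint32_t lsn_cycle(xfs_lsn_t lsn) { return (uint32_t)(lsn >> 32); }
static uint32_t lsn_block(xfs_lsn_t lsn) { return (uint32_t)lsn; }

/* <0, 0, >0: the cycle decides first, then the block number */
static int lsn_cmp(xfs_lsn_t a, xfs_lsn_t b)
{
        if (lsn_cycle(a) != lsn_cycle(b))
                return lsn_cycle(a) < lsn_cycle(b) ? -1 : 1;
        if (lsn_block(a) != lsn_block(b))
                return lsn_block(a) < lsn_block(b) ? -1 : 1;
        return 0;
}

int main(void)
{
        /* block 7000 of cycle 101 is newer than block 24000 of cycle 100 */
        xfs_lsn_t a = assign_lsn(100, 24000);
        xfs_lsn_t b = assign_lsn(101, 7000);

        printf("a: cycle %u block %u\n", lsn_cycle(a), lsn_block(a));
        printf("b: cycle %u block %u\n", lsn_cycle(b), lsn_block(b));
        printf("lsn_cmp(a, b) = %d\n", lsn_cmp(a, b));   /* -1 */
        return 0;
}
---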
There are two lsns for a transaction:
(1) start lsn
xlog_cil_push_work()
-> xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true)
---
/* start_lsn is the first lsn written to. That's all we need. */
if (!*start_lsn)
*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
---
(2) commit lsn
xlog_cil_push_work()
-> xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
The IO completion callbacks of the log blocks (iclogs) are invoked in lsn order,
and the sign that a transaction has committed successfully is that its commit lsn (the log block) is on disk.
Log space is released by pushing the tail lsn forward.
The tail lsn is the minimum of the start lsns of the not-yet-applied log items.
The tail lsn can be moved forward in two ways,
(1) relog
xfs_trans_committed()
// See it ? ctx->start_lsn is used here !!!
-> xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort)
-> xfs_log_item_batch_insert()
-> xfs_trans_ail_update_bulk()
---
for (i = 0; i < nr_items; i++) {
struct xfs_log_item *lip = log_items[i];
// Has been on AIL list, relog case !!!
// It also indicates that the previous log has been on disk !!!
if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
/* check if we really need to move the item */
if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
continue;
// relog would release its previous log space !!!
if (mlip == lip && !tail_lsn)
tail_lsn = lip->li_lsn;
// remove from the ail list and re-insert it later
xfs_ail_delete(ailp, lip);
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
lip->li_lsn = lsn;
list_add(&lip->li_ail, &tmp);
}
// all the log items use the same lsn, namely the start lsn of the transaction
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
xfs_ail_update_finish(ailp, tail_lsn);
---
(2) checkpoint
xfs_buf_ioend()
-> xfs_buf_item_done()
-> xfs_trans_ail_delete()
-> xfs_ail_delete_one()
---
// The minimum one is the head of the ailp->ail_head
struct xfs_log_item *mlip = xfs_ail_min(ailp);
xfs_lsn_t lsn = lip->li_lsn;
xfs_ail_delete(ailp, lip);
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
if (mlip == lip)
return lsn;
return 0;
---
-> xfs_ail_update_finish()
Why is there only xfs_buf_ioend() here ?
Look at the definitions of iop_push: only the inode, dquot and buf log items define it.
This indicates that most transactions boil down to xfs_buf updates,
and inode updates are finally applied to an xfs_buf as well.
Refer to
xfs_inode_item_push()
-> xfs_iflush_cluster()
-> xfs_buf_delwri_queue()
The inode number of xfs is composed of 3 parts,
AG number Bn in AG In in B
|----------------|---------------|-------------|
Bn : block number in an AG
In : inode number in a block (a block could carry multiple inodes)
This inode number tells us the position of the inode on disk.
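Here is a sketch of pulling the three parts back out of an inode number, using a
made-up geometry; in the real filesystem the shift widths come from the superblock
(roughly sb_agblklog bits for the block within the AG and sb_inopblog bits for the
inode within the block, as used by the XFS_INO_TO_AGNO()/XFS_INO_TO_AGBNO()/
XFS_INO_TO_OFFSET() macros).
---
#include <stdio.h>
#include <stdint.h>

#define AGBLKLOG 15                      /* hypothetical: 2^15 blocks per AG   */
#define INOPBLOG 4                       /* hypothetical: 2^4 inodes per block */

static void decode_ino(uint64_t ino)
{
        uint64_t agno  = ino >> (AGBLKLOG + INOPBLOG);
        uint64_t agbno = (ino >> INOPBLOG) & ((1ULL << AGBLKLOG) - 1);
        uint64_t off   = ino & ((1ULL << INOPBLOG) - 1);

        printf("ino %llu -> AG %llu, block %llu in AG, inode %llu in block\n",
               (unsigned long long)ino, (unsigned long long)agno,
               (unsigned long long)agbno, (unsigned long long)off);
}

int main(void)
{
        /* encode AG 2, block 1234 in the AG, inode 5 in the block */
        uint64_t ino = (2ULL << (AGBLKLOG + INOPBLOG)) | (1234ULL << INOPBLOG) | 5;

        decode_ino(ino);
        return 0;
}
---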
xfs inodes are allocated dynamically instead of being preallocated at static
positions like in ext4.
How is the allocation done dynamically ?
Allocate a block in that AG !
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Then create an xfs_inobt_rec as follows,
struct xfs_inobt_rec {
__be32 ir_startino; //start ino num of this chunk
__be32 ir_freecount; //number of free inodes
__be64 ir_free; //bitmap
}
and insert it into the AG inode b+tree
xfs uses xfs_buf to manage its metadata instead of using vfs pagecache.
xfs_buf_get_map()
---
error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
if (!error)
goto found;
if (error != -ENOENT)
return error;
error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
// sema_init(&bp->b_sema, 0); /* held, no waiters */
error = xfs_buf_allocate_memory(new_bp, flags);
---
/*
* for buffers that are contained within a single page, just allocate
* the memory from the heap - there's no need for the complexity of
* page arrays to keep allocation down to order 0.
*/
size = BBTOB(bp->b_length);
if (size < PAGE_SIZE) {
int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
bp->b_addr = kmem_alloc_io(size, align_mask,
KM_NOFS | kmflag_mask);
...
bp->b_offset = offset_in_page(bp->b_addr);
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = kmem_to_page(bp->b_addr);
bp->b_page_count = 1;
bp->b_flags |= _XBF_KMEM;
return 0;
}
use_alloc_page:
...
for (i = 0; i < bp->b_page_count; i++) {
struct page *page;
uint retries = 0;
retry:
page = alloc_page(gfp_mask);
...
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
bp->b_pages[i] = page;
offset = 0;
}
---
// Do insert if new_bp is not NULL
error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
...
found:
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
...
}
---
xfs_buf_find()
---
pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
spin_lock(&pag->pag_buf_lock);
bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
xfs_buf_hash_params);
if (bp) {
atomic_inc(&bp->b_hold);
goto found;
}
...
found:
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
// We need to return a locked xfs_buf here !!!
if (!xfs_buf_trylock(bp)) {
xfs_buf_lock(bp);
XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
}
---
xfs_buf_read_map()
---
error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
...
if (!(bp->b_flags & XBF_DONE)) {
/* Initiate the buffer read and wait. */
bp->b_ops = ops;
error = _xfs_buf_read(bp, flags);
-> xfs_buf_submit()
-> __xfs_buf_submit() // wait = !(bp->b_flags & XBF_ASYNC)
---
/*
* Grab a reference so the buffer does not go away underneath us. For
* async buffers, I/O completion drops the callers reference, which
* could occur before submission returns.
*/
xfs_buf_hold(bp);
...
_xfs_buf_ioapply(bp);
...
if (wait)
error = xfs_buf_iowait(bp);
xfs_buf_rele(bp);
---
/* Readahead iodone already dropped the buffer, so exit. */
if (flags & XBF_ASYNC)
return 0;
}
---
In the non-readahead case, we have to wait for the buffer to be read in.
When the read completes, xfs verifies it.
xfs_buf_ioend()
---
if (bp->b_flags & XBF_READ) {
if (!bp->b_error && bp->b_ops)
bp->b_ops->verify_read(bp);
}
---
An xfs_buf being written back to disk must already have been checkpointed (its changes committed to the log).
During the IO, the xfs_buf is locked; nobody can touch it.
xfs_buf_delwri_submit_buffers()
-> xfs_buf_lock()
-> __xfs_buf_submit() // wait is false
---
// buf has been locked, after unpin, nobody can pin it any more.
// the modifications in buf should have been in log.
if (bp->b_flags & XBF_WRITE)
xfs_buf_wait_unpin(bp);
---
if (atomic_read(&bp->b_pin_count) == 0)
return;
add_wait_queue(&bp->b_waiters, &wait);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (atomic_read(&bp->b_pin_count) == 0)
break;
io_schedule();
}
remove_wait_queue(&bp->b_waiters, &wait);
set_current_state(TASK_RUNNING);
---
---
How do we ensure that the metadata is really on disk, rather than sitting in the disk's
write cache, before the log space is released ?
xlog_sync()
---
/*
* Flush the data device before flushing the log to make sure all meta
* data written back from the AIL actually made it to disk before
* stamping the new log tail LSN into the log buffer. For an external
* log we need to issue the flush explicitly, and unfortunately
* synchronously here; for an internal log we can simply use the block
* layer state machine for preflushes.
*/
if (log->l_targ != log->l_mp->m_ddev_targp || split) {
xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
need_flush = false;
}
xlog_verify_iclog(log, iclog, count);
xlog_write_iclog(log, iclog, bno, count, need_flush);
---
xfs_buf_rele()
---
spin_lock(&bp->b_lock);
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
if (!release) {
if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
__xfs_buf_ioacct_dec(bp);
goto out_unlock;
}
...
// insert into the lru list
if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
spin_unlock(&pag->pag_buf_lock);
}
---
xfs_buftarg_shrink_scan()
---
freed = list_lru_shrink_walk(&btp->bt_lru, sc,
xfs_buftarg_isolate, &dispose);
while (!list_empty(&dispose)) {
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
xfs_buf_rele(bp);
}
---
The story of b_hold
[0] initial value is 1
_xfs_buf_alloc()
---
atomic_set(&bp->b_hold, 1);
---
[1] xfs_buf_find() get and lock
---
spin_lock(&pag->pag_buf_lock);
bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
xfs_buf_hash_params);
if (bp) {
atomic_inc(&bp->b_hold);
goto found;
}
...
found:
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
return -EAGAIN;
}
xfs_buf_lock(bp);
XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
}
---
There are mainly two ways to release this reference
a. xfs_log_commit_cil()
-> xfs_trans_free_items()
-> iop_unlock()
xfs_buf_item_unlock()
-> xfs_buf_relse()
b. xfs_trans_brelse()
-> xfs_trans_del_item()
-> xfs_buf_relse()
release a buf if the transaction didn't dirty it
[2] buf log item
The buf log item holds a reference of the xfs_buf
xfs_buf_item_init()
-> xfs_buf_hold()
This reference will be released in xfs_buf_iodone()
xfs_buf_iodone()
---
xfs_buf_rele(bp);
spin_lock(&ailp->ail_lock);
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
---
[3] delwri_queue
xfs_buf_delwri_queue()
---
bp->b_flags |= _XBF_DELWRI_Q;
if (list_empty(&bp->b_list)) {
atomic_inc(&bp->b_hold);
list_add_tail(&bp->b_list, list);
}
---
get lock in xfs_buf_delwri_submit_buffers()
xfs_buf_iodone_callbacks()
-> xfs_buf_ioend()
-> xfs_buf_relse()
unlock and release ref
An allocated block extent has two states in xfs, normal and unwritten.
The meaning of the state is just what the name says:
when a block extent is newly (pre)allocated, it is unwritten.
xfs_bmapi_allocate()
---
if (bma->flags & XFS_BMAPI_PREALLOC)
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
error = xfs_bmap_add_extent_delay_real(bma, whichfork);
---
The two main allocation paths that produce unwritten extents:
xfs_map_blocks()
-> xfs_convert_blocks()
-> xfs_bmapi_convert_delalloc()
---
bma.flags = XFS_BMAPI_PREALLOC;
...
error = xfs_bmapi_allocate(&bma);
---
xfs_file_fallocate()
---
if (!xfs_is_always_cow_inode(ip)) {
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
}
---
The unwritten state influences xfs in the following ways,
iomap_readpage_actor()
---
if (iomap_block_needs_zeroing(inode, iomap, pos)) {
zero_user(page, poff, plen);
iomap_set_range_uptodate(page, poff, plen);
goto done;
}
---
iomap_block_needs_zeroing()
---
return iomap->type != IOMAP_MAPPED ||
(iomap->flags & IOMAP_F_NEW) ||
pos >= i_size_read(inode);
---
__iomap_write_begin()
---
// iomap->type != IOMAP_MAPPED means needs zeroing
if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
return -EIO;
zero_user_segments(page, poff, from, to, poff + plen);
} else {
int status = iomap_read_page_sync(block_start, page,
poff, plen, srcmap);
if (status)
return status;
}
---
xfs_bmap_del_extent_real()
---
if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
__xfs_bmap_add_free(tp, del->br_startblock,
del->br_blockcount, NULL,
(bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN);
}
}
// If the extent is UNWRITTEN, the skip_discard is true
---
After the write IO completes, xfs converts the unwritten extent to the normal state,
xfs_end_bio()
-> queue_work i_ioend_work
xfs_end_io()
-> xfs_end_ioend()
-> xfs_iomap_write_unwritten()
---
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
XFS_BMAPI_CONVERT, resblks, &imap,
&nimaps);
---
-> xfs_bmapi_convert_unwritten()