XFS_V2

The link count of a file tells the total number of links a file has
which is nothing but the number of hard-links a file has. This count,
however, does not include the soft-link count.

Note: The soft-link is not part of the link count since the soft-link's
inode number is different from the original file. 

When a directory is created, except for the link count for the dentry in
parent directory, two extra link count is increased,
(1) one on the parent directory for ".." dentry in child directory
(2) one on the child directory for "." dentry ion child directory

unlink

Quote from https://pubs.opengroup.org/onlinepubs/9699919799/functions/unlink.html#tag_16_635

When the file's link count becomes 0 and no process has the file open, the space occupied by
the file shall be freed and the file shall no longer be accessible. If one or more processes
have the file open when the last link is removed, the link shall be removed before unlink()
                                                                                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
returns, but the removal of the file contents shall be postponed until all references to the
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
file are closed.
^^^^^^^^^^^^^^

How does xfs implement this ?

When nlink of inode reaches zero

xfs_droplink()
---
        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

        // drop the link in the indoe on disk

        drop_nlink(VFS_I(ip));
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        if (VFS_I(ip)->i_nlink)
                return 0;

        return xfs_iunlink(tp, ip);
---

// link this inode on the list on agi->agi_unlinked[]

xfs_iunlink()
    -> xfs_iunlink_update_bucket()
    ---
        agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
        offset = offsetof(struct xfs_agi, agi_unlinked) +
                        (sizeof(xfs_agino_t) * bucket_index);
        xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
---

when reference of inode reaches zero

xfs_fs_destroy_inode()
    -> xfs_inactive()
    ---
        if (VFS_I(ip)->i_nlink != 0) {
                ...
                return;
        }

        if (S_ISREG(VFS_I(ip)->i_mode) &&
                (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
                 ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
                truncate = 1;
        ...
        if (S_ISLNK(VFS_I(ip)->i_mode))
                error = xfs_inactive_symlink(ip);
        else if (truncate)
                error = xfs_inactive_truncate(ip);
        if (error)
                return;
        ...
        /*
         * Free the inode.
         */
        error = xfs_inactive_ifree(ip);
            -> xfs_ifree()
                -> xfs_iunlink_remove()
    ---

Recover AGI unlinked lists

This is called during recovery to process any inodes which we unlinked but
not freed when the system crashed.    These inodes will be on the lists in the
AGI blocks. What we do here is scan all the AGIs and fully truncate and free
any inodes found on the lists. Each inode is removed from the lists when it
has been fully truncated and is freed. The freeing of the inode and its
removal from the list must be atomic.

xlog_recover_process_iunlinks()
    -> xlog_recover_process_one_iunlink()
    ---
        ino = XFS_AGINO_TO_INO(mp, agno, agino);
        error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
        ...
        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0);
        ...
        agino = be32_to_cpu(dip->di_next_unlinked);
        xfs_buf_relse(ibp);

        xfs_irele(ip);
            -> iput(VFS_I(ip));

            //The reference would be zero and triger xfs_fs_destroy_inode()


    ---

Actually, both xfs and ext4-jbd2 employ asynchronous journal which
could batch IOs to journal to promote performance. Asynchronous journal
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ may lead to lose data (inode could also be lost) but still keep metadata consistent.
Let's look at how does xfs and ext4-jbd2 implement the asynchronous journal

XFS

__xfs_trans_commit()
    -> xfs_log_commit_cil()
        -> xlog_cil_push_background()
---

        // min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
        //                                                                                 ^^^^^^^^^^^^^^^^^^^^^^^^
        //                                                                                 Max of the xlog buffer

        if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
                up_read(&cil->xc_ctx_lock);
                return;
        }

        spin_lock(&cil->xc_push_lock);
        if (cil->xc_push_seq < cil->xc_current_sequence) {
                cil->xc_push_seq = cil->xc_current_sequence;
                queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
        }

        up_read(&cil->xc_ctx_lock);


        /*
         * If we are well over the space limit, throttle the work that is being
         * done until the push work on this context has begun.
         */

        if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
                xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
                return;
        }
        spin_unlock(&cil->xc_push_lock);
---

EXT4-JBD2

When to start commit jdb2?
(1) log space is not enough
start_this_handle()
    -> add_transaction_credits()
    ---
        needed = atomic_add_return(total, &t->t_outstanding_credits);
        if (needed > journal->j_max_transaction_buffers) {

                /*
                 * If the current transaction is already too large,
                 * then start to commit it: we can then go back and
                 * attach this handle to a new transaction.
                 */

                atomic_sub(total, &t->t_outstanding_credits);
                ...
                wait_transaction_locked(journal);
                return 1;
        }
    ---
(2) transaction is too old
jbd2_journal_stop()
---

        /*
         * If the handle is marked SYNC, we need to set another commit
         * going!    We also want to force a commit if the transaction is too
         * old now.
         */

        if (handle->h_sync ||
                time_after_eq(jiffies, transaction->t_expires)) {
                /* This is non-blocking */
                jbd2_log_start_commit(journal, tid);
        }
---
(3) commit timeouts
jbd2_get_transaction()
---
        /* Set up the commit timer for the new transaction. */
        journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
        add_timer(&journal->j_commit_timer);
---

commit_timeout()
---
        journal_t *journal = from_timer(journal, t, j_commit_timer);

        wake_up_process(journal->j_task);
---

In constrast with asynchronous journal, synchronous journal could guarantee
the metadata has been on disk after the syscall returns, but this could hurts
the performance. Batch synchronous journal IO is to promote this which has been
supported by jbd2.

jbd2_journal_stop()
---

        /*
         * Implement synchronous transaction batching.If the handle
         * was synchronous, don't force a commit immediately.    Let's
         * yield and let another thread piggyback onto this
         * transaction.    Keep doing that while new threads continue to
         * arrive.    It doesn't cost much - we're about to run a committed
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
         * and sleep on IO anyway.    Speeds up many-threaded, many-dir
             ^^^^^^^^^^^^^^^^^^^^^^
                                The io here means the journal IO if we commit immediately.
         * operations by 30x or more...
         *
         * We try and optimize the sleep time against what the
         * underlying disk can do, instead of having a static sleep
         * time.    This is useful for the case where our storage is so
         * fast that it is more optimal to go ahead and force a flush
         * and wait for the transaction to be committed than it is to
         * wait for an arbitrary amount of time for new writers to
         * join the transaction.    We achieve this by measuring how
         * long it takes to commit a transaction, and compare it with
         * how long this transaction has been running, and if run time
         * < commit time then we sleep for the delta and commit.    This
         * greatly helps super fast disks that would see slowdowns as
         * more threads started doing fsyncs.
         *
         * But don't do this if this process was the most recent one '
         * to perform a synchronous write.    We do this to detect the
         * case where a single process is doing a stream of sync
         * writes.    No point in waiting for joiners in that case.
         *
         * Setting max_batch_time to 0 disables this completely.
         */

        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid &&
                journal->j_max_batch_time) {
                u64 commit_time, trans_time;

                journal->j_last_sync_writer = pid;

                read_lock(&journal->j_state_lock);
                commit_time = journal->j_average_commit_time;
                read_unlock(&journal->j_state_lock);

                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
                                                     transaction->t_start_time));

                commit_time = max_t(u64, commit_time,
                                        1000*journal->j_min_batch_time);
                commit_time = min_t(u64, commit_time,
                                        1000*journal->j_max_batch_time);

                if (trans_time < commit_time) {
                        ktime_t expires = ktime_add_ns(ktime_get(),
                                                             commit_time);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
                }
        }

        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;

        if (handle->h_sync ||
                time_after_eq(jiffies, transaction->t_expires)) {
                jbd2_log_start_commit(journal, tid);
                /*
                 * Special case: JBD2_SYNC synchronous updates require us
                 * to wait for the commit to complete.
                 */
                if (handle->h_sync && !(current->flags & PF_MEMALLOC))
                        wait_for_commit = 1;
        }

        stop_this_handle(handle);

        if (wait_for_commit)
                err = jbd2_log_wait_commit(journal, tid);
---

file permissions

When a process performs an operation to a file, the Linux kernel performs
the check in the following order:

Discretionary Access Control (DAC)

Or __user__ dictated access control

This includes both classic UNIX style permission checks and POSIX Access Control Lists (ACL).

Classical UNIX checks compare the current process UID and GID versus the UID and GID of the
file being accessed with regards to which modes have been set (Read/Write/eXecute).

Access Control List extends classic UNIX checks to allow more options regarding permission control.

Mandatory Access Control (MAC)

        
Or policy based access control.
This is implemented using Linux Security Modules (LSM) which are not real modules anymore (they
used to be but it was dropped). They enable additionnal checks based on other models than the
classical UNIX style security checks. All of those models are based on a policy describing what
kind of opeartions are allowed for which process in which context.

unix style permissions

The classical unix style permissions is based on two points:

inode uid/gid and read/write/exec permissions of user/group/other

# ll test.c
-rw-r--r-- 1 will will 71 Mar 25 09:37 test.c

access process's gid/uid

# id will
uid=1000(will) gid=1000(will) groups=1000(will)

Through following two steps:

Step 1

Based on the accessing process' gid/uid to know who you are, user ? members
of same group ? other ?

Step 2

Based on the role decided by step one, get your permission, which is composed
by following compoments

#define S_IRUSR 00400
#define S_IWUSR 00200
#define S_IXUSR 00100

#define S_IRGRP 00040
#define S_IWGRP 00020
#define S_IXGRP 00010

#define S_IROTH 00004
#define S_IWOTH 00002
#define S_IXOTH 00001

Note !!! They are Octal !!!

The code of permission check is as following,

do_last()
    -> may_open()
        -> inode_permission()
            -> sb_permission()
                 ---

                        /* Nobody gets write access to a read-only fs. */

                        if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
                                return -EROFS;
                 ---
            -> do_inode_permission()
                -> generic_permission()
                    -> acl_permission_check()


acl_permission_check()
---
        if (likely(uid_eq(current_fsuid(), inode->i_uid)))
                mode >>= 6;
        else {
                if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
                        int error = check_acl(inode, mask);
                        if (error != -EAGAIN)
                                return error;
                }

                if (in_group_p(inode->i_gid))
                        mode >>= 3;
        }

        if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
                return 0;

        return -EACCES;
---

infrastructure

filesystem statistics

/proc/PID/io

When we read the /proc/PID/io, we would get following things,

rchar: 2814484553
wchar: 2326047
syscr: 689899
syscw: 7616
read_bytes: 507052032
write_bytes: 1105920
cancelled_write_bytes: 393216

What do these fields stand for ? Let's look at the code.

proc_tid_io_accounting/proc_tgid_io_accounting()
    -> do_io_accounting()
    ---
        struct task_io_accounting acct = task->ioac;
        ...
        if (whole && lock_task_sighand(task, &flags)) {
                struct task_struct *t = task;

                task_io_accounting_add(&acct, &task->signal->ioac);
                while_each_thread(task, t)
                        task_io_accounting_add(&acct, &t->ioac);

                unlock_task_sighand(task, &flags);
        }
        seq_printf(m,
                     "rchar: %llu\n"
                     "wchar: %llu\n"
                     "syscr: %llu\n"
                     "syscw: %llu\n"
                     "read_bytes: %llu\n"
                     "write_bytes: %llu\n"
                     "cancelled_write_bytes: %llu\n",
                     (unsigned long long)acct.rchar,
                     (unsigned long long)acct.wchar,
                     (unsigned long long)acct.syscr,
                     (unsigned long long)acct.syscw,
                     (unsigned long long)acct.read_bytes,
                     (unsigned long long)acct.write_bytes,
                     (unsigned long long)acct.cancelled_write_bytes);
    ---

Let's look into the fields only by one

rchar, bytes read through synchronous read

add_rchar() <- vfs_read()
                        <- do_readv()
                        <- do_preadv()
                        <- compat_readv()
                        <- do_sendfile()
                        <- vfs_copy_file_range() >

rchar records the bytes read through read/readv/preadv...
the data can be from not only a regular file but also a socket

wchar, bytes write through synchronous write

add_wchar() <- vfs_write()
                        <- do_writev()
                        <- do_pwritev()
                        <- compat_writev()
                        <- do_sendfile()
                        <- vfs_copy_file_range() >

wchar is almost same with rchar

syscr & syscw, count of read/write syscall

inc_syscr/syscw() are hooked along with add_rchar/wchar()

read_bytes, the number of bytes which this task has caused to be read from storage

task_io_account_read() <- submit_bio()
                       <- nfs_file_direct_read()
                       <- read_cache_pages()         <-nfs_readpages()
                                                     <-fuse_readpages() >

read is always __synchronous__ no mater buffer read nor direct IO, so submit_bio is always necessary, except for special filesystem,
such as networking filesystem.

write_bytes, the number of bytes which this task has caused, or shall cause to be written to disk

task_io_account_write() <- __blkdev_direct_IO_simple()
                           <- __blkdev_direct_IO()
                           <- submit_page_section()       <- dio_zero_block()
                                                          <- do_direct_IO()
                           <- iomap_dio_bio_actor()
                           <- nfs_file_direct_write()
                           <- account_page_dirtied()      <- __set_page_dirty()
                                                          <- __set_page_dirty_nobuffers() >
buffer write is asynchronous, the final writeback is done by writeback workers, so we cannot
account it in submit_bio.

iov_iter

What's iov_iter ?

struct iov_iter {
        unsigned int type;     Type, IOVEC/KVEC/BVEC
        size_t iov_offset;     current offset, like bio.bi_iter.bi_sector
        size_t count;                residual count like bio.bi_iter.bi_size
        union {
                const struct iovec *iov;     
                const struct kvec *kvec;
                const struct bio_vec *bvec;
                ...
        };

        See the bio_vec, basically, iov and kvec have similar functions.
        The bio.bi_io_vec always points the head of the bvec array.
        iov/kvec/bvec always points the current vector, like bio.bi_io_vec[bio->bi_iter.bi_idx]

        ...
};

Just refer to iterate_and_advance() to know how does the iov_iter work

Even though we try to compare the bio with iov_iter, but they are not different.
bio has a map between the block device and buffer in memory,
however, iov_iter only describes the buffer, which is more similar with sglist.
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Next, we will figure out what does the IOVEC/KVEC/BVEC mean and who dose use them.

IOVEC, address from userland


struct iovec
{
        void __user *iov_base;
        __kernel_size_t iov_len;
};

There are two cases here,
(1) construct iov_iter in kernel, and address from userland
vfs_read()
    -> new_sync_read() //read_iter
    ---
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct kiocb kiocb;
        struct iov_iter iter;
        ssize_t ret;

        init_sync_kiocb(&kiocb, filp);

        // We have said that iov_iter is just buffer, the io is described 
        // by kiocb.ki_filp and kiocb.ki_pos

        kiocb.ki_pos = (ppos ? *ppos : 0);
        iov_iter_init(&iter, READ, &iov, 1, len);

        ret = call_read_iter(filp, &kiocb, &iter);
    ---
(2) iov array is from userland

static ssize_t vfs_readv(
        struct file *file,
        const struct iovec __user *vec,
        unsigned long vlen,
        loff_t *pos,
        rwf_t flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        ssize_t ret;

        ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);

        // __import_iovec()->iovec_from_user() would copy the iovec array from userland

        if (ret >= 0) {
                ret = do_iter_read(file, &iter, pos, flags);
                kfree(iov);
        }

        return ret;
}

KVEC, address from kernel, BVEC, a page in it

struct kvec {
        void *iov_base;
        size_t iov_len;
};

struct bio_vec {
        struct page        *bv_page;
        unsigned int        bv_len;
        unsigned int        bv_offset;
};

nbd_send_cmd() use both of them,

nbd_send_cmd()
---
        struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
        struct iov_iter from;

        //1st kvec is used to send nbd command

        iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
        ...
        cmd->index = index;
        cmd->cookie = nsock->cookie;
        cmd->retries = 0;
        request.type = htonl(type | nbd_cmd_flags);
        if (type != NBD_CMD_FLUSH) {
                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
                request.len = htonl(size);
        }
        handle = nbd_cmd_handle(cmd);
        memcpy(request.handle, &handle, sizeof(handle));

        //the iov_iter 'from' carry the nbd command

        result = sock_xmit(nbd, index, 1, &from,
                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);

send_pages:
        if (type != NBD_CMD_WRITE)
                goto out;

        //Then let's send out the request with bvec iov_iter_bvec

        bio = req->bio;
        while (bio) {
                struct bio *next = bio->bi_next;
                struct bvec_iter iter;
                struct bio_vec bvec;

                bio_for_each_segment(bvec, bio, iter) {
                        bool is_last = !next && bio_iter_last(bvec, iter);
                        int flags = is_last ? 0 : MSG_MORE;


                        //Setup a iov_iter for every bvec

                        iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
                        ...
                        result = sock_xmit(nbd, index, 1, &from, flags, &sent);
                        ...
                        if (is_last)
                                break;
                }
                bio = next;
        }
---

The other modules that receive iov_iter could use following interfaces to handle it


size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        if (unlikely(!page_copy_sane(page, offset, bytes)))
                return 0;

        if (i->type & (ITER_BVEC|ITER_KVEC)) {

                void *kaddr = kmap_atomic(page);
                size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
                kunmap_atomic(kaddr);
                return wanted;
        } else if (unlikely(iov_iter_is_discard(i)))
                return bytes;
        else if (likely(!iov_iter_is_pipe(i)))
                return copy_page_to_iter_iovec(page, offset, bytes, i);
        else
                return copy_page_to_iter_pipe(page, offset, bytes, i);
}


size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        const char *from = addr;
        if (unlikely(iov_iter_is_pipe(i)))
                return copy_pipe_to_iter(addr, bytes, i);
        if (iter_is_iovec(i))
                might_fault();
        iterate_and_advance(i, bytes, v,

                //IOVEC, userland buffers, the 'I' in iterate_and_advance

                copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),

                //BVEC, pages, the 'B' in iterate_and_advance

                memcpy_to_page(v.bv_page, v.bv_offset,
                                     (from += v.bv_len) - v.bv_len, v.bv_len),

                //KVEC, kernel buffers, the 'K' in iterate_and_advance

                memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
        )

        return bytes;
}

page fault

inode lock and page fault

We cannot take the inode lock (inode.i_rwsem) in page fault path.
For example,

ext4_dax_vm_ops.ext4_dax_huge_fault()
---
        if (write) {
                ...
        } else {
                down_read(&EXT4_I(inode)->i_mmap_sem);
        }
        result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
        if (write) {
                ...
        } else {
                up_read(&EXT4_I(inode)->i_mmap_sem);
        }
---

xfs_file_vm_ops.xfs_filemap_fault()
---

        // XFS_MMAPLOCK_SHARED -> xfs_inode_t.i_mmaplock

        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
        if (IS_DAX(inode)) {
                pfn_t pfn;

                ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
                                (write_fault && !vmf->cow_page) ?
                                 &xfs_direct_write_iomap_ops :
                                 &xfs_read_iomap_ops);
                if (ret & VM_FAULT_NEEDDSYNC)
                        ret = dax_finish_sync_fault(vmf, pe_size, pfn);
        } else {
                if (write_fault)
                        ret = iomap_page_mkwrite(vmf,
                                        &xfs_buffered_write_iomap_ops);
                else
                        ret = filemap_fault(vmf);
        }
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
---

Why ? Refer to link, https://lwn.net/Articles/548098

The problem has to do with the order in which locks are acquired.
For normal filesystem operations, the filesystem code will obtain any locks it requires;
the memory management subsystem will then grab mmap_sem should that be required — to bring a read or write buffer into RAM, for example. 
When a page fault happens, though, the lock ordering is reversed:
first mmap_sem is taken, then the filesystem code is invoked to bring the needed page into RAM.

Let's look at the code that could show us the lock ordering,

fs lock -> mmap lock

ext4_dax_read_iter()
    -> inode_lock_shared()
    -> dax_iomap_rw()
        -> iomap_apply()
            -> dax_iomap_actor()
                -> dax_copy_to_iter()
                    -> _copy_mc_to_iter()
                    
                    // when we write to userland buffer, page fault would happen, and then
                    // do_user_addr_fault() would come

mmap lock -> fs lock

do_user_addr_fault()
    -> mmap_read_lock(mm)
    -> handle_mm_fault()
        -> __handle_mm_fault()
            -> handle_pte_fault()
                -> do_fault()
                    -> do_read_fault()
                        -> __do_fault()
                            -> vma->vm_ops->fault(vmf);
                                 ext4_dax_fault()
                                 xfs_filemap_fault()

mmap_sem in page fault

In the previous section, we have known that the whole page fault path is
under the mm->mmap_sem. In page fault path, we could do a lot of thing,
including some read or write IO, And this could cause a lot of problems, such as

Holding the mmap_sem while doing IO is problematic because it can cause
system-wide priority inversions.  Consider some large company that does a
lot of web traffic.  This large company has load balancing logic in it's
core web server, cause some engineer thought this was a brilliant plan.
This load balancing logic gets statistics from /proc about the system,
which trip over processes mmap_sem for various reasons.  Now the web
server application is in a protected cgroup, but these other processes may
not be, and if they are being throttled while their mmap_sem is held we'll
stall, and cause this nice death spiral.

The upstream solves this problem as following,

static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
                            struct file *fpin)
{
    int flags = vmf->flags;

    if (fpin)
        return fpin;

    if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) {

        fpin = get_file(vmf->vma->vm_file);
        mmap_read_unlock(vmf->vma->vm_mm);

    }
    return fpin;
}

(1) Get the reference of the file under mmap_sem
(2) Unlock the mmap_sem

To understand this solotuion, we need to know what does the mmap_sem protect.

vma rbtree ()

This rbtree is to quickly finding the VMA associated with a given address,
or finding a gap in the address space that is large enough to hold a new VMA

vma list

Making it possible to walk through the entire space.

vm_area_struct itself

It includes the page fault policy behind this vma and also the things behind it,
a file or just some pages.

The common case is that the vma is backed by a regular file.

vm_mmap_pgoff()
  -> mmap_write_lock_killable()
  -> do_mmap()
    -> mmap_region()
    ---
        vma = vm_area_alloc(mm);
        ...
        if (file) {
            vma->vm_file = get_file(file);
            error = call_mmap(file, vma);
            ...
        } 
    ---

A reference is grabbed here to keep it alive during the mapping

__do_munmap()
  -> unmap_region()
    -> unmap_vmas()
       ---
        for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
            unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);

            // Page table is removed here

       ---
  -> remove_vma_list()
    -> remove_vma_list()
      -> remove_vma()
      ---
        if (vma->vm_file)

            fput(vma->vm_file)

      ---

After maybe_unlock_mmap_for_io() pins the file and unlock the mmap_sem,
it could guarantee the file will be gone during this, and it looks like
a normal IO through syscall read/write. When the IO is completed, we could
retry the page fault path, at the moment, the IO has been ready.

do_user_addr_fault()
---
    if (unlikely(!mmap_read_trylock(mm))) {
        ...
retry:
        mmap_read_lock(mm);
    } else {
        might_sleep();
    }

    vma = find_vma(mm, address);
    ...
    fault = handle_mm_fault(vma, address, flags, regs);
    ...

    /*
     * If we need to retry the mmap_lock has already been released,
     * and if there is a fatal signal pending there is no guarantee
     * that we made any progress. Handle this case first.
     */

    if (unlikely((fault & VM_FAULT_RETRY) &&
             (flags & FAULT_FLAG_ALLOW_RETRY))) {
        flags |= FAULT_FLAG_TRIED;
        goto retry;
    }
---

steps of do_shared_fault

do_shared_fault() handles the page fault on

PAGE_SIZE (not PUD_SIZE or PMD_SIZE)

triggered by write on the shared mapping

The main steps of do_shared_fault is as following,

vm_ops->fault, prepare the page

Prepare the page in page cache and fill it.
ext4 follows the basic fashion of linux page fault. It invokes
filemap_fault()
---

    //Try to find the page, if not exist, create a new one

    page = find_get_page(mapping, offset);
    if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
        fpin = do_async_mmap_readahead(vmf, page);
    } else if (!page) {
        ...
retry_find:
        page = pagecache_get_page(mapping, offset, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask);
    }
    ...
page_not_uptodate:
    ClearPageError(page);
    fpin = maybe_unlock_mmap_for_io(vmf, fpin);

    // If not update, read it in

    error = mapping->a_ops->readpage(file, page);
    if (!error) {
        wait_on_page_locked(page);
        if (!PageUptodate(page))
            error = -EIO;
    }
---

vm_ops->page_mkwrite, make the page writable

What do we need to do to make a page writable ?
Look int othe ext4_page_mkwrite()
---
    do {
        err = block_page_mkwrite(vma, vmf,
                       ext4_da_get_block_prep);
    } while (err == -ENOSPC &&
           ext4_should_retry_alloc(inode->i_sb, &retries));

The most important thing here is to reserve the space for the page (delay
allocation case)
---

finish_fault, cook the page table entry

finish_fault()
---
    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                      vmf->address, &vmf->ptl);
    ret = 0;
    /* Re-check under ptl */
    if (likely(pte_none(*vmf->pte)))
        do_set_pte(vmf, page, vmf->address);
    else
        ret = VM_FAULT_NOPAGE;

    update_mmu_tlb(vma, vmf->address, vmf->pte);
    pte_unmap_unlock(vmf->pte, vmf->ptl);

---

balance dirty, limit the speed of dirtying pages

sendfile

sendfile is one of the feature to implement zero-copy


        read             write                            sendfile
                |uuuuuu|                ---\
                ^            \                ---/
-------/--------\--------------------------------------
            /                    v
    |ssssss|     |dddddd|                     |ssssss|    -> |dddddd|

do_sendfile()
    -> splice_direct_to_actor()
        -> do_splice_to() //read from src into pipe
            -> f_op->splice_read()
        -> do_splice_from() //write from pipe into dest
            -> f_op->splice_write()

Let's how does the pipe dance here ?

splice_read


generic_file_splice_read()
    -> ext4_file_read_iter()
        -> generic_file_read_iter()
            -> generic_file_read_iter()
                -> copy_page_to_iter()
                    -> copy_page_to_iter_pipe()
---
        off = i->iov_offset;
        buf = &pipe->bufs[i_head & p_mask];
        if (off) {
                if (offset == off && buf->page == page) {
                        /* merge with the last one */
                        buf->len += bytes;
                        i->iov_offset += bytes;
                        goto out;
                }
                i_head++;
                buf = &pipe->bufs[i_head & p_mask];
        }
        if (pipe_full(i_head, p_tail, pipe->max_usage))
                return 0;

        buf->ops = &page_cache_pipe_buf_ops;
        get_page(page);

        buf->page = page;
        buf->offset = offset;
        buf->len = bytes;

        pipe->head = i_head + 1;
        i->iov_offset = offset + bytes;
        i->head = i_head;
out:
        i->count -= bytes;
---

The pipe here looks like the another kind of bvec arrays.

splice_write

iter_file_splice_write()
---
        pipe_lock(pipe);

        splice_from_pipe_begin(&sd);
        while (sd.total_len) {
                struct iov_iter from;
                unsigned int head, tail, mask;
                size_t left;
                int n;

                ret = splice_from_pipe_next(pipe, &sd);
                if (ret <= 0)
                        break;

                ...
                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                /* build the vector */
                left = sd.total_len;
                for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t this_len = buf->len;

                        if (this_len > left)
                                this_len = left;
                        ...
                        array[n].bv_page = buf->page;
                        array[n].bv_len = this_len;
                        array[n].bv_offset = buf->offset;
                        left -= this_len;
                }

                // construct a bvec iov_iter from the pipe

                iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
                ret = vfs_iter_write(out, &from, &sd.pos, 0);
                if (ret <= 0)
                        break;

                sd.num_spliced += ret;
                sd.total_len -= ret;
                *ppos = sd.pos;

                /* dismiss the fully eaten buffers, adjust the partial one */

                tail = pipe->tail;
                while (ret) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        if (ret >= buf->len) {
                                ret -= buf->len;
                                buf->len = 0;
                                pipe_buf_release(pipe, buf);
                                tail++;
                                pipe->tail = tail;
                                if (pipe->files)
                                        sd.need_wakeup = true;
                        } else {
                                buf->offset += ret;
                                buf->len -= ret;
                                ret = 0;
                        }
                }
        }
done:
        kfree(array);
        splice_from_pipe_end(pipe, &sd);

        pipe_unlock(pipe);

---

open_and_close

When you open or close a file, you could get or put many instances' reference.
Please refer to following comment from link https://lwn.net/Articles/494158/

The management of file structure reference counts is done with calls to fget() and fput().
A file structure, which represents an open file, can depend on a lot of resources:
as long as a file is open, the kernel must maintain its underlying storage device,
filesystem, network protocol information, security-related information, user-space
notification requests, and more. An fget() call will ensure that all of those resources
stay around as long as they are needed. A call to fput(), instead, might result in the
destruction of any of those resources. For example, closing the last file on an unmounted
filesystem will cause that filesystem to truly go away.

Next, let's try to figure out what they are.
We could get some hint in the __fput()

__fput is deferred to the task work context which will be invoked before the
task returns to userland. Refer to the following comment to get the point

----
What all this means is that a call to fput() can do a lot of work, and that
work may require the acquisition of a number of locks. The problem is that
fput() can also be called from any number of contexts; there are a few hundred
fput() and fput_light() calls in the kernel. Each of those call sites has its
own locking environment and, usually, no knowledge of what code in other
subsystems may be called from fput(). So the potential for problems like
locking-order violations is real.
----

void fput_many(struct file *file, unsigned int refs)
{
        if (atomic_long_sub_and_test(refs, &file->f_count)) {
                struct task_struct *task = current;

                if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                        init_task_work(&file->f_u.fu_rcuhead, ____fput);
                        if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
                                return;
                }

                if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
                        schedule_delayed_work(&delayed_fput_work, 1);
        }
}

exit_to_user_mode_loop()
    -> tracehook_notify_resume()
        -> task_work_run()

Let's focus on the dput() and mntput() in __fput().

mntput()

mntput()
    -> mntput_no_expire()
---
        ...
        lock_mount_hash();
        smp_mb();
        mnt_add_count(mnt, -1);
        count = mnt_get_count(mnt);
        if (count != 0) {
                WARN_ON(count < 0);
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        ...
        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
                struct task_struct *task = current;
                if (likely(!(task->flags & PF_KTHREAD))) {
                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
                        if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
                                return;
                }
                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
                        schedule_delayed_work(&delayed_mntput_work, 1);
                return;
        }
        cleanup_mnt(mnt);
---

If the reference of the mnt is exhausted, cleanup_mnt() will be invoked.
cleanup_mnt()
---
        ...

        //put the dentry of the mount point

        dput(mnt->mnt.mnt_root);

        //This is the most important part, the real work of umount would be done
        //here

        deactivate_super(mnt->mnt.mnt_sb);
        mnt_free_id(mnt);
        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
---

deactivate_super()
    -> deactivate_locked_super()
        -> fs->kill_sb()
             kill_block_super()
             ---
                generic_shutdown_super(sb);
                sync_blockdev(bdev);
                WARN_ON_ONCE(!(mode & FMODE_EXCL));
                blkdev_put(bdev, mode | FMODE_EXCL);
             ---

generic_shutdown_super()
---

        // shrink the dcache to release all of the reference to inode cache

        shrink_dcache_for_umount(sb);
        sync_filesystem(sb);
        sb->s_flags &= ~SB_ACTIVE;


        /* evict all inodes with zero refcount */
        // Can we ensure all of the inode to be evicted ?
        // It should be YES, because every opened file holds a reference of mount.
        // When we reach here, all of them should be closed.

        evict_inodes(sb);
        ...
        if (!list_empty(&sb->s_inodes)) {
                printk("VFS: Busy inodes after unmount of %s. "
                     "Self-destruct in 5 seconds.    Have a nice day...\n",
                     sb->s_id);
        }
---

dput()

Every opened file holds a reference to the dentry and every dentry holds a
reference to inode.

The mnt and dentry associated with the file is assigned in vfs_open()

path_openat()
    -> link_path_walk()
    -> open_last_lookups()
    -> do_open()
        -> vfs_open()
int vfs_open(const struct path *path, struct file *file)
{

        file->f_path = *path;

        return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}

mount

Let's first look at what will be done during a mount

prepare filesystem context

do_new_mount()
    -> fs_context_for_mount()
        -> alloc_fs_context()
            -> fc->fs_type->init_fs_context()
                 legacy_init_fs_context()

                 fc->ops = &legacy_fs_context_ops

get_tree

vfs_get_tree()
    -> fc->ops->get_tree()
         legacy_get_tree()
         -> fc->fs_type->mount()
                ext4_mount()
                    -> mount_bdev() with ext4_fill_super()
mount_bdev()
---
        bdev = blkdev_get_by_path(dev_name, mode, fs_type);
        ...
        mutex_lock(&bdev->bd_fsfreeze_mutex);
        ...
        s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
                 bdev);
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        if (s->s_root) {
                ...
        } else {
                s->s_mode = mode;
                snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
                sb_set_blocksize(s, block_size(bdev));
                error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
                ---
                        root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
                        ...

                        sb->s_root = d_make_root(root);

                ---
                ...
                s->s_flags |= SB_ACTIVE;
                bdev->bd_super = s;
        }

        return dget(s->s_root);
---

writenotify

When a file is mapped, how does the kernel know the file is written ?
The answer is writenotify

vma access permissions

do_mmap()
    -> mmap_region()
        -> vma_set_page_prot()
        ---
        unsigned long vm_flags = vma->vm_flags;
        pgprot_t vm_page_prot;

        vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
        if (vma_wants_writenotify(vma, vm_page_prot)) {

                vm_flags &= ~VM_SHARED;

                vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
        }

        // Note, vma->vm_flags is not modified

        /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
        WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
        ---
vma_wants_writenotify()
---
        /* If it was private or non-writable, the write bit is already clear */
        if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
                return 0;

        /* The backer wishes to know when pages are first written to? */

        if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
                                                     ^^^^^^^^^^^^                        ^^^^^^^^^^^
                return 1;
---
The vma->wm_page_prot will be used when vmf_insert_mixed_mkwrite()

Finally, the mapped page doesn't have write permission

page fault due to no write permission

handle_pte_fault()
---
        if (vmf->flags & FAULT_FLAG_WRITE) {

                if (!pte_write(entry))

                        return do_wp_page(vmf);
                entry = pte_mkdirty(entry);
        }

        // Set the pte access flags

        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
                update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
        }
---

do_wp_page()
    -> wp_page_shared()
        -> do_page_mkwrite()

The page will be marked dirty during this.

clear the write permission again

clear_page_dirty_for_io() is invoked during page writeback

synchronous page fault

What's synchronous page fault for ?

https://lwn.net/Articles/731706/
Normally, filesystems are in control of all I/O to the underlying storage
                                                                                ^^^^^^^
media; they use that control to ensure that the filesystem structure is
consistent at all times. Even when a file on a traditional storage device
is mapped into a process's virtual address space, the filesystem manages
the writeback of modified pages from the page cache to persistent storage.
Directly mapped persistent memory bypasses the filesystem, though, leading
to a number of potential problems including inconsistent metadata or data
corruption and loss if the filesystem relocates the file being modified.

For example, in ext4,
ext4_dax_huge_fault()
    -> dax_iomap_fault()
        -> dax_iomap_pte_fault()
            -> ext4_iomap_begin()
                -> ext4_iomap_alloc()
                    -> ext4_journal_start()
                    -> ext4_map_blocks()
                    -> ext4_journal_stop()

                    At this moment, the metadata has not been on persistent storage.

            -> vmf_insert_mixed_mkwrite() //insert page fault
     
After the page fault returns, the metadata may have not been on the page fault
due to the asynchronous journal employed by both xfs and ext4. The userland process
                     ^^^^^^^^^^^^^^^^^^^^
(usually a storage engine) can write data on the mapping. But if the system crashes
before the fs metadata get flushed the persistent storage medium, the written data
could be lost.

MAP_SYNC is such as flags that tells kernel do the fsync after before the page fault
returns.

ext4_dax_huge_fault()
    -> dax_iomap_fault()
        -> dax_iomap_pte_fault()
            -> ext4_iomap_begin()
---

                /*
                 * If we are doing synchronous page fault and inode needs fsync,
                 * we can insert PTE into page tables only after that happens.
                 * Skip insertion for now and return the pfn so that caller can
                 * insert it after fsync is done.
                 */

                if (sync) {
                        *pfnp = pfn;
                        ret = VM_FAULT_NEEDDSYNC | major;
                        goto finish_iomap;
                }
---

ext4_dax_huge_fault()
--
        result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
        if (write) {
                ext4_journal_stop(handle);

                /* Handling synchronous page fault? */
                if (result & VM_FAULT_NEEDDSYNC)
                        result = dax_finish_sync_fault(vmf, pe_size, pfn);

                            //guarantee the metadat get flushed to the persistent storage medium

                            -> vfs_fsync_range()
                                -> file_write_and_wait_range()
                                -> ext4_fsync_journal()
                                -> blkdev_issue_flush()

                                // insert page table

                            -> dax_insert_pfn_mkwrite()
                up_read(&EXT4_I(inode)->i_mmap_sem);
                sb_end_pagefault(sb);
        } 
--

backing dev info (bdi)

bdi, namely backing_dev_info, represents the underlying device (maybe a virtual
device for nfs or fuse). It mainly works for writeback, but also carry the ra_pages
which is for readahead.

noop and bdev

By default, the super_block has noop_backing_dev_info,
alloc_super()
---
        s->s_bdi = &noop_backing_dev_info;
---
Then s_bdi will be set to the bdi of the underlaying device or a virtual one.
set_bdev_super() <- mount_bdev()             <- ext2_mount()
                                                                             <- ext4_mount()
                                 <- get_tree_bdev()        <- xfs_fs_get_tree()
super_setup_bdi_name() <- ceph_setup_bdi()
                                             <- nfs_get_tree_common()
                                             <- super_setup_bdi()         <- btrfs_fill_super()                 >

If the s_bdi is noop_backing_dev_info, there won't be any writeback activity on this filesystem.

writeback and cgroup blkio

In the past, only direct IO and buffered read IO can be controlled by cgroup
blkio. The reason is as following diragram.


     a cgroup            VFS LAYER            root cgroup
        Task A                                         writeback kworker
            |                                                         |
            | dirtying pages                            | flush dirty inodes
            v                                                         v
[D] [D] [D] [D]            ------->     writepages
[D] [D] [D] [D]                                         |
------------------------------------^--------------------
                                    BLOCK LAYER             |
                                                                        v
                                                                 submit_bio
                                                                        | blkio qos/scheduler
                                                                        v
                                                             [R] [R] [R] [R]
---------------------------------------------------------
                                 SCSI/NVME/....

writeback kworker does the real IO but it belongs to the root cgroup.
More details, refer to Writeback and control groups
Let's look into the code,

bdi_writeback per backing_dev_info & memcg_css pair

bdi_writeback is in charge of dirty page writeabck activity to the block device
behind the backing_dev_info. Look into it, we can find b_dirty/b_dirty_time
which are used to hang dirty inodes.
The role of backing_dev_info is weakened and become the brigde between bdi_writeback
and block device.

associate bdi_writeback and inode

This is done by inode_attach_wb(). It is in the context of the task that's dirtying pages

inode_attach_wb    <-    __mark_inode_dirty()
                                 <- wbc_attach_fdatawrite_inode()    <-    __filemap_fdatawrite_range()
                                                                                                     <-    __set_page_dirty_nobuffers()[M]
                                 <- account_page_dirtied()                 <-    __set_page_dirty()    <-    __set_page_dirty_buffers()[M]
                                                                                                                                                     <-    mark_buffer_dirty()[M]
                                                                                                                                                     <-    iomap_set_page_dirty()[M]
[M] : there is __mark_inode_dirty invoked in it

associate blkcg and bio

This is done by wbc_init_bio in .writepages
wbc_init_bio()
---

        /*
         * pageout() path doesn't attach @wbc to the inode being written
         * out.    This is intentional as we don't want the function to block
                                                                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
         * behind a slow cgroup.    Ultimately, we want pageout() to kick off
             ^^^^^^^^^^^^^^^^^^^^
         * regular writeback instead of writing things out itself.
         */

        if (wbc->wb)
                bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
---

Take ext4 as example,
io_submit_init_bio()
---
        bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
        fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio_set_dev(bio, bh->b_bdev);
        bio->bi_end_io = ext4_end_bio;
        bio->bi_private = ext4_get_io_end(io->io_end);
        io->io_bio = bio;
        io->io_next_block = bh->b_blocknr;
        wbc_init_bio(io->io_wbc, bio);
---

Now, we know that the bios queued by writeback kworker have owned the
information of cgroup that dirtied the pages.
And we still need to throttle the task that's dirtying the pages to match
the speed of dirtying and cleaning. And this has been done in balance_dirty_pages()

balance_dirty_pages()
---
        struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
        struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL;

        //mdtc is for memcg dirty throttle control

                if (mdtc) {
                        unsigned long filepages, headroom, writeback;

                        /*
                         * If @wb belongs to !root memcg, repeat the same
                         * basic calculations for the memcg domain.
                         */
                        mem_cgroup_wb_stats(wb, &filepages, &headroom,
                                                &mdtc->dirty, &writeback);
                        mdtc->dirty += writeback;
                        mdtc_calc_avail(mdtc, filepages, headroom);

                        domain_dirty_limits(mdtc);
                }
                ...
                if (mdtc) {
                        ...
                        dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
                                ((mdtc->dirty > mdtc->thresh) || strictlimit);

                        wb_position_ratio(mdtc);
                        if (mdtc->pos_ratio < gdtc->pos_ratio)
                                sdtc = mdtc;
                }
---

domain_dirty_limits() is for calculating the limit of per-memcg
---
        /* gdtc is !NULL iff @dtc is for memcg domain */
        if (gdtc) {
                unsigned long global_avail = gdtc->avail;

                /*
                 * The byte settings can't be applied directly to memcg
                 * domains.    Convert them to ratios by scaling against
                 * globally available memory.    As the ratios are in
                 * per-PAGE_SIZE, they can be obtained by dividing bytes by
                 * number of pages.
                 */

                if (bytes)
                        ratio = min(DIV_ROUND_UP(bytes, global_avail),
                                        PAGE_SIZE);
                if (bg_bytes)
                        bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
                                             PAGE_SIZE);
                bytes = bg_bytes = 0;
        }
---

Whether enable this feature is controlled by wb->memcg_css->parent,
namely, whether this bdi_writeback belongs to a non-root memcg.
Refer to MDTC_INIT() and mdtc_valid();

And the belonging of the bdi_writeback is decided when
balance_dirty_pages_ratelimited()
---
        if (inode_cgwb_enabled(inode))
                wb = wb_get_create_current(bdi, GFP_KERNEL);
        if (!wb)
                wb = &bdi->wb;
---

partial write

Partial Write

/------------------------------------------------------------------/
__block_write_begin_int()
---
        for(bh = head, block_start = 0; bh != head || !block_start;
                block++, block_start=block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                ...
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                        !buffer_unwritten(bh) &&
                         (block_start < from || block_end > to)) {
                        ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                        *wait_bh++=bh;
                }
        }
---
Pages will be read in befoare written.

__GFP_NOFS when alloc pagecache

When allocate page cache, what's gfp_mask is used ?

inode_init_always()
---
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
---

#define GFP_HIGHUSER_MOVABLE        (GFP_HIGHUSER | __GFP_MOVABLE)
                                                                     /     \
                                                        GFP_USER | __GFP_HIGHMEM
                                                         /         \
                        __GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL

ITOW, pagecache allocation use GFP_KERNEL. However, xfs is not.

xfs_setup_inode()
---

        /*
         * Ensure all page cache allocations are done from GFP_NOFS context to
         * prevent direct reclaim recursion back into the filesystem and blowing
         * stacks or deadlocking.
         */

        gfp_mask = mapping_gfp_mask(inode->i_mapping);
        mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
---

And this can influence the action in shrink_page_list,

                if (PageWriteback(page)) {
                        /* Case 1 above */
                        if (current_is_kswapd() &&
                                PageReclaim(page) &&
                                test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
                                stat->nr_immediate++;
                                goto activate_locked;

                        /* Case 2 above */
                        } else if (writeback_throttling_sane(sc) ||
                                !PageReclaim(page) || !may_enter_fs) {
                                SetPageReclaim(page);
                                stat->nr_writeback++;
                                goto activate_locked;

                        /* Case 3 above */
                        } else {
                                unlock_page(page);
                                wait_on_page_writeback(page);
                                /* then go back and try same page again */
                                list_add_tail(&page->lru, page_list);
                                continue;
                        }
                }

This means xfs is easy to OOM with buffer IO, but ext4 is not.
[<0>] io_schedule+0x12/0x40
[<0>] wait_on_page_bit+0x137/0x230
[<0>] shrink_page_list+0xbab/0xc50
[<0>] shrink_inactive_list+0x254/0x580
[<0>] shrink_node_memcg+0x1fa/0x720
[<0>] shrink_node+0xce/0x440
[<0>] do_try_to_free_pages+0xc3/0x360
[<0>] try_to_free_mem_cgroup_pages+0xf9/0x210
[<0>] try_charge+0x192/0x780
[<0>] mem_cgroup_try_charge+0x8b/0x1a0
[<0>] __add_to_page_cache_locked+0x64/0x240
[<0>] add_to_page_cache_lru+0x64/0x100
[<0>] pagecache_get_page+0xf2/0x2c0
[<0>] grab_cache_page_write_begin+0x1f/0x40
[<0>] ext4_da_write_begin+0xce/0x470 [ext4]
[<0>] generic_perform_write+0xf4/0x1b0
[<0>] __generic_file_write_iter+0xfe/0x1c0
[<0>] ext4_file_write_iter+0xc6/0x3b0 [ext4]
[<0>] new_sync_write+0x124/0x170
[<0>] vfs_write+0xa5/0x1a0
[<0>] ksys_write+0x4f/0xb0
[<0>] do_syscall_64+0x5b/0x1b0
[<0>] entry_SYSCALL_64_after_hwframe+0x65/0xca
[<0>] 0xffffffffffffffff

umount

When we mount a filesystem, there are mainly 3 things to be setup,

superblock

All of the things about the filesystem is in the super block

root inode

root inode is the entry of the filesystem

mount point

A umount mainly do following things,

buffer_head

bh vs its friends

bh is a legacy of old version kernel. Nowadays, it mainly works with following compoments,

page cache

In linux kernel, a page is 4K, but the filesystem block could be smaller than it.
In ext2/ext4, bh is used to represent the filesystem block.

page->private -> bh0 -> bh1 -> bh2 -> bh3

When does the block device know the filesystem block size ?
It is
sb_set_blocksize()
  -> sb_set_blocksize()
  ---
    /* Don't change the size if it is same as current */
    if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
        sync_blockdev(bdev);
        bdev->bd_inode->i_blkbits = blksize_bits(size);
        kill_bdev(bdev);
    }
    return 0;
  ---

The sb->s_blocksize would influence how many bhs are allocated for a page,
sb_getblk()
  -> __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
    -> __getblk_slow()
      -> grow_buffers()
        -> grow_dev_page()
          -> alloc_page_buffers()
          ---
        offset = PAGE_SIZE;
        while ((offset -= size) >= 0) {
            bh = alloc_buffer_head(gfp);
            if (!bh)
                goto no_grow;

            bh->b_this_page = head;
            bh->b_blocknr = -1;
            head = bh;

            bh->b_size = size;

            /* Link the buffer to its page */
            set_bh_page(bh, page, offset);
        }
          ---


Currently, there are mainly two positions that employ bh
(1) store bmap info
ext4 is an example

__block_write_begin_int()
---
        for(bh = head, block_start = 0; bh != head || !block_start;
                block++, block_start=block_end, bh = bh->b_this_page) {
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        if (get_block) {
                                err = get_block(inode, block, bh, 1);
                                if (err)
                                        break;
                        }
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                        !buffer_unwritten(bh) &&
                         (block_start < from || block_end > to)) {
                        ll_rw_block(REQ_OP_READ, 0, 1, &bh);
                        *wait_bh++=bh;
                }
        }
        /*
         * If we issued read requests - let them complete.
         */
        while(wait_bh > wait) {
                wait_on_buffer(*--wait_bh);
                if (!buffer_uptodate(*wait_bh))
                        err = -EIO;
        }

---

io_submit_add_bh()
    -> io_submit_add_bh()
    ---
        bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio_set_dev(bio, bh->b_bdev);
        bio->bi_end_io = ext4_end_bio;
        bio->bi_private = ext4_get_io_end(io->io_end);
        wbc_init_bio(io->io_wbc, bio);
    ---
        
(2) fs metadata
__ext4_get_inode_loc()
---

        /*
         * Figure out the offset within the block group inode table
         */

        inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        inode_offset = ((ino - 1) %
                        EXT4_INODES_PER_GROUP(sb));
        block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
        iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
        bh = sb_getblk(sb, block);
        ...
        if (!buffer_uptodate(bh)) {
                lock_buffer(bh);
                ...
make_io:
                blk_start_plug(&plug);
                ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
                blk_finish_plug(&plug);
                wait_on_buffer(bh);
        }
---

bio

bio is the unit of IO in both fs (such as xfs) and block. And kernel also
provide wrapper interfaces for backward compatibility of legacy bh.
submit_bh_wbc()
---
        bio = bio_alloc(GFP_NOIO, 1);

        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio_set_dev(bio, bh->b_bdev);
        bio->bi_write_hint = write_hint;

        bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));

        bio->bi_end_io = end_bio_bh_io_sync;
        bio->bi_private = bh;

        if (buffer_meta(bh))
                op_flags |= REQ_META;
        if (buffer_prio(bh))
                op_flags |= REQ_PRIO;
        bio_set_op_attrs(bio, op, op_flags);

        submit_bio(bio);
---

journal_head

Yes !!! jbd2 still uses bh.

                    b_bh
        jh    ---------->
                <---------    bh
                 b_private
>
 - commit to jbd2
     ext4_do_update_inode()
         -> ext4_handle_dirty_metadata(handle, NULL, bh)

 - after log is committed
     __jbd2_journal_temp_unlink_buffer()
     ---
        jh->b_jlist = BJ_None;
        if (transaction && is_journal_aborted(transaction->t_journal))
                clear_buffer_jbddirty(bh);
        else if (test_clear_buffer_jbddirty(bh))
                mark_buffer_dirty(bh);        /* Expose it to the VM */
     ---

jbd2

transaction

Atomicity

The modifications in a transaction will be done or nothing.
This is achieved by only flush modifications to disk after persist their log records to disk.

There are TWO points here,
One, ensure integrity of the log record on disk
 
Descriptor
journal_header_s
JBD2_DESCRIPTOR_BLOCK    chksum of descriptor
  |                                |
  v                                v
 |header+tag+tag+tag+tag+tag+tag+chsum--|
          ^                   ^
          |                   |
   journal_block_tag_s     tag with JBD2_FLAG_LAST_TAG
   ---
    __be32 t_blocknr;
    __be16 t_checksum;
    __be16 t_flags;
    __be32 t_blocknr_high;
   ---

commit_header (chksum of whole transaction)
    |
    v
 |commit-------------------------------|

IOs
                        Wait the IO of Descriptor and LOG completed
                                           v
 [Descriptor] [LOG] [LOG] [LOG] [LOG] [LOG] [Commit]
 \___________________ ____________________/ \___ ___/
                     v                          v       
          REQ_OP_WRITE + REQ_SYNC      REQ_OP_WRITE + REQ_SYNC
                                       + REQ_PREFLUSH + REQ_FUA

There is async commit feature which commit w/o wait descriptor and log completed.
It depends on the chksum to ensure the integrity of log transaction.

Two, write out modifications only after log has been persistted to disk
After commit, the bh of metadata is handed over to writeback by
__jbd2_journal_refile_buffer()
  -> __jbd2_journal_unfile_buffer()//jh->b_next_transaction == NULL
    -> __jbd2_journal_temp_unlink_buffer()
    ---
    else if (test_clear_buffer_jbddirty(bh))
        mark_buffer_dirty(bh);    /* Expose it to the VM */
    ---

The interesting thing is ext4 nevery invoke mark_inode_dirty but ext4_mark_inode_dirty
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

__ext4_mark_inode_dirty()
  -> ext4_mark_iloc_dirty()
    -> ext4_do_update_inode()
    ---

    //Update ext4_inode on buffer_head from in-core inode

    err = ext4_handle_dirty_metadata(handle, NULL, bh);
    ---

Count

How many transaction can exist in kernel ?
There are 3 kinds of transaction,
 - running
 - committing
 - checkpointing

There can be only one running and one commit transaction at the same time,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
start_this_handle()
  -> add_transaction_credits()
  ---
    needed = atomic_add_return(total, &t->t_outstanding_credits);
    if (needed > journal->j_max_transaction_buffers) {
        atomic_sub(total, &t->t_outstanding_credits);

        /*
         * Wait until running transaction passes to T_FLUSH state and new transaction
         * can thus be started. Also starts the commit if needed. The function expects
         * running transaction to exist and releases j_state_lock.
         */

        wait_transaction_locked(journal);
        return 1;
    }
  ---

The T_FLUSH state means, the transaction has been totally closed (no opening handle any more)
and become the journal->j_committing_transaction
jbd2_journal_commit_transaction()
---
//Drain handles on commit_transaction->t_updates
    ...
    write_lock(&journal->j_state_lock);
    commit_transaction->t_state = T_FLUSH;
    journal->j_committing_transaction = commit_transaction;
    journal->j_running_transaction = NULL;
    start_time = ktime_get();
    commit_transaction->t_log_start = journal->j_head;
    wake_up(&journal->j_wait_transaction_locked);
    write_unlock(&journal->j_state_lock);
--

And there can be multiple checkpoint transaction,
jbd2_journal_commit_transaction()
---
    spin_lock(&journal->j_list_lock);
    if (journal->j_checkpoint_transactions == NULL) {
        journal->j_checkpoint_transactions = commit_transaction;
        commit_transaction->t_cpnext = commit_transaction;
        commit_transaction->t_cpprev = commit_transaction;
    } else {
        commit_transaction->t_cpnext =
            journal->j_checkpoint_transactions;
        commit_transaction->t_cpprev =
            commit_transaction->t_cpnext->t_cpprev;
        commit_transaction->t_cpnext->t_cpprev =
            commit_transaction;
        commit_transaction->t_cpprev->t_cpnext =
                commit_transaction;
    }
    spin_unlock(&journal->j_list_lock);
---

Capacity

Max size of a transaction is
static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
    return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
}
1/4 total journal size, excluding fast commit.

The transaction size is decided by method that triggers transaction committing
Refer to Commit

The space of journal is deemed as a ring

                     grow up ->

                 j_tail            j_head
                   |                 |
                   v                 v
        |----------drrrrrrcdrrrrrrrrc------------|
      0            \___ __/\____ ___/              j_total_len
                       v        v
                      tx      t(x+1)

        d: Descriptor
        r: Log Record
        c: Commit
        -: free space

Log space allocation:
jbd2_journal_next_log_block()
---
    write_lock(&journal->j_state_lock);
    blocknr = journal->j_head;

    journal->j_head++;
    journal->j_free--;

    if (journal->j_head == journal->j_last)
        journal->j_head = journal->j_first;
    write_unlock(&journal->j_state_lock);
    return jbd2_journal_bmap(journal, blocknr, retp);
---

Log space freeing:

            j_tail
               |
               v
       |-------cpcpcpcpccccccccrrrrrrrrr--------|
               \__ ___/\___ __/\___ ___/
                  v        v       v
         checkpointing committing running

After checkpoint
                     j_tail
                       |
                       v
       |---------------ccccccccrrrrrrrrr--------|
                \__ __/ \___ __/\___ ___/
                   v       v       v
                freed  committing running

There are 3 space to free log space,
(1) __jbd2_log_wait_for_space()
     -> jbd2_log_do_checkpoint()
(2) jbd2_journal_commit_transaction()
     -> jbd2_update_log_tail()

Commit

The transaction can be committed in following opportunities
fsync and other where you want files to be synced to disk
file's metadata is in journal
ocfs2_sync_file()
  -> jbd2_complete_transaction()

//Ext4 employs fastcommit, doesn't use jbd2_complete_transaction() in ext4_sync_file() now.


And ext4_sync_fs() is also such a thing

Running transaction exceeds its max capacity
Refer to add_transaction_credits()

Timeout, namely the transaction is too old
start_this_handle()
  -> jbd2_get_transaction()
  ---
    transaction->t_expires = jiffies + journal->j_commit_interval;
    ...
    /* Set up the commit timer for the new transaction. */
    journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
    add_timer(&journal->j_commit_timer);
  ---
 
commit_timeout(_
---
    journal_t *journal = from_timer(journal, t, j_commit_timer);

    wake_up_process(journal->j_task);
---
This j_task is kjournald2

The expire time is 5 seconds by default
#define JBD2_DEFAULT_MAX_COMMIT_AGE 5


#define JBD2_DEFAULT_MAX_COMMIT_AGE 5

Handle

Order

The order here means the ext4's order mode

In data=ordered mode, ext4 only officially journals metadata, but it logically
groups metadata information related to data changes with the data blocks into a
single unit called a transaction.  When it's time to write the new metadata
out to disk, the associated data blocks are written first.

What's this mode for ?

After a system crash or a power failure, files that were written right before the
system went down could contain previously written data or other garbage.
With Ordered Mode, journal commits are deferred until the data blocks get written
to disk. This guarantees that any blocks in the file will be data written by the
application, avoiding a possibility of a security breach, which is especially
problematic on a multi-user system.
Ext3 Order VS Writeback mode

This is not an issue right now, because the unwritten statea

Both xfs and ext4 support this,
ext4_map_blocks()
---
     ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
        if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
            map->m_pblk = ext4_es_pblock(&es) +
                    map->m_lblk - es.es_lblk;
            map->m_flags |= ext4_es_is_written(&es) ?
                    EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
        ...
}
---

xfs_vm_readpage()
  -> iomap_readpage()
    -> iomap_apply()
      -> iomap_readpage_actor()
      ---
    if (iomap_block_needs_zeroing(inode, iomap, pos)) {
        zero_user(page, poff, plen);
        iomap_set_range_uptodate(page, poff, plen);
        goto done;
    }
      ---

But why does ext4 still use 'order mode' as default ?

In addition, there are some applications which depend on data=ordered
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
to automatically force data blocks to be written to disk soon after the
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
file is written. Using Writeback Mode extends the time from when a file
^^^^^^^^^^^^^^^
is written to when it is pushed out to disk to 30 seconds. This can be
surprising for some users; however, it should be noted that such problems
can still be an issue with Ordered Mode (although they are much rarer).
Again, a careful application or library should always use fsync() at points
where the application is at a stable commit point.

Let's look at how to implement it

ext4_jbd2_inode_add_write() <- ext4_map_blocks()
                            <- __ext4_journalled_writepage()
                            <- __ext4_block_zero_page_range()
                            <- ext4_page_mkwrite()
                            <- move_extent_per_page()

We only need to order the page writting when extent changes


But ocfs2 is not different,
ocfs2_jbd2_inode_add_write()  <- ocfs2_write_failure()
                              <- ocfs2_map_and_dirty_page()
                              <- ocfs2_write_end_nolock()
                              <- ocfs2_zero_start_ordered_transaction()

Recovery

When

ext4_fill_super()
  -> ext4_load_journal()
    -> jbd2_journal_load()
      -> jbd2_journal_recover()
  -> ext4_iget()
  -> ext4_orphan_cleanup()

SCAN

Decide the range of logs need to be replayed, or decide the end transaction ID
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
       start            end
        |                |
        v                v
 |------dlllllcdrlllllcdlll----|
        \__ __/\___ __/\_ _/
           t      t+1    t+2

d: JBD2_DESCRIPTOR_BLOCK, journal_header_t + n * journal_block_tag_t
l: copy of the block that need to be logged
c: JBD2_COMMIT_BLOCK, commit_header (a journal_header_t embedded)
r: JBD2_REVOKE_BLOCK, jbd2_journal_revoke_header_t (a journal_header_t embedded)

Start
Come from the journal superblock, updated by jbd2_update_log_tail()
End
(1) check magic, rule out 'l' or empty blocks
(2) check sequence, rule out stale logs (same mkfs)

(There is time stamp check to avoid previous mkfs's log)

There are following rulers when scan the log
(1) when meet 'd', bypass the following log blocks

    next_log_block += count_tags(journal, bh);

(2) when meet 'l', if valid increase next_commit_ID
    This indicates the commit id of last complete transaction
    this commit id will be used in following pass to decide when to stop

(3) revoke and descriptor block checksum error cause recovery failure,
    but commit not

REVOKE

Only scan the JBD2_REVOKE_BLOCK and record the blocks that need to be revoked
when replay
Refer to scan_revoke_records()

RELAY

Pick up the JBD2_DESCRIPTOR_BLOCK and copy the following log blocks to it real
position based on array of journal_block_tag_t
do_one_pass()
---
            while ((tagp - bh->b_data + tag_bytes) <= journal->j_blocksize - descr_csum_size) {
                unsigned long io_block;

                tag = (journal_block_tag_t *) tagp;
                flags = be16_to_cpu(tag->t_flags);

                io_block = next_log_block++;
                wrap(journal, next_log_block);

                //Read in log block

                err = jread(&obh, journal, io_block);
                if (err) {
                } else {

                    //get the real position on disk

                    blocknr = read_tag_block(journal, tag);

                    /* If the block has been revoked, then we're all done here. */

                    if (jbd2_journal_test_revoke(journal, blocknr,  next_commit_ID)) {
                        brelse(obh);
                        ++info->nr_revoke_hits;
                        goto skip_write;
                    }
                   /* Look for block corruption */
                    if (!jbd2_block_tag_csum_verify(journal, tag, obh->b_data, be32_to_cpu(tmp->h_sequence))) {
                        brelse(obh);
                        block_error = 1;
                        goto skip_write;
                    }

                    nbh = __getblk(journal->j_fs_dev, blocknr, journal->j_blocksize);
                    lock_buffer(nbh);
                    memcpy(nbh->b_data, obh->b_data, journal->j_blocksize);
                    set_buffer_uptodate(nbh);
                    //Not issue the IO but just mark bh dirty
                    mark_buffer_dirty(nbh);
                    ++info->nr_replays;
                    unlock_buffer(nbh);
                    brelse(obh);
                    brelse(nbh);
                }

            skip_write:
                tagp += tag_bytes;
                if (!(flags & JBD2_FLAG_SAME_UUID))
                    tagp += 16;

                if (flags & JBD2_FLAG_LAST_TAG)
                    break;
            }
---

In RELAY pass, the do_one_pass() doesn't issue io, but just mark dirty,
the io is issued here,
jbd2_journal_recover()
---
    err = do_one_pass(journal, &info, PASS_SCAN);
    if (!err)
        err = do_one_pass(journal, &info, PASS_REVOKE);
    if (!err)
        err = do_one_pass(journal, &info, PASS_REPLAY);

    journal->j_transaction_sequence = ++info.end_transaction;

    jbd2_journal_clear_revoke(journal);
    err2 = sync_blockdev(journal->j_fs_dev);
    /* Make sure all replayed data is on permanent storage */
    if (journal->j_flags & JBD2_BARRIER) {
        err2 = blkdev_issue_flush(journal->j_fs_dev);
    }

---

Revoke

Why do we need REVOKE

                  log grow ->

         m     f  da+w                    
|------+----+----+----+----+----+----+--------|
       \________ __________/\_ _/\_ _/
                v             v    v
               cp             c    r

m : modification to metadata
f : free the metadata block
da: data block allocation
w : data block writing

If we replay the log, the metadata modification will replayed on the blcok
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
which has contained data. This is terrible case because the data that has been
^^^^^^^^^^^^^^^^^^^^^^^^
synced to disk is corrupted.

When to issue revoke

Track the code invokes ext4_free_blocks with EXT4_FREE_BLOCKS_FORGET.

How to handle revoke

         m   f+r  ma+w
|------+----+----+----+----+----+----+--------|
       \________ __________/\_ _/\_ _/
                v             v    v
               cp             c    r
m : modification to metadata
f : free the metadata block
r : revoke record
ma: metadata block allocation
w : data block writing

When replay the log, the revoke record which is in the same transaction
with the freeing, will stop replaying of 'm' log record. Then it can protect
the following data allocation and writing. But if the block is allocated
by meta again, the revoke record won't prevent the replaying of 'ma+w' (or the 'w' only).


 commit
jbd2_journal_commit_transaction()
---
    /*
     * Now start flushing things to disk, in the order they appear
     * on the transaction lists.  Data blocks go first.
     */
    err = journal_submit_data_buffers(journal, commit_transaction);
    ...
    blk_start_plug(&plug);
    jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

    jbd_debug(3, "JBD2: commit phase 2b\n");
---
write_one_revoke_record()
---
    if (descriptor) {
        if (offset + sz > journal->j_blocksize - csum_size) {
            flush_descriptor(journal, descriptor, offset);
            ---
            write_dirty_buffer() is invoked here to submit_bh
            ---
            descriptor = NULL;
        }
    }

    if (!descriptor) {
        descriptor = jbd2_journal_get_descriptor_buffer(transaction,
                            JBD2_REVOKE_BLOCK);
        jbd2_file_log_bh(log_bufs, descriptor);

        offset = sizeof(jbd2_journal_revoke_header_t);
        *descriptorp = descriptor;
    }

    if (jbd2_has_feature_64bit(journal))
        * ((__be64 *)(&descriptor->b_data[offset])) =
            cpu_to_be64(record->blocknr);
    else
        * ((__be32 *)(&descriptor->b_data[offset])) =
            cpu_to_be32(record->blocknr);
---
The format of revoke record block is,

journal_header_t
  |
  v
|---|---|-------------------|
      ^ \_________ _________/
      |           v
      |   block num array that need to revoked
jbd2_journal_revoke_header_t

 recovery
Revoke records will be scanned before replay pass

do_one_pass()
---
    case JBD2_REVOKE_BLOCK:
            /* If we aren't in the REVOKE pass, then we can
             * just skip over this block. */
            if (pass != PASS_REVOKE) {
                brelse(bh);
                continue;
            }

            err = scan_revoke_records(journal, bh,
                          next_commit_ID, info);
            brelse(bh);
            continue;
---

do_one_pass()
---
                    /* If the block has been
                     * revoked, then we're all done
                     * here. */
                    if (jbd2_journal_test_revoke(journal, blocknr, next_commit_ID)) {
                        brelse(obh);
                        ++info->nr_revoke_hits;
                        goto skip_write;
                    }
--

BH

bh and jh

bh and jh reference each other.

Refer to
jbd2_journal_add_journal_head()
---
    if (!buffer_jbd(bh))
        new_jh = journal_alloc_journal_head();

    jbd_lock_bh_journal_head(bh);
    if (buffer_jbd(bh)) {
        jh = bh2jh(bh);
    } else {
        jh = new_jh;
        new_jh = NULL;        /* We consumed it */
        set_buffer_jbd(bh);

        bh->b_private = jh;
        jh->b_bh = bh;
        get_bh(bh);
    }
    jh->b_jcount++;

    jbd_unlock_bh_journal_head(bh);
---
Note, the reference above happens under bh's lock

bh and transaction

Firstly, we must know we cannot modify bh when its previous transaction has not
been committed. Because the bhs' modification in one transaction compose an atomic
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
filesystem operations.
^^^^^^^^^^^^^^^^^^^^^

How does jbd2 handle this case ?
Let's look into 
jbd2_journal_commit_transaction()
  -> jbd2_journal_write_metadata_buffer()
---
    spin_lock(&jh_in->b_state_lock);
repeat:
    /*
     * If a new transaction has already done a buffer copy-out, then
     * we use that version of the data for the commit.
     */
    if (jh_in->b_frozen_data) {
        done_copy_out = 1;
        new_page = virt_to_page(jh_in->b_frozen_data);
        new_offset = offset_in_page(jh_in->b_frozen_data);
    } else {
        new_page = jh2bh(jh_in)->b_page;
        new_offset = offset_in_page(jh2bh(jh_in)->b_data);
    }
    ...
    set_bh_page(new_bh, new_page, new_offset);
    new_bh->b_size = bh_in->b_size;
    new_bh->b_bdev = journal->j_dev;
    new_bh->b_blocknr = blocknr;
    new_bh->b_private = bh_in;
    set_buffer_mapped(new_bh);
    set_buffer_dirty(new_bh);

    *bh_out = new_bh;

    spin_lock(&journal->j_list_lock);
    __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
    spin_unlock(&journal->j_list_lock);
    set_buffer_shadow(bh_in);
    spin_unlock(&jh_in->b_state_lock);
---

In a word, if there is b_frozen_data, use it, otherwise use the buffer of bh
that need to be logged.

The most critical point is the jh->b_state_lock which serialize everything.
do_get_write_access()
---
    lock_buffer(bh);
    spin_lock(&jh->b_state_lock);
    /*
     * The buffer is already part of this transaction if b_transaction or
     * b_next_transaction points to it
     * This also tell us extra copy only happens when one bh needs to join
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     * the next transaction and previous one has not been committed. The extra
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     * is the data that compose the atomic filesystem operations in previous
     * transaction.
     */
    if (jh->b_transaction == transaction ||
        jh->b_next_transaction == transaction)
        goto done;

    /*
     * If there is already a copy-out version of this buffer, then we don't
     * need to make another one
     */
    if (jh->b_frozen_data) {
        goto attach_next;
    }
    /*
     * There is one case we have to be very careful about.  If the
     * committing transaction is currently writing this buffer out to disk
     * and has NOT made a copy-out, then we cannot modify the buffer
     * contents at all right now.  The essence of copy-out is that it is
     * the extra copy, not the primary copy, which gets journaled.  If the
     * primary copy is already going to disk then we cannot do copy-out
     * here.
     */
    if (buffer_shadow(bh)) {
        spin_unlock(&jh->b_state_lock);
        wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
        goto repeat;
    }

    /*
     * Only do the copy if the currently-owning transaction still needs it.
     * If buffer isn't on BJ_Metadata list, the committing transaction is
     * past that stage (here we use the fact that BH_Shadow is set under
     * bh_state lock together with refiling to BJ_Shadow list and at this
     * point we know the buffer doesn't have BH_Shadow set).
     *
     * Subtle point, though: if this is a get_undo_access, then we will be
     * relying on the frozen_data to contain the new value of the
     * committed_data record after the transaction, so we HAVE to force the
     * frozen_data copy in that case.
     */
    if (jh->b_jlist == BJ_Metadata || force_copy) {
        if (!frozen_buffer) {
            spin_unlock(&jh->b_state_lock);
            frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
                           GFP_NOFS | __GFP_NOFAIL);
            goto repeat;
        }
        jh->b_frozen_data = frozen_buffer;
        frozen_buffer = NULL;
        jbd2_freeze_jh_data(jh);
    }
attach_next:
    smp_wmb();
    jh->b_next_transaction = transaction;
done:
    spin_unlock(&jh->b_state_lock);
---

inode

reference of inode

There are two reference of the inode,

i_nlink

This reference is the count of dentries that point to this inode and
it decides the lifecycle of the inode on disk.

i_count

This reference decides the lifecycle of the inode in memory

In this section, we would look into the second kind of reference above.

gone of the inode

void iput(struct inode *inode)
{
    ...
        if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
        ...
        iput_final(inode);
        }
}
When the last reference of an inode is dropped, we may have two choices,
Look at the code,
iput_final()
---
    if (op->drop_inode)
        drop = op->drop_inode(inode);
    else
        drop = generic_drop_inode(inode);

    // (1) retain inode in cache if fs is active

    if (!drop &&
            !(inode->i_state & I_DONTCACHE) &&
            (sb->s_flags & SB_ACTIVE)) {
        inode_add_lru(inode);
        spin_unlock(&inode->i_lock);
        return;
    }
    ...
    WRITE_ONCE(inode->i_state, state | I_FREEING);
    if (!list_empty(&inode->i_lru))
        inode_lru_list_del(inode);
    spin_unlock(&inode->i_lock);

    // (2) evict the inode

    evict(inode);
---

get of the inode

We usually get the reference of the inode when look up in inode cache,
for example,
iget_locked()
---
    spin_lock(&inode_hash_lock);
    inode = find_inode_fast(sb, head, ino);
    ---
        hlist_for_each_entry(inode, head, i_hash) {
            if (inode->i_ino != ino)
                continue;
            if (inode->i_sb != sb)
                continue;
            spin_lock(&inode->i_lock);
            if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                __wait_on_freeing_inode(inode);
                goto repeat;
            }
            ...
            __iget(inode);
            spin_unlock(&inode->i_lock);
            return inode;
        }
    ---
    spin_unlock(&inode_hash_lock);
---

The reference get of inode is done under inode->i_lock which is protect it
against iput_final.

The main user of inode is dcache.

           fd         user
------------------------------
           file       kernel
            v         file->f_path.dentry
          dcache
            v         dentry->d_inode
          inode

put

__fput()
---
    dput(dentry);
    ...
    mntput(mnt);
---
dput()
  -> dentry_kill()
    -> __dentry_kill()
      -> dentry_unlink_inode()
        -> iput()

lazytime mode

The commit is

commit 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8
Author: Theodore Ts'o 
Date:     Mon Feb 2 00:37:00 2015 -0500

        vfs: add support for a lazytime mount option
        
        Add a new mount option which enables a new "lazytime" mode.    This mode
        causes atime, mtime, and ctime updates to only be made to the
        in-memory version of the inode.    The on-disk times will only get
        updated when (a) if the inode needs to be updated for some non-time
        related change, (b) if userspace calls fsync(), syncfs() or sync(), or
        (c) just before an undeleted inode is evicted from memory.
        
        This is OK according to POSIX because there are no guarantees after a
        crash unless userspace explicitly requests via a fsync(2) call.
        
        For workloads which feature a large number of random write to a
        preallocated file, the lazytime mount option significantly reduces
        writes to the inode table.    The repeated 4k writes to a single block
        will result in undesirable stress on flash devices and SMR disk
        drives.    Even on conventional HDD's, the repeated writes to the inode
        table block will trigger Adjacent Track Interference (ATI) remediation
        latencies, which very negatively impact long tail latencies --- which
        is a very big deal for web serving tiers (for example).
        
        Google-Bug-Id: 18297052
        
        Signed-off-by: Theodore Ts'o 
        Signed-off-by: Al Viro

Firstly, let's look at the 3 time fields in POSIX,

ctime, create time
atime, access time
mtime, modify time

And they will be modified in following code path,

atime

file_accessed()
    -> touch_atime()
        -> update_time() //S_ATIME

file_accessed() could be invoked by
 - generic_file_read_iter() // in direct IO path
 - generic_file_buffered_read()
 - generic_file_mmap()
 - ext4_dax_read_iter()
 ...

ctime and mtime

file_update_time()
    -> update_time() // S_MTIME or S_CTIME

filemap_page_mkwrite()
    -> file_update_time()

__generic_file_write_iter() //cover both buffer and direct IO path
    -> file_update_time()

The time of inode is updated in generic_update_time,

generic_update_time()
    -> __mark_inode_dirty()
        -> sb->s_op->dirty_inode()
             ext4_dirty_inode()
         ---

                // If flags only contain I_DIRTY_TIME, just return and leave the
                // modified times fields in memory

                if (flags == I_DIRTY_TIME)
                        return;
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                ...
                ext4_mark_inode_dirty(handle, inode);
                    -> __ext4_mark_inode_dirty()
                        -> ext4_mark_iloc_dirty()
                            -> ext4_do_update_inode() // synchronize the inode in-memory to the one on-disk

                ext4_journal_stop(handle);
         ---

If we look into the __mark_inode_dirty,
we could find out that I_DIRTY_TIME is set on inode->i_state, and the inode is inserted into the wb->b_dirty_time
There is no modifications on the on-disk inode buffer in-memory, what does the wb flush ?

When will the inode on wb->b_dirty_time be flushed ?

wb_writeback()
    -> queue_io()
    ---
        if (!work->for_sync)
                time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
        moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
                                         time_expire_jif);
    ---

unsigned int dirtytime_expire_interval = 12 * 60 * 60;

See it !?, it is 12 hours

How to write out the dirty inode with I_DIRTY_TIME

__writeback_single_inode()
---
        ret = do_writepages(mapping, wbc);

        /*
         * Make sure to wait on the data before writing out the metadata.
         * This is important for filesystems that modify metadata on data
         * I/O completion. We don't do it for sync(2) writeback because it has a
         * separate, external IO completion path and ->sync_fs for guaranteeing
         * inode metadata is written back correctly.
         */

        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
                int err = filemap_fdatawait(mapping);
                if (ret == 0)
                        ret = err;
        }

        spin_lock(&inode->i_lock);

        dirty = inode->i_state & I_DIRTY;
        if ((inode->i_state & I_DIRTY_TIME) &&
                ((dirty & I_DIRTY_INODE) ||
                 wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
                 time_after(jiffies, inode->dirtied_time_when +
                        dirtytime_expire_interval * HZ))) {
                dirty |= I_DIRTY_TIME;
                trace_writeback_lazytime(inode);
        }
        inode->i_state &= ~dirty;
        ...
        smp_mb();

        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state |= I_DIRTY_PAGES;

        spin_unlock(&inode->i_lock);


        //See it ? I_DIRTY_SYNC will be set here and ext4_dirty_inode() won't do
        nothing anymore


        if (dirty & I_DIRTY_TIME)
                mark_inode_dirty_sync(inode);
        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & ~I_DIRTY_PAGES) {
                int err = write_inode(inode, wbc);
                if (ret == 0)
                        ret = err;
        }
---

Well, a normal case that flush out the time fields of inode should be in iput()

void iput(struct inode *inode)
{
        if (!inode)
                return;
        BUG_ON(inode->i_state & I_CLEAR);
retry:
        if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
                if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
                        atomic_inc(&inode->i_count);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_lazytime_iput(inode);

                        mark_inode_dirty_sync(inode);

                        goto retry;
                }
                iput_final(inode);
        }
}

inode cache

Where is the inode cache ?

        inode_hashtable =
                alloc_large_system_hash("Inode-cache",
                                        sizeof(struct hlist_head),
                                        ihash_entries,
                                        14,
                                        HASH_ZERO,
                                        &i_hash_shift,
                                        &i_hash_mask,
                                        0

The inode would be inserted into this hash table to be looked up quickly.

INSERT

insert_inode_locked() <- __ext4_new_inode()

// Look up in cache, create a new one if not exist(I_NEW).

iget_locked() <- __ext4_iget()


Search for the inode specified by @hashval and @data in the inode cache,
and if present it is return it with an increased reference count. This is
a generalized version of iget_locked() for file systems where the inode
number is not sufficient for unique identification of an inode.

inode_insert5 <- iget5_locked()
                            <- insert_inode_locked4()

__insert_inode_hash() <- insert_inode_hash()

LOOKUP

A very common code path should be,
ext4_lookup()
---

        //Try to lookup the file in directory entry

        bh = ext4_lookup_entry(dir, dentry, &de);
        if (bh) {
                __u32 ino = le32_to_cpu(de->inode);
                brelse(bh);
                ...

                // Get a valid inode id, try to get the inode for it

                inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);
                    -> __ext4_iget()
                    ---
                        inode = iget_locked(sb, ino);
                        ...

                        // It is cached in inode cache

                        if (!(inode->i_state & I_NEW))
                                return inode;

                        ei = EXT4_I(inode);
                        iloc.bh = NULL;

                        // Get the inode on disk

                        ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
                        ...
                        raw_inode = ext4_raw_inode(&iloc);
                    ---
---

Some filesystems, such as xfs, maintain a inode cache itself.

xfs_lookup()
    -> xfs_dir_lookup()
    -> xfs_iget()
    ---

        // See it ? this is a fs private inode cache and it is more scalable

        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
        agino = XFS_INO_TO_AGINO(mp, ino);

again:
        error = 0;
        rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root, agino);

        if (ip) {
                error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
                ...
        } else {
                rcu_read_unlock();
                ...

                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
                                                        flags, lock_flags);
                                                        
                ...
                ---

                // Allocate xfs_inode where a vfs inode is embedded in

                ip = xfs_inode_alloc(mp, ino);
                ...
                } else {

                        // get the xfs_buf for this inode

                        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);

                        // fill the inode with data on disk

                        error = xfs_inode_from_disk(ip, dip);
                        xfs_trans_brelse(tp, bp);
                }
                ...

                iflags = XFS_INEW;

                if (flags & XFS_IGET_DONTCACHE)
                        d_mark_dontcache(VFS_I(ip));
                xfs_iflags_set(ip, iflags);

                /* insert the new inode */
                spin_lock(&pag->pag_ici_lock);
                error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
                spin_unlock(&pag->pag_ici_lock);
                ---
        }
        xfs_perag_put(pag);
    ---

Another thing need to be talked is the way to reclaim indoe cache

Which one could be reclaimed


For xfs

iput()
    -> iput_final()
    ---
        if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);

        // In generic_drop_inode, there are 3 conditions,
        //    - !inode->i_nlink, means the file or hardlink have been cut off
        //    - inode_unhashed(), means this inode is not in inode hash table
                    for xfs, this true, because it use inode cache of its own
        //    - I_DONTCACHE

        if (!drop && (sb->s_flags & SB_ACTIVE)) {
                inode_add_lru(inode);
                spin_unlock(&inode->i_lock);
                return;
        }
        ...
        WRITE_ONCE(inode->i_state, state | I_FREEING);
        if (!list_empty(&inode->i_lru))
                inode_lru_list_del(inode);
        spin_unlock(&inode->i_lock);

        evict(inode);
    ---
evict()
    -> destroy_inode()
        -> sb ops->destroy_inode()
             xfs_fs_destroy_inode()
                 -> xfs_inactive() //Won't do more because i_nlink is not zero>
                 -> xfs_inode_set_reclaim_tag()
                 --
                         xfs_perag_set_reclaim_tag(pag);
                        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
                 --

This XFS_IRECLAIMABLE will be handled by
xfs_iget_cache_hit()
---

        /*
         * If IRECLAIMABLE is set, we've torn down the VFS inode already.
         * Need to carefully get it back into useable state.
         */

        if (ip->i_flags & XFS_IRECLAIMABLE) {
                ...
        }
---
For normal
prune_icache_sb()
    -> inode_lru_isolate()
    ---

        /*
         * Referenced or dirty inodes are still in use. Give them another pass
         * through the LRU as we canot reclaim them now.
         */

        if (atomic_read(&inode->i_count) ||
                (inode->i_state & ~I_REFERENCED)) {
                list_lru_isolate(lru, &inode->i_lru);
                spin_unlock(&inode->i_lock);
                this_cpu_dec(nr_unused);
                return LRU_REMOVED;
        }
    ---

Dcache

shrink of dcache

shrinker

prune_dcache_sb()
---
        freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
                                         dentry_lru_isolate, &dispose);
        shrink_dentry_list(&dispose);
---
dentry_lru_isolate()
---
        if (!spin_trylock(&dentry->d_lock))
                return LRU_SKIP;

        /*
         * Referenced dentries are still in use. If they have active
         * counts, just remove them from the LRU. Otherwise give them
         * another pass through the LRU.
         */

        if (dentry->d_lockref.count) {
                d_lru_isolate(lru, dentry);
                spin_unlock(&dentry->d_lock);
                return LRU_REMOVED;
        }

        if (dentry->d_flags & DCACHE_REFERENCED) {
                dentry->d_flags &= ~DCACHE_REFERENCED;
                spin_unlock(&dentry->d_lock);
                return LRU_ROTATE;
        }

        d_lru_shrink_move(lru, dentry, freeable);
        spin_unlock(&dentry->d_lock);

        return LRU_REMOVED;
---

shrink_dentry_list()
    -> __dentry_kill()
        -> __d_drop()
            -> ___d_drop() //remove the dentry from the hashtable
        -> dentry_free()
            -> call_rcu(&dentry->d_u.d_rcu, __d_free);Nothing will be freed under rcu_read_lock

get and put

dget()
---
        if (dentry)
                lockref_get(&dentry->d_lockref);
        return dentry;
---

dput()
---
        rcu_read_lock();
                if (likely(fast_dput(dentry))) {
                        rcu_read_unlock();
                        return;
                }

                /* Slow case: now with the dentry lock held */

                rcu_read_unlock();

                if (likely(retain_dentry(dentry))) {
                        spin_unlock(&dentry->d_lock);
                        return;
                }

                dentry = dentry_kill(dentry);
---
retain_dentry()
---

        //vfs_unlink() -> __d_drop() could cause this.

        if (unlikely(d_unhashed(dentry)))
                return false;
        ...

        // Two rounds in lru

        dentry->d_lockref.count--;
        if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
                d_lru_add(dentry);
        else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
                dentry->d_flags |= DCACHE_REFERENCED;
        return true;
---

To prevent dentry from being reclaimed,
(1) hold a reference with dget() (d_lru_isolate() will remove it from lru list in dentry_lru_isolate)
(2) use it and then DCACHE_REFERENCED will always be set

lookup

overview

do_filp_open()
---

        // Try to use LOOKUP_RCU first, if failed with ECHILD,
        // try again w/o LOOKUP_RCU

        filp = path_openat(&nd, op, flags | LOOKUP_RCU);
        if (unlikely(filp == ERR_PTR(-ECHILD)))
                filp = path_openat(&nd, op, flags)
---

path_openat()
---
                const char *s = path_init(nd, flags); //rcu_read_lock is get here

                /home/jianchwa/linux-stable/fs/dcache.c 
                \____________ _______________/ \__ __/
                                         v                                        v
                            link_path_walk()         open_last_lookups()

                while (!(error = link_path_walk(s, nd)) &&
                             (s = open_last_lookups(nd, file, op)) != NULL)
                        ;
                if (!error)
                        error = do_open(nd, file, op); // do the finally open
                terminate_walk(nd);

---

fast lookup under rcu_read_lock

There are 3 points here regarding to fast lookup
(1) it is only to accelerate the dentry cache hit cases
        if not hit, lookup would fallback to non-rcu mode
(2) dentry won't be freed during this but maybe killed
(3) sequence count is used to protect the lookup against rename

Regarding to the point 3,
vfs_rename()
    -> d_move()
        -> __d_move()
        ---
        spin_lock_nested(&dentry->d_lock, 2);
        spin_lock_nested(&target->d_lock, 3);
        ...
        write_seqcount_begin(&dentry->d_seq);
        write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);

        /* unhash both */
        if (!d_unhashed(dentry))
                ___d_drop(dentry);
        if (!d_unhashed(target))
                ___d_drop(target);

        /* ... and switch them in the tree */
        dentry->d_parent = target->d_parent;
        if (!exchange) {
                copy_name(dentry, target);
                target->d_hash.pprev = NULL;
                dentry->d_parent->d_lockref.count++;
                if (dentry != old_parent) /* wasn't IS_ROOT */
                        WARN_ON(!--old_parent->d_lockref.count);
        }
        ...
        list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
        __d_rehash(dentry);
        ...
        write_seqcount_end(&target->d_seq);
        write_seqcount_end(&dentry->d_seq);
        ---

__d_lookup_rcu()
---
        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                unsigned seq;

seqretry:

                /*
                 * The dentry sequence count protects us from concurrent
                 * renames, and thus protects parent and name fields.
                 * Otherwise, we may get a wrong entry, for example,
                 * during rename /home/will/aaaa to /home/will/bbbb,
                 * we could get baaa, bbaa, bbba, and if these files
                 * do exist, look up is screwed up.
                 */

                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;

                if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
                        ...
                } else {
                        if (dentry->d_name.hash_len != hashlen)
                                continue;
                        if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
                                continue;
                }
                *seqp = seq;
                return dentry;
        }
---

lookup_fast()
---
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
                dentry = __d_lookup_rcu(parent, &nd->last, &seq);
                ...
                *inode = d_backing_inode(dentry);
                if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
                        return ERR_PTR(-ECHILD);

                /*
                 * This sequence count validates that the parent had no
                 * changes while we did the lookup of the dentry above.
                 */

                 Why do we need to care about the parent ?
                 renaming of the parent won't influence the childrens, right ?

                if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
                        return ERR_PTR(-ECHILD);

                *seqp = seq;
                status = d_revalidate(dentry, nd->flags);
                if (likely(status > 0))
                        return dentry;
        }
---

Before the finally open, we would recheck the sequence count,
do_open()
    -> complete_walk()
        -> unlazy_walk()
            -> legitimize_path()
                     ---
                        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
                                path->dentry = NULL;
                                return false;
                        }
                        return !read_seqcount_retry(&path->dentry->d_seq, seq);
                    ---

dcache and metadata

The main points that dcache interacts with filesystem

link_path_walk()
    -> walk_component()
        -> lookup_slow()
            -> inode_lock_shared()
            -> __lookup_slow()
                -> d_alloc_parallel() //allocate a dentry structure
                -> inode->i_op->lookup()

                    look up dentry in metadata,
                    do d_add() if found which would add dentry into dcache hashtable                         

open_last_lookups()
    -> lookup_open()
        -> d_lookup() //do look up under rename raed seqlock
        -> dir_inode->i_op->create() //do create if needed

add to dcache hash

When a dentry is in the dcache hash table, namely, dentry_hashtable
__d_lookup could find it and d_unhashed() returns false.
When is the dentry inserted to the dentry_hashtable ?

lookup_open

When the file does not exist,
lookup_open()

    // allocate dentry structure with d_alloc()

    -> d_alloc_parallel()
    -> dir_inode->i_op->lookup()
         ext2_lookup()
             -> ext2_inode_by_name() //not found at this moment
             -> d_splice_alias() //inode is NULL at this moment
    -> dir_inode->i_op->create()
         ext2_create()
             -> ext2_add_nondir()
                 -> d_instantiate_new()
                     -> d_instantiate() //fill inode information for a dentry

lookup

When the file exists,
lookup_slow()
    -> ext2_lookup()
        -> d_splice_alias()
            -> __d_add()
                -> __d_set_inode_and_type() // install the inode into the dentry
                    -> __d_rehash()

Quota

There are two basic types of disk quotas

block quota, limits the amount of disk space that can be used

inode quota, limits the number of files and directories that can be created

The disk quotas can be set per 3 roles,

user
group
project

There are two limit,

a warning level, or soft quota, at which users are informed they are nearing their limit
effective limit, or hard quota. There may also be a small grace interval, which allows users to temporarily violate their quotas by certain amounts if necessary.

Where to store quota data

The quota data is in memory in runtime and need to be persistted on disk.
On disk disk quota entry format,

struct v2r1_disk_dqblk {
    __le32 dqb_id;        /* id this quota applies to */
    __le32 dqb_pad;
    __le64 dqb_ihardlimit;    /* absolute limit on allocated inodes */
    __le64 dqb_isoftlimit;    /* preferred inode limit */
    __le64 dqb_curinodes;    /* current # allocated inodes */
    __le64 dqb_bhardlimit;    /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
    __le64 dqb_bsoftlimit;    /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
    __le64 dqb_curspace;    /* current space occupied (in bytes) */
    __le64 dqb_btime;    /* time limit for excessive disk use */
    __le64 dqb_itime;    /* time limit for excessive inode use */
};

in-memory cache

dqget()
  -> find_dquot()// under dq_list_lock
  ---
    hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash)
        if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid))
            return dquot;
  ---

on-disk data

The disk quota data is persistted in reserved inode of filesystem.

Look at the ext4,

ext4_enable_quotas()
  -> ext4_quota_enable()
  ---
    unsigned long qf_inums[EXT4_MAXQUOTAS] = {
        le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
        le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
        le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
    };

    qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);

    /* Don't account quota for quota files to avoid recursion */

    qf_inode->i_flags |= S_NOQUOTA;
    err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
    iput(qf_inode);
  ---

The dquot file is formated as a hash tree,
                       |.|.|
                        / \
                    |.|.| |.|.|
                     / \   / \
                |.|.|   ...   |.|.|
                 / \           / \
             |.|.| |.|.|...|.|.| |.|.|
              |
         |e|e|e|...|

 - The block size is 1024
 - Every tree block contains 256 Indexes
 - Tree level is fixed to 4 level, refer to qtree_depth()
 - dquot entry is flat in the leaf block.

When a leaf block is empty, it will be linked into the free list in

How to use quota data

syncfs

sync the dquot memory cache into disk,
ext4_sync_fs()
  -> dquot_writeback_dquots()

write one

ext4_write_dquot()
  -> ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
  -> dquot_commit()
    -> v2_write_dquot()
      ->qtree_write_dquot()
        ->ext4_quota_write
---
    handle_t *handle = journal_current_handle();

    do {
        bh = ext4_bread(handle, inode, blk,
                EXT4_GET_BLOCKS_CREATE |
                EXT4_GET_BLOCKS_METADATA_NOFAIL);
    } while (PTR_ERR(bh) == -ENOSPC &&
         ext4_should_retry_alloc(inode->i_sb, &retries));
    err = ext4_journal_get_write_access(handle, bh);
    lock_buffer(bh);
    memcpy(bh->b_data+offset, data, len);
    flush_dcache_page(bh->b_page);
    unlock_buffer(bh);
    err = ext4_handle_dirty_metadata(handle, NULL, bh);
    brelse(bh);
---

install when open file

ext4_file_open()
  -> dquot_file_open()
    -> __dquot_initialize()
    ---
    /* First get references to structures we might need. */
    for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
        ...
        switch (cnt) {
        case USRQUOTA:
            qid = make_kqid_uid(inode->i_uid);
            break;
        case GRPQUOTA:
            qid = make_kqid_gid(inode->i_gid);
            break;
        case PRJQUOTA:
            rc = inode->i_sb->dq_op->get_projid(inode, &projid);
            qid = make_kqid_projid(projid);
            break;
        }
        dquot = dqget(sb, qid);
        got[cnt] = dquot;
    }
    ---

alloc block

ext4_mb_new_blocks()
  -> dquot_alloc_block()
    -> dquot_alloc_space()
      -> dquot_alloc_space_nodirty()
        -> __dquot_alloc_space()
          -> dquot_add_space()
          -> __inode_add_bytes()_
          -> mark_all_dquot_dirty()
            -> mark_dquot_dirty()
               ext4_mark_dquot_dirty()
               ---
            if (ext4_is_quota_journalled(sb)) {
                dquot_mark_dquot_dirty(dquot);
                return ext4_write_dquot(dquot);
            } else {
                return dquot_mark_dquot_dirty(dquot);
            }
               ---

alloc inode

__ext4_new_inode()
This is invoked after inode is created
  -> dquot_alloc_inode()

Ext4

Layout

Block Group

For the filesystem block size == 4K, The 1st block group is as following,

|rrrrssss....|ggggggggg|rgrgrgrgrg|dd|ii|tttt|....|

rrrr: reserved 1024 bytes for x86 boot sectors
ssss: in the same fs block with rrrr, filesystem super block
gggg: block group descriptors in a contiguous space
rgrg: reserved group descriptors table for filesystem resize,
dd  : data block bitmap
ii  : inode block bitmap
tttt: inode table
      All of positions above can be got from block group descriptor

During .fill_super, all of the buffer_head's of block group descriptors are loaded into memory,

ext4_fill_super()
---

    /* Pre-read the descriptors into the buffer cache */

    for (i = 0; i < db_count; i++) {
        block = descriptor_loc(sb, logical_sb_block, i);
        ext4_sb_breadahead_unmovable(sb, block);
    }

    for (i = 0; i < db_count; i++) {
        struct buffer_head *bh;

        block = descriptor_loc(sb, logical_sb_block, i);
        bh = ext4_sb_bread_unmovable(sb, block);
        rcu_read_lock();
        rcu_dereference(sbi->s_group_desc)[i] = bh;
        rcu_read_unlock();
    }
    sbi->s_gdb_count = db_count;
---

descriptor_loc() is used to calculate the position of block group descriptors,
---

    //When meta block group is not enabled, it is very simple

    if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
        return logical_sb_block + nr + 1;
---

ext4 use block group descriptor to get the position of data bitmap, inode bitmap
and inode table. For example,

__ext4_get_inode_loc()
---
    iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
    gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);

    /*
     * Figure out the offset within the block group inode table
     */

    inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
    inode_offset = ((ino - 1) %    EXT4_INODES_PER_GROUP(sb));
    block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
    iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

    bh = sb_getblk(sb, block);
---

Extent

Format

There are mainly 3 data structure here,
struct ext4_extent_header {
    __le16    eh_magic;    /* probably will support different formats */
    __le16    eh_entries;    /* number of valid entries */
    __le16    eh_max;            /* capacity of store in entries*/
    __le16    eh_depth;        /* has tree real underlying blocks? */
    __le32    eh_generation;    /* generation of the tree */
};

struct ext4_extent_idx {
    __le32    ei_block;    /* index covers logical blocks from 'block' */
    __le32    ei_leaf_lo;    /* pointer to the physical block of the nex level. leaf or next index could be there */
    __le16    ei_leaf_hi;    /* high 16 bits of physical block */
    __u16    ei_unused;
};

struct ext4_extent {
    __le32    ee_block;    /* first logical block extent covers */
    __le16    ee_len;        /* number of blocks covered by extent */
    __le16    ee_start_hi;    /* high 16 bits of physical block */
    __le32    ee_start_lo;    /* low 32 bits of physical block */
};

Note:
 - ext4_extent.ee_block is 32bits, so the max file size of ext4 is 2^32 * 2^12 (fs blk) = 2^44 (16TiB)
 - ee_start_hi/ee_start_lo is 48bits, so the max block device size of ext4 is 2^48 * 2^12 = 2^60 (1EiB)
 - highest bit of ee_len is to indicates unwritten state, refer to ext4_ext_mark_unwritten()
 - if eh_depth > 0, ext4_extent_idx follows it, otherwise ext4_extent
 - max level of extent btree is 5, (4 * (((2^12 - 12)/12) ^ n) >= 2^32, n = 5)


              inode->i_block  (inode inlined)
                   |            
                [hiiii] eh.eh_max == 4
                 /   \
          [hiiii]     [hiiii] 4K - sizeof(eh) / sizeof(ei)
           /  \          \
    [heeee]   [heeee]    [heeee]

ei and ee in the nodes are sorted by logical block

Split

                [hiii-]
                 /  \
          [hiii-]    [hiiii]
           /   \         \         ^
    [hee-e]    [heee-]   [heeee]   | walk from down to up

ext4_ext_create_new_leaf()
---

    /* walk up to the tree and look for where need new index entry */

    curp = path + depth;
    while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
        i--;
        curp--;
    }
---

                [hiiii]______
                 /  \        \
          [hiii-]    [hiii] [hii]
           /   \         \     \ 
    [hee-e]    [heee-]   [hee] [hee]

Split entries from the full ee and ei's and then build a new sub tree above.
Then insert it to tree.

Note: the new node [hee] is always 1st entry of new [hii]

Let's look at the code,
ext4_ext_split()
---

    //leaf node has been setup in similar way

    while (k--) {
        oldblock = newblock;
        newblock = ablocks[--a];
        bh = sb_getblk(inode->i_sb, newblock);
        lock_buffer(bh);
        err = ext4_journal_get_create_access(handle, bh);
        neh = ext_block_hdr(bh);

        //setup the new node's header

        neh->eh_entries = cpu_to_le16(1);
        neh->eh_magic = EXT4_EXT_MAGIC;
        neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
        neh->eh_depth = cpu_to_le16(depth - i);
        neh->eh_generation = 0;
        fidx = EXT_FIRST_INDEX(neh);

        //insert the 1st one, namely, the newly split child
        //logical block is same across all new ei and ee

        fidx->ei_block = border;
        ext4_idx_store_pblock(fidx, oldblock);
        /* start copy indexes */
        m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
        if (m) {
            memmove(++fidx, path[i].p_idx,    sizeof(struct ext4_extent_idx) * m);
            le16_add_cpu(&neh->eh_entries, m);
        }

        //zero out unused area in the extent block
        //Is it necessary ? eh->eh_entries should be enough

        ext_size = sizeof(struct ext4_extent_header) + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
        ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        err = ext4_handle_dirty_metadata(handle, inode, bh);

        //correct old index
        //See it ? only update eh->eh_entries here

        if (m) {
            err = ext4_ext_get_access(handle, inode, path + i);
            le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
            err = ext4_ext_dirty(handle, inode, path + i);
        }
        i--;
    }

    /* insert new index */
    err = ext4_ext_insert_index(handle, inode, path + at,
                    le32_to_cpu(border), newblock);

---

Grow level

All of the ei and ee are full, including the top one that's in inode->i_block
                [hiiii]                     copy inode->i_block into new block          
                 /  \                                     [hiiii]         
          [hiiii]    [hiiii]         --\                   /  \           
           /   \         \           --/              [hiiii]    [hiiii]           
    [heeee]    [heeee]   [heeee]                     /   \         \             
                                              [heeee]    [heeee]   [heeee]

          inode->i_block [hi]                           ||
                          |                             \/
                        [hiiii]      link the new block to inode->i_block
                         /  \           
                    [hiiii  [hiiii]           
                   /   \         \             
            [heeee]    [heeee]   [heeee]
Look at the code,
ext4_ext_grow_indepth()
---
    bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
    lock_buffer(bh);
    err = ext4_journal_get_create_access(handle, bh);
    ext_size = sizeof(EXT4_I(inode)->i_data);

    /* move top-level index/leaf into new block */

    memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
    /* zero out unused area in the extent block */
    memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);

    /* set size of new block */
    neh = ext_block_hdr(bh);

    //if ext_depth(inode) is zero, inode->i_block contain ee.
    //then new node carries the initial 4 ee's.

    if (ext_depth(inode))
        neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
    else
        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
    neh->eh_magic = EXT4_EXT_MAGIC;
    err = ext4_handle_dirty_metadata(handle, inode, bh);

    /* Update top-level index: num,max,pointer */

    neh = ext_inode_hdr(inode);
    neh->eh_entries = cpu_to_le16(1);
    ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
    if (neh->eh_depth == 0) {

        /* Root extent block becomes index block */

        neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
        EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block;
    }
    le16_add_cpu(&neh->eh_depth, 1);
    err = ext4_mark_inode_dirty(handle, inode);
---

Remove

When we need to remove blocks ?
ext4_ext_truncate()
ext4_collapse_range()
ext4_punch_hole()

                          [hii]                             3
                        [hii] [hii]                         2
         [hii]     [hii]          [hii]      [hii]          1
    [hee] [hee]  [hee]  [hee]  [hee]  [hee]  [hee] [hee]    0
                                             <--- rm direction
                           ||
                           \/

                          [hii]                             3
                        [hii] [hii]                         2
         [hii]     [hi]               [h]      [h]          1
    [hee] [hee]  [hee]                                      0

                           ||
                           \/

                          [hii]                             3
                        [hii] [h]                          2
         [hii]     [hi]                                     1
    [hee] [hee]  [hee]                                      0


                           ||
                           \/

                          [hi]                              3
                        [hii]                               2
         [hii]     [hi]                                     1
    [hee] [hee]  [hee]                                      0


The code is very complicated, so we don't show all of it here, but just some
important part.
(1) continue or next level
                          [hii]                             3
                        [hii] [hii]                         2
         [hii]     [hii]          [hii]      [hii]          1
    [hee] [hee]  [hee]  [hee]  [hee]  [hee]  [hee] [hee]    0

ext4_ext_rm_leaf() can only handle one node, namely a [hee], when to access the
next one (the right one) ?
ext4_ext_remove_space()
---
    while (i >= 0 && err == 0) {
        if (i == depth) {
            /* this is leaf block */
            err = ext4_ext_rm_leaf(handle, inode, path,
                           &partial, start, end);
            /* root level has p_bh == NULL, brelse() eats this */
            brelse(path[i].p_bh);
            path[i].p_bh = NULL;
            i--;
            continue;
        }

        if (!path[i].p_idx) {
        } else {
            /* we were already here, see at next index */
            path[i].p_idx--;
        }


        // determine based on path->p_idx and EXT_FIRST_INDEX(path->p_hdr)
                   first  now            first 
                     v    v                v  
                    heeeeeeeeeeee         heeeeeeeeeeee
                                          ^
                                          now
        // no more need to remove         previous one is need to be removed

        if (ext4_ext_more_to_rm(path + i)) {
            struct buffer_head *bh;

            //Read in the ee or ei block

            bh = read_extent_tree_block(inode, ext4_idx_pblock(path[i].p_idx), depth - i - 1, EXT4_EX_NOCACHE);
            i++;
        } else {
        }
    }
---
   
(2) revoke ?
When free metadata block, we need to invoke them

ext4_remove_blocks()
---

    flags = get_default_free_blocks_flags(inode);
    ---
        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
            return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
    ---
    ...
    flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
    ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
---
ext4_ext_rm_idx()
---
    ext4_free_blocks(handle, inode, NULL, leaf, 1,
             EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
---

Delayed Allocation

Delayed allocation, where the filesystem defers the allocation of blocks on disk for data
written by applications until that data is actually written to disk. The idea is to wait
until the application finishes its operations on the file, then allocate the actual number
of data blocks needed on the disk at once. This optimization limits unneeded operations
related to short-lived, small files, batches large writes, and helps ensure that data space
is allocated contiguously.

write to page cache

generic_perform_write()
---
    do {
        ...
        status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                        &page, &fsdata);

         // 'i' here is iov_iter which contains the user buffer
         // write_begin callback return one page every time.

        copied = copy_page_from_iter_atomic(page, offset, bytes, i);
        status = a_ops->write_end(file, mapping, pos, bytes, copied,
                        page, fsdata);
        ...

        balance_dirty_pages_ratelimited(mapping);
    } while (iov_iter_count(i));
---

ext4_da_write_begin()
  -> __block_write_begin(page, pos, len, ext4_da_get_block_prep);
    -> ext4_da_get_block_prep()
      -> ext4_da_map_blocks()
        -> ext4_insert_delayed_block() //ext4_ext_map_blocks() returns 0
          -> ext4_da_reserve_space() //Reserve space in memory
          -> ext4_es_insert_delayed_block()

extent status rb tree maintains extent status including,
//These flags live in the high bits of extent_status.es_pblk
enum {
    ES_WRITTEN_B,
    ES_UNWRITTEN_B,
    ES_DELAYED_B,
    ES_HOLE_B,
    ES_REFERENCED_B,
    ES_FLAGS
};

flush to disk

In ext4_writepages(), we first need to get the extent in which pages have same
state, 
 - Delayed, pages with delayed allocation,
 - Unwritten, fallocate
 - Mapped, pages with Written state
 - Dirty, has dirty data in cache

This is done in mpage_prepare_extent_to_map(), it first collects the pages on
demand, check their state and get the contiguous pages with same state.
mpage_prepare_extent_to_map()
  -> mpage_process_page_bufs()
    -> mpage_add_bh_to_extent()
    ---
    if (map->m_len == 0) {
        /* We cannot map unless handle is started... */
        if (!mpd->do_map)
            return false;
        map->m_lblk = lblk;
        map->m_len = 1;

        // BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))

        map->m_flags = bh->b_state & BH_FLAGS;
        return true;
    }

    /* Don't go larger than mballoc is willing to allocate */
    // MAX_WRITEPAGES_EXTENT_LEN is 2048 (2K * 4K = 8M)

    if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
        return false;

    /* Can we merge the block to our big extent? */

    if (lblk == map->m_lblk + map->m_len &&
        (bh->b_state & BH_FLAGS) == map->m_flags) {
        map->m_len++;
        return true;
    }
    return false;
    ---

ext4_writepages()
  -> mpage_map_and_submit_extent)()
    -> mpage_map_one_extent()
      -> ext4_map_blocks()//do reall allocation here
  -> mpage_map_and_submit_buffers()
    -> mpage_process_page()//update bhs' state 
    -> mpage_submit_page()//submit pages to io

truncate

ext4_truncate()
  -> ext4_ext_truncate()
    -> ext4_es_remove_extent()
  ---
    write_lock(&EXT4_I(inode)->i_es_lock);
    err = __es_remove_extent(inode, lblk, end, &reserved);
    write_unlock(&EXT4_I(inode)->i_es_lock);
    ext4_da_release_space(inode, reserved);
  ---

There seems not special optimization related to delayed allocation

Xattr

In Linux, the ext2, ext3, ext4, JFS, Squashfs, Yaffs2, ReiserFS, Reiser4, XFS, Btrfs,
OrangeFS, Lustre, OCFS2 1.6, ZFS, and F2FS[11] filesystems support extended attributes 
(abbreviated xattr) when enabled in the kernel configuration. Any regular file or directory 
may have extended attributes consisting of a name and associated data. The name must be a 
null-terminated string prefixed by a namespace identifier and a dot character. Currently, 
four namespaces exist: user, trusted, security and system. The user namespace has no 
restrictions with regard to naming or contents. The system namespace is primarily used by 
the kernel for access control lists. The security namespace is used by SELinux, for example.

The Linux kernel allows extended attribute to have names of up to 255 bytes and values of 
up to 64 KiB,[15] as do XFS and ReiserFS, but ext2/3/4 and btrfs impose much smaller limits, 
requiring all the attributes (names and values) of one file to fit in one "filesystem block" 
(usually 4 KiB). Per POSIX.1e,[citation needed] the names are required to start with one of 
security, system, trusted, and user plus a period. This defines the four namespaces of 
extended attributes.[16]

Extended attributes can be accessed and modified using the getfattr and setfattr commands 
from the attr package on most distributions.[17] The APIs are called getxattr and setxattr.

Space for xattr

Where is the xattr of ext4 stored ? There are 2 places,

inode body

Ext4 has an 256 inode size by default(can be configured), but the ext4_inode structure is only 160
bytes. The left space is for storing xattr.

|iiiiiiixxxxx|iiiiiiixxxxx|iiiiiiixxxxx|iiiiiiixxxxx|

i: inode on disk
x: xattrs

ext4_xattr_ibody_get()
---
    error = ext4_get_inode_loc(inode, &iloc);
    raw_inode = ext4_raw_inode(&iloc);
    header = IHDR(inode, raw_inode);

        ((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE(128) + EXT4_I(inode)->i_extra_isize)

    end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
    entry = IFIRST(header);
    error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
---

external block, a filesystem block pointed by inode.i_file_acl_lo | i_file_acl_high

ext4_xattr_block_get()
---

    //one bh is enough as its size is one filesystem block

    bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
    entry = BFIRST(bh);
    end = bh->b_data + bh->b_size;
    error = xattr_find_entry(inode, &entry, end, name_index, name, 1);
---

extended attribute inode

If value of an attribute is greater than 2048 bytes the value is not
saved in the external EA block, instead it is saved in an inode. The EA
entry saves the inode number in e_value_inum field (earlier this was
e_value_block that was unused). The maximum size of the EA is limited to
64K due to VFS limitations as can be seen in linux/limits.h. A new
EXT4_FEATURE_INCOMPAT_EA_INODE feature has been added for this.

ext4_xattr_entry.e_value_inum

The inode where the value is stored. Zero indicates the value is in the
same block as this entry. This field is only used if the INCOMPAT_EA_INODE
feature is enabled.


Note ! this kind of xattr entry can be stored in both inode body and external block.
                                                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ext4_xattr_ibody_get()/ext4_xattr_block_get()
---
        if (entry->e_value_inum) {

            error = ext4_xattr_inode_get(inode, entry, buffer,
                             size);

        } else {
            u16 offset = le16_to_cpu(entry->e_value_offs);
            void *p = bh->b_data + offset;

            memcpy(buffer, p, size);
        }
---

Dentry

DX

First let's look at some basic data structure

ext4_dir_entry_2

struct ext4_dir_entry_2 {
    __le32    inode;            /* Inode number */
    __le16    rec_len;        /* Directory entry length */
    __u8    name_len;        /* Name length */
    __u8    file_type;        /* See file type macros EXT4_FT_* below */
    char    name[EXT4_NAME_LEN];    /* File name */
};
EXT4_NAME_LEN is 255.
The reason why we still need rec_len when we have name_len,
    
     foo      doo    eoo   aoo
    |-----|--------|-----|------|
    
    after unlink eoo
    
     foo      doo          aoo
    |-----|--------------|------|
          \______ _______/
                 v
             doo.rec_len

The rec_len is used to jump to the next dentry.
Refer to the code of ext4_search_dir()

By default, the directory file is a list of ext4_dir_entry_2 linked by rec_len.
The lookup method is to serach a dentry in the list which is very ineffcient.
To imporve this, hash tree dentry is introduced.

dx_root

struct dx_root and dx_node of index
{
    struct fake_dirent dot; // for '.', rec_len = 12
    char dot_name[4];
    struct fake_dirent dotdot; // for '..', block_size - 12
    char dotdot_name[4];
    struct dx_root_info
    {
        __le32 reserved_zero;
        u8 hash_version;
        u8 info_length; /* 8 */
        u8 indirect_levels;
        u8 unused_flags;
    }
    info;
    struct dx_entry    entries[];
};

struct dx_node
{
    struct fake_dirent fake; //rec_len = block size
    struct dx_entry    entries[];
};

For backwards read-only compatibility with ext2, this tree is actually
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
hidden inside the directory file, the fake_dirent is to masquerad as
"empty" directory data blocks! The ext2 can read the leaf blocks.

dx_entry

struct dx_entry
{
    __le32 hash;
    __le32 block;
};

The first dx_entry of both root or node is special,

__le16     limit     Maximum number of dx_entries that can follow this header,
                plus 1 for the header itself.
__le16     count     Actual number of dx_entries that follow this header,
                plus 1 for the header itself.
__le32     block     The block number (within the directory file) that goes
                with the lowest hash value of this block. This value is
                stored in the parent block. For the root, this hash vaule
                is 0.

The directory file can be formated as following,

R: dx_root    N: dx_node    E: dx_entry     D: ext4_dir_entry_2

       {D/D/D/} {D/D/D/D} {D/D/D/D} {D/D/D/D}

or

                         {R E/E/E/E/E }        \
                      ...  /       \ ...       |
                  {N E/E/E/E}     {N E/E/E/E}   > index
                     /  \             ...      |
           {N E/E/E/E}  {N E/E/E/E}            /
                / \        ... 
       {D/D/D/D}  {D/D/D/D}                    }  dentry


(1) The E in index is sorted by hash
(2) The D in dentry is is a list linked by rec_len

Obviously, the index tree is introduced to promote the lookup performance.

Regarding to the dentry hash tree, there are following points,

probe

Look for the dx_entry of the leaf (a block filled by dentry)
dx_probe()
---
    entries = (struct dx_entry *)(((char *)&root->info) +
                      root->info.info_length);
    while (1) {
        count = dx_get_count(entries);
        p = entries + 1;
        q = entries + count - 1;
        while (p <= q) {
            m = p + (q - p) / 2;
            if (dx_get_hash(m) > hash)
                q = m - 1;
            else
                p = m + 1;
        }

        at = p - 1;
        frame->entries = entries;
        frame->at = at;

        if (!indirect--)
            return frame;

        frame++;
        frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
        entries = ((struct dx_node *) frame->bh->b_data)->entries;
    }
---

split

When a block is full, we need to split it into two

       []   -\      []
       /    -/     /  \
   [xxxxxx]    [xxx]  [xxx]

- index dx_node, split the dx_entry by half, as dx_entry's size is fixed
  refer to code of ext4_dx_add_entry
- leaf blocks, split it by the size of dentries, as the dentry's size is different
  refer to the code of do_split

Note, before do leaf block, we need to check whether the space of index dx_node
is enough, if not, do split on index first.

add level

When there is no space in index tree, we have to add level
     [yyyyyy]1             [z]1               [z]
      /            -\      /          -\      /  \
 [xxx]     [xxx]   -/  [yyyyyy]2      -/  [yyyy] [yyy]
                          /                 / \
                     [xxx]    [xxx]     [xxx] [xxx]

The ext4 index's max level is 2 or 3 when largedir feature is enabled

Noted, when add new level, dx_entries in block1, "yyyyyyy", is copied to the new
block 2, then add a dx_entry into block 1 to point to block2.
This policy is also suitable for the case when level is 2. Because the root
block must be logical blk0 of directoy file.

Inline

How many space can used by inlined data in inode

ext4_inode.i_block[EXT4_N_BLOCKS] //__le32

#define    EXT4_NDIR_BLOCKS        12
#define    EXT4_IND_BLOCK            EXT4_NDIR_BLOCKS
#define    EXT4_DIND_BLOCK            (EXT4_IND_BLOCK + 1)
#define    EXT4_TIND_BLOCK            (EXT4_DIND_BLOCK + 1)
#define    EXT4_N_BLOCKS            (EXT4_TIND_BLOCK + 1)

60 bytes per inode

When to use inlined data

__ext4_new_inode()
---
    ei->i_inline_off = 0;
    if (ext4_has_feature_inline_data(sb) &&
        (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
        ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
---
ext4_init_new_dir()
---
    if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
        err = ext4_try_create_inline_dir(handle, dir, inode);
        ...
    }
---

inline -> block -> index

inode inline            block             index
   +---+                +----+            +----+
   |   |            -\  |dddd|    -\      |    |
   |ddd| } dentry   -/  |dddd|    -/      +----+
   +---+           [1]  +----+   [2]      /    \
                                       +----+ +----+
                                       |dddd| |dddd|
                                       +----+ +----+

[1] inline to block

ext4_try_add_inline_entry()
  -> ext4_convert_inline_data_nolock()
  ---
    map.m_lblk = 0;
    map.m_len = 1;
    map.m_flags = 0;
    error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
    ...
    data_bh = sb_getblk(inode->i_sb, map.m_pblk);
    ...
    lock_buffer(data_bh);
    error = ext4_journal_get_create_access(handle, data_bh);
    memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);

    if (!S_ISDIR(inode->i_mode)) {

        // The interesting thing is both the inode and data block need
        // to be recorded in log. Because they need to be completed in
        // one transaction

        memcpy(data_bh->b_data, buf, inline_size);
        set_buffer_uptodate(data_bh);
        error = ext4_handle_dirty_metadata(handle,
                           inode, data_bh);
    } else {

        // There are 3 steps here
        // (1) create '.' and '..' dentry
        // (2) copy the inlined dentries
        // (3) set the tail dentry of which rec_len covers the whole block

        error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
                               buf, inline_size);
    }
    unlock_buffer(data_bh);
  ---

[2] block to index

ext4_add_entry()
  -> make_indexed_dir()
  ---
    /* The 0th block becomes the root, move the dirents out */
    fde = &root->dotdot;
    de = (struct ext4_dir_entry_2 *)((char *)fde +
        ext4_rec_len_from_disk(fde->rec_len, blocksize));
    len = ((char *) root) + (blocksize - csum_size) - (char *) de;
    /* Allocate new block for the 0th block's dirents */
    bh2 = ext4_append(handle, dir, &block);
    ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
    data2 = bh2->b_data;

    memcpy(data2, de, len);
    memset(de, 0, len); /* wipe old data */
//Get the last dentry and make its rec_len cover the whole block
    de = (struct ext4_dir_entry_2 *) data2;
    top = data2 + len;
    while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
        de = de2;
    de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
                       (char *) de, blocksize);
    /* Initialize the root; the dot dirents already exist */
    de = (struct ext4_dir_entry_2 *) (&root->dotdot);
    de->rec_len = ext4_rec_len_to_disk(
            blocksize - ext4_dir_rec_len(2, NULL), blocksize);
    memset (&root->info, 0, sizeof(root->info));
    root->info.info_length = sizeof(root->info);
    if (ext4_hash_in_dirent(dir))
        root->info.hash_version = DX_HASH_SIPHASH;
    else
        root->info.hash_version =
                EXT4_SB(dir->i_sb)->s_def_hash_version;

    entries = root->entries;
// The first leaf block's offset is 1
    dx_set_block(entries, 1);
    dx_set_count(entries, 1);
    dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
    ...
    retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
    retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
// Split the leaf block as it is filled up
    de = do_split(handle,dir, &bh2, frame, &fname->hinfo);

    retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2);
  ---

Blocks

Buddy

Ext4 mballoc maintains a buddy system for every block group in memory.

Every block group needs two blocks (4K), which format is as following,

                      O0                   O1        O2   O3 
         |--------------------------|-------------|------|--|..|
         \____________ ____________/ \____________ ____________/
                      v                           v
                  block 0                      block 1
                  bd_bitmap                    bd_buddy

O0: bitmap for order 0
O1: bitmap for order 1
    ...
                      Extent (bit - off) << order, 1 << order)                >

The region for every order is defined in ext4_sb_info.s_mb_offsets/s_mb_max
which is calculated in ext4_mb_init()

mb_find_buddy() helps to find the bitmap address and range (max)
---
    /* at order 0 we see each particular block */
    if (order == 0) {
        *max = 1 << (e4b->bd_blkbits + 3);
        return e4b->bd_bitmap;
    }

    bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
    *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
---

How does this buddy system work ?

what

What's information stored in the buddy (except for order 0 bitmap)?

The freed and contiguous blocks of every order.

For example, if you have 128M blocks, then the highest order bitmap is cleared,
all of other are set.

If you have 64M, 32M, 16M, and 8M, then there will a single bit cleared in the
bitmap of those orders. All of other are set.

The buddy cache is initialized when load buddy,
ext4_mb_load_buddy_gfp()
  -> ext4_mb_init_group()
    -> ext4_mb_init_cache()
      -> memset(data, 0xff, blocksize);//See it ? default state is set
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
      -> ext4_mb_generate_buddy()
      ---
          i = mb_find_next_zero_bit(bitmap, max, 0);
        grp->bb_first_free = i;
        while (i < max) {
            fragments++;
            first = i;
            i = mb_find_next_bit(bitmap, max, i);
            len = i - first;
            free += len;
            if (len > 1)
                ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
            else
                grp->bb_counters[0]++;
            if (i < max)
                i = mb_find_next_zero_bit(bitmap, max, i);
        }
      ---

    Find the freed and contiguous blocks and mark them free in the buddy

find

Find by goal
By searching order 0, we can know whether the block has been allocated.
By searching order 1 and larger ones, we can know the max contiguous blocks that
contains the goal block.

A general function we need to know is,
mb_find_order_for_block()
---
    while (order <= e4b->bd_blkbits + 1) {
        bb = mb_find_buddy(e4b, order, &max);
        if (!mb_test_bit(block >> order, bb)) {
            /* this block is part of buddy of order 'order' */
            return order;
        }

        //check in ascending order

        order++;
    }
    return 0;
---

mb_find_extent() returns the range that contains  the goal block and required
length. It can carry multiple buddies of different size,

                        goal  required len
                         | ______^_________
                         v/                \
   |[x][x][x][x][x][x][x][-][-][-][-][-][-][-][-][-]|
     \___ ____/ \_ __/ |  | \___________ __________/
         v        v    v  v             v
         O2       O1   O0 O0           O3

[x] : allocated
[-] : freed

allocate

This is done by mb_mark_used(), every order of buddies that's involved will be set,
mb_mark_used()
---
    /* let's maintain buddy itself */
    while (len) {

        //Note, it return the order that has free blocks

        ord = mb_find_order_for_block(e4b, start);

        if (((start >> ord) << ord) == start && len >= (1 << ord)) {
            /* the whole chunk may be allocated at once! */
            mlen = 1 << ord;
            buddy = mb_find_buddy(e4b, ord, &max);
            mb_set_bit(start >> ord, buddy);
            start += mlen;
            len -= mlen;
            continue;
        }

        /* we have to split large buddy */
        // split by descending order
        // mb_find_order_for_block() in next loop won't return this
        // order any more, as it has been set

        buddy = mb_find_buddy(e4b, ord, &max);
        mb_set_bit(start >> ord, buddy);
        e4b->bd_info->bb_counters[ord]--;

        ord--;
        cur = (start >> ord) & ~1U;
        buddy = mb_find_buddy(e4b, ord, &max);
        mb_clear_bit(cur, buddy);
        mb_clear_bit(cur + 1, buddy);
    }
    mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);


    //Set the bits on the bitmap of order 0

    ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
---

The code works as following,
(1)
                        off     len
                         | ______^_________
                         v/                \
   |[x][x][x][x][x][x][x][-][-][-][-][-][-][-][-][-]|
     \___ ____/ \_ __/ |  | \___________ __________/
         v        v    v  v             v
         O2       O1   O0 O0           O3

    mb_find_order_for_block() return order 0, needn't split, offset is pushed forward

(2)
                            off   len
                             | ____^_____
                             v/          \
   |[x][x][x][x][x][x][x][x][-][-][-][-][-][-][-][-]|
     \___ ____/ \_ __/ |  | \___________ __________/
         v        v    v  v             v
         O2       O1   O0 O0           O3

    mb_find_order_for_block() return order 3, need to split.
    The entry of off in order 3 is set, so in next loop,
    mb_find_order_for_block() can only return order 2

(3)
                            off   len
                             | ____^_____
                             v/          \
   |[x][x][x][x][x][x][x][x][-][-][-][-][-][-][-][-]|
     \___ ____/ \_ __/ |  | \____ ____/ \____ ____/
         v        v    v  v      v           v
         O2       O1   O0 O0    O2          O2

    mb_find_order_for_block() return order 2, needn't to split, off is pushed forward

(4) 
                                        off
                                         |
                                         v
   |[x][x][x][x][x][x][x][x][x][x][x][x][-][-][-][-]|
     \___ ____/ \_ __/ |  | \____ ____/ \____ ____/
         v        v    v  v      v           v
         O2       O1   O0 O0    O2          O2

    mb_find_order_for_block() return order 2, need to split
    The entry of off in order 2 is set, mb_find_order_for_block() will return
    order 1 in next loop.

(5)
                                        off
                                         |
                                         v
   |[x][x][x][x][x][x][x][x][x][x][x][x][-][-][-][-]|
     \___ ____/ \_ __/ |  | \____ ____/ \_ _/  \_ _/
         v        v    v  v      v        v      v
         O2       O1   O0 O0    O2       O1     O1
    mb_find_order_for_block() return order 1, need to split again
    The entry of off in order 1 is set, mb_find_order_for_block() will return
    order 0 in next loop.

(6)
                                        off
                                         |
                                         v
   |[x][x][x][x][x][x][x][x][x][x][x][x][-][-][-][-]|
     \___ ____/ \_ __/ |  | \____ ____/  |  | \_ _/
         v        v    v  v      v       v  v   v
         O2       O1   O0 O0    O2      O0  O0  O1

    mb_find_order_for_block() return order 0, needn't to split, off is pushed forward

free

Too lazy to read the code here. To be continued later

flush

Allocate
ext4_mb_new_blocks()
---
    if (!ext4_mb_use_preallocated(ac)) {
        ac->ac_op = EXT4_MB_HISTORY_ALLOC;
        ext4_mb_normalize_request(ac, ar);
        ...

        // do allocation in mb

        *errp = ext4_mb_regular_allocator(ac);
        ...
    }
    if (likely(ac->ac_status == AC_STATUS_FOUND)) {
        *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
    }

---
Free
ext4_free_blocks()
---
    bitmap_bh = ext4_read_block_bitmap(sb, block_group);
    gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
    err = ext4_journal_get_write_access(handle, bitmap_bh);
    err = ext4_journal_get_write_access(handle, gd_bh);
    ...
    err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
                     GFP_NOFS|__GFP_NOFAIL);
    if (ext4_handle_valid(handle) &&
        ((flags & EXT4_FREE_BLOCKS_METADATA) ||
         !ext4_should_writeback_data(inode))) {
        ...
        ext4_lock_group(sb, block_group);

        //Set in the bitmap on disk

        mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
        ext4_mb_free_metadata(handle, &e4b, new_entry);
    }

    ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
    ext4_free_group_clusters_set(sb, gdp, ret);
    ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
    ext4_group_desc_csum_set(sb, block_group, gdp);
    ext4_unlock_group(sb, block_group);
    ...
    ext4_mb_unload_buddy(&e4b);
    ...
    err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
    ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
---

Pre Allocation

What is ext4 preallocation for ?

The main goal is to provide better allocation for small and large files.
This is achieved by using a different strategy for different allocation
requests. For a relatively small allocation request, Ext4 tries to allocate
from a per-CPU locality group, which is shared by all allocations under
the same CPU, in order to try to keep these small files close to each other.
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A large allocation request is allocated from per-file preallocation first.

How to determine the size of preallocation

There are two steps,
(1) decide the goal allocation length
ext4_mb_normalize_request()
---
    if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
        ext4_mb_normalize_group_request(ac);
        return ;
    }

    // The code to decide length of per-file preallocation is a bit long, so
    // don't share it here. The basic principle is take the 1st one larger than
    // what you want,
    // 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M, what you want

---
(2) allocate from mb
ext4_mb_regular_allocator()
---
    /*
     * ac->ac_2order is set only if the fe_len is a power of 2
     * if ac->ac_2order is set we also set criteria to 0 so that we
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     * try exact allocation using buddy.
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     */
    i = fls(ac->ac_g_ex.fe_len);
    ac->ac_2order = 0;
    /*
     * We search using buddy data only if the order of the request
     * is greater than equal to the sbi_s_mb_order2_reqs
     * You can tune it via /sys/fs/ext4//mb_order2_req
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     * We also support searching for power-of-two requests only for
     * requests upto maximum buddy size we have constructed.
     */
    if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
        if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
            ac->ac_2order = array_index_nospec(i - 1,
                               MB_NUM_ORDERS(sb));
    }
---

ext4_mb_simple_scan_group()
searching the buddy, if there is available one,
ext4_mb_use_best_found()
---

    //Note, it will use goal length instead of the one get from buddy

    ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
    ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
    ret = mb_mark_used(e4b, &ac->ac_b_ex);
    ...
    /*
     * As we've just preallocated more space than
     * user requested originally, we store allocated
     * space in a special descriptor.
     */
    if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
        ext4_mb_new_preallocation(ac);
---

How to decide use per-cpu and per-inode

If EXT4_MB_HINT_GROUP_ALLOC is set, employ per-cgroup locality group alloc
ext4_mb_group_or_file()
---
    size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
    isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
        >> bsbits;

    if ((size == isize) && !ext4_fs_is_busy(sbi) &&
        !inode_is_open_for_write(ac->ac_inode)) {
        ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
        return;
    }

    // /sys/fs/ext4/xxx/mb_group_prealloc

    if (sbi->s_mb_group_prealloc <= 0) {
        ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
        return;
    }

    /* don't use group allocation for large files */
    // The block allocation happens when .writepages. Writing has been
    // aggregated through delayed-allocation.

    size = max(size, isize);
    if (size > sbi->s_mb_stream_request) {
        ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
        return;
    }
    /*
     * locality group prealloc space are per cpu. The reason for having
     * per cpu locality group is to reduce the contention between block
     * request from multiple CPUs.
     */
    ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);

    /* we're going to use group allocation */
    ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;

    /* serialize all allocations in the group */
    mutex_lock(&ac->ac_lg->lg_mutex);
---

Preallocation in-core or on-disk

Preallocation is only in-core, not on-disk.

When the allocation completes, only the spaces used by file is flushed to disk,
ext4_mb_new_blocks()
  -> ext4_mb_mark_diskspace_used()
  ---
    ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
              ac->ac_b_ex.fe_len);
  ---

  Don't worry, this is not the one used in ext4_mb_use_best_found()
  It has been adapted,
  ext4_mb_use_best_found()
    -> ext4_mb_new_preallocation()
      -> ext4_mb_new_inode_pa()
        -> ext4_mb_use_inode_pa()
        ---
        /* found preallocated blocks, use them */
        start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
        end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
              start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
        len = EXT4_NUM_B2C(sbi, end - start);
        ac->ac_b_ex.fe_len = len;
        ac->ac_status = AC_STATUS_FOUND;
        ac->ac_pa = pa;
        ---

Tmpfs

Everything in tmpfs is temporary in the sense that no files will be
created on your hard drive. The files live in memory and swap
space. If you unmount a tmpfs instance, everything stored therein is
lost.

File

An in-core inode with specific operations.
See shmem_get_inode()

An important thing is the dentry of this inode will be pined in memory
shmem_mknod()
---
        inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
        if (inode) {
                ...
                dir->i_size += BOGO_DIRENT_SIZE;
                dir->i_ctime = dir->i_mtime = current_time(dir);
                d_instantiate(dentry, inode);
                dget(dentry); /* Extra count - pin the dentry in core */
        }
---

The dentry for the inode will be perserved in dcache until it is deleted

hard link

An in-core dentry points to the linked inode

shmem_link()
---
        dir->i_size += BOGO_DIRENT_SIZE;
        inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
        inc_nlink(inode);
        ihold(inode);        /* New dentry reference */
        dget(dentry);                /* Extra pinning count for the created dentry */
        d_instantiate(dentry, inode);
---

Xfs Log

Let's take some relatively simple examples,

inode update log

log format (record)
xfs_inode_item_ops.xfs_inode_item_format()
    -> xfs_inode_item_format_core()
        -> xfs_inode_to_log_dinode_ts()
            -> copy the in-memory inode to xfs_log_dinode

Note ! the modifications are not applied on the inode cluster xfs buffer.
log push (checkpoint)
xfs_inode_item_push()
---
        if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) ||
                (ip->i_flags & XFS_ISTALE))
                return XFS_ITEM_PINNED;

        if (xfs_iflags_test(ip, XFS_IFLUSHING))
                return XFS_ITEM_FLUSHING;

        // cluster buffer need to be locked during IO

        if (!xfs_buf_trylock(bp))
                return XFS_ITEM_LOCKED;

        spin_unlock(&lip->li_ailp->ail_lock);

        xfs_buf_hold(bp);

        // flush the dirty inodes into cluster buffer

        error = xfs_iflush_cluster(bp);
        if (!error) {

        // queue the cluster buffer to delayed write queue

                if (!xfs_buf_delwri_queue(bp, buffer_list))
                        rval = XFS_ITEM_FLUSHING;
                xfs_buf_relse(bp);
        }

---

xfs buf log

log format (record)
xfs_item_ops.xfs_buf_item_format()
    -> xfs_buf_item_format_segment()
    ---
        for (;;) {

                //The bitmap here describes which 128 bytes chunks of the buffer have
                // been dirtied. This is done by
                // xfs_trans_log_buf()
                //     -> xfs_buf_item_log()
                //         -> xfs_buf_item_log_segment()_

                next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
                                        (uint)last_bit + 1);
                if (next_bit == -1) {
                        xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
                                                first_bit, nbits);
                        blfp->blf_size++;
                        break;
                } else if (next_bit != last_bit + 1 ||
                                     xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
                        xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
                                                first_bit, nbits);
                        blfp->blf_size++;
                        first_bit = next_bit;
                        last_bit = next_bit;
                        nbits = 1;
                } else {
                        last_bit++;
                        nbits++;
                }
        }

See it !? modifications are made into xfs buffer directly.
And we will see that the push just add the xfs buffer into the del write queue.

    ---
log push(checkpoint)
xfsaild_push()
    -> xfsaild_push_item()
        -> lip->li_ops->iop_push()
             xfs_buf_item_push()
             ---
                if (xfs_buf_ispinned(bp))
                        return XFS_ITEM_PINNED;
                if (!xfs_buf_trylock(bp)) {
                        if (xfs_buf_ispinned(bp))
                                return XFS_ITEM_PINNED;
                        return XFS_ITEM_LOCKED;
                }

                if (!xfs_buf_delwri_queue(bp, buffer_list))

                        rval = XFS_ITEM_FLUSHING;
                xfs_buf_unlock(bp);
             ---
    -> xfs_buf_delwri_submit_nowait()
        -> xfs_buf_delwri_submit_buffers()
        ---
        list_sort(NULL, buffer_list, xfs_buf_cmp);

        blk_start_plug(&plug);
        list_for_each_entry_safe(bp, n, buffer_list, b_list) {

        // The buf could be pinned and locked by new writing
        // During the whole IO, the xfs buf is locked.
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

                if (!wait_list) {
                        if (xfs_buf_ispinned(bp)) {
                                pinned++;
                                continue;
                        }
                        if (!xfs_buf_trylock(bp))
                                continue;
                }
                ...
                bp->b_flags &= ~_XBF_DELWRI_Q;
                bp->b_flags |= XBF_WRITE;
                if (wait_list) {
                        ...
                } else {
                        bp->b_flags |= XBF_ASYNC;
                        list_del_init(&bp->b_list);
                }
                __xfs_buf_submit(bp, false);

                // bio->bi_end_io = xfs_buf_bio_end_io

        }
        blk_finish_plug(&plug);
        ---

xfs_buf_bio_end_io()
    -> xfs_buf_ioend_async()
        -> queue_work xfs_buf_ioend_work()
            -> xfs_buf_ioend()
                -> xfs_buf_item_done() //log item is freed here under xfs buf lock
                -> xfs_buf_relse()
                    -> xfs_buf_unlock()

Framework

iclog ring and state machine

            .-->                                                                             
         /     ---
             /         \
            | iclog |
             \         /
                 ---    /
                    <--''                                            -->
|------------------------------------------------|
                     physical log space

The iclog has two parameters,
l_iclog_bufs            8
l_iclog_size            32K (max 256K)

iclog is allocated in xlog_alloc_log()
---
        for (i = 0; i < log->l_iclog_bufs; i++) {
                int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
                size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
                                sizeof(struct bio_vec);

                iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
                ...        
                iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
                                                KM_MAYFAIL | KM_ZERO);
                ...
                iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
                iclog->ic_state = XLOG_STATE_ACTIVE;
                ...

                // link all of the iclog in a ring

                iclogp = &iclog->ic_next;
        }
---

The iclog in the ring will be employed one by one.
And every iclog has a state machine works as following,

XLOG_STATE_ACTIVE Be able to receive log
    - xlog_alloc_log()
    - xlog_state_do_callback()
            -> xlog_state_clean_iclog()
                -> xlog_state_activate_iclogs()
                    -> xlog_state_activate_iclog()

XLOG_STATE_WANT_SYNC
    - xlog_state_switch_iclogs()
    
XLOG_STATE_SYNCING
    - xlog_state_release_iclog()
            -> __xlog_state_release_iclog()
            -> xlog_sync()

XLOG_STATE_DONE_SYNC
    - xlog_ioend_work()
        -> xlog_state_done_syncing()

XLOG_STATE_CALLBACK
    - xlog_state_done_syncing()
        -> xlog_state_do_callback()

Transaction Format

                                 XLOG_REG_TYPE_TRANSHDR                    XLOG_REG_TYPE_COMMIT
                                    |                                                                |
< oph >< oph >< reg0 >< oph >< reg1 >< oph >...< regn >
     |                                                        |
    XLOG_START_TRANS                         XLOG_REG_TYPE_ICORE

IO Format

The transaction above is carried by iclog when sent out to disk.

Note !!! iclog is just used when issue IO. The log item is carried in anther place.
xlog_write()
---
                        if (copy_len > 0) {

                                // The reg here is a xfs_log_iovec
                                // It will copy the data from xfs_log_iovec to iclog buffer

                                memcpy(ptr, reg->i_addr + copy_off, copy_len);
                                xlog_write_adv_cnt(&ptr, &len, &log_offset,
                                                     copy_len);
                        }
---

The buffer carries log is allocated here,
Refer to xlog_cil_alloc_shadow_bufs() / xlog_cil_iovec_space()

The buffer for iclog contains header and payload, the header's layout is
xlog_rec_header_t.

Every iclog contains a crc num.
xlog_sync()
---
        /* calculcate the checksum */
        iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
                                                iclog->ic_datap, size)
---

Transaction Durability

In theory, when the commit record IO is completed, xlog guarantee that the whole
transaction has been on disk. How to implement this ?
There are two points here:

 commit callback
The ctx is installed on the iclog->ic_callbacks where the commit record is written
xlog_cil_push_work()
---
        list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks);
---
xlog_state_do_callback() would ensure the commit callback is invoked by the order of iclog.

This indicates that even if the commit record is not on the same iclog with its
records (must be the one behind),    when the callback of commit record is invoked,
its records's IO have been completed.


xlog_state_do_callback()
---
                do {

                                                     ------------------->
                                                                                                    Loop start here
                                                                                                         ACTIVE
                                                                                                    Current Head     ACTIVE        ACTIVE     ACTIVE
                                                                                                                |
                                                                                                                v
                        // iclog0 -> iclog1 -> iclog2 -> iclog3 -> iclog4 -> iclog5 -> iclog6 -> iclog7
                                    ^                ^                 ^                 ^
                                    |                |                 |                 |
                             SYNCING     SYNC_DONE    SYNC_DONE SYNCING
                             [record]        [commit]

                        if (xlog_state_iodone_process_iclog(log, iclog,
                                                        &ioerror))
                                break;
                        ...
                        xlog_state_do_iclog_callbacks(log, iclog);
                        if (XLOG_FORCED_SHUTDOWN(log))
                                wake_up_all(&iclog->ic_force_wait);
                        else
                                xlog_state_clean_iclog(log, iclog);
                        iclog = iclog->ic_next;
                } while (first_iclog != iclog);
---
 REQ_FUA
xlog_write_iclog()
---
        iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
                                REQ_IDLE | REQ_FUA;
---

Relog

Quote from https://www.infradead.org/~mchehab/kernel_docs/filesystems/xfs-delayed-logging-design.html

XFS allows multiple separate modifications to a single object to be carried in the log at any given time.
This allows the log to avoid needing to flush each change to disk before recording a new change to the object.
XFS does this via a method called “re-logging”. Conceptually, this is quite simple - all it requires is
that any new change to the object is recorded with a new copy of all the existing changes in the new
transaction that is written to the log.

Regarding to the comment with underline, we could refer to the implementation of jbd2,

jbd2 could be deemed as a WAL in blocks, namely, before flush the dirty blocks to real
position on disk, jbd2 would record them on journal first. In common case, jbd2 would
shadow the original buffer_head to do the journal IO.

jbd2_journal_write_metadata_buffer()
---
        spin_lock(&jh_in->b_state_lock);
repeat:
        if (jh_in->b_frozen_data) {
                ...
        } else {
                new_page = jh2bh(jh_in)->b_page;
                new_offset = offset_in_page(jh2bh(jh_in)->b_data);
        }
        ...
        set_bh_page(new_bh, new_page, new_offset);
        new_bh->b_size = bh_in->b_size;
        new_bh->b_bdev = journal->j_dev;
        new_bh->b_blocknr = blocknr;
        new_bh->b_private = bh_in;
        set_buffer_mapped(new_bh);
        set_buffer_dirty(new_bh);

        *bh_out = new_bh;

        spin_lock(&journal->j_list_lock);
        __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
        spin_unlock(&journal->j_list_lock);

        set_buffer_shadow(bh_in);

        spin_unlock(&jh_in->b_state_lock); //Protect this journal buffer head
---

do_get_write_access()
---
        spin_lock(&jh->b_state_lock);
        ...
        if (buffer_shadow(bh)) {
                spin_unlock(&jh->b_state_lock);
                wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }
        ...
---

A very important thing need to be noted is that the modification has been made in
buffer_head which is the cache of the disk.

Deferred operations

Deferred Operations is a very bad name and could mislead the readers.
IMO, it should be called Big Transaction. The deferred operations,
cooperating with intent log, could split a complicated transaction into
multiple small transactions and still keep the atomicity of the original
transaction. Look at the following example,

To complete T, we need following multiple operations,
T_A, T_B, T_C, T_D. And each of them need 3 sub-operations.

Deferred Operations would complete this work as following,

Intent log for T_A \
Intent log for T_B    \ t0
Intent log for T_C    /
Intent log for T_D /
-------------------
Done log for T_A    \
Real log for T_A0    \ t1
Real log for T_A1    /
Real log for T_A2 /
-------------------
Done log for T_B    \
Real log for T_B0    \ t2
Real log for T_B1    /
Real log for T_B2 /
-------------------
Done log for T_C    \
Real log for T_C0    \ t4
Real log for T_C1    /
Real log for T_C2 /
-------------------
Done log for T_D    \
Real log for T_D0    \ t5
Real log for T_D1    /
Real log for T_D2 /
-------------------

The Intent log could guarantee the whole big transaction's atomicity.

xfs_defer_finish_noroll() is to carry out the work,

xfs_defer_finish_noroll()
---
        /* Until we run out of pending work to finish... */
        while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {

                //Create intent log for every deferred_operations

                xfs_defer_create_intents(*tp);
                list_splice_init(&(*tp)->t_dfops, &dop_pending);

                //Roll the transaction

                error = xfs_defer_trans_roll(tp);
                ...
                dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
                                             dfp_list);

                //Pick a defered operation and finish it
                // - create done
                // - finish_item do the real work

                error = xfs_defer_finish_one(*tp, dfp);
        }
---

log space


            tail                 head
     cycle=100         cycle=100 
                |                     |
                v                     v
|-------xxxxxxxxxxxx------------|



            head                            tail
     cycle=101                 cycle=100 
                |                                |
                v                                v
|xxxxxxx-----------------xxxxxxx|

Why xfs need two log head ?

It is related to XFS_TRANS_PERM_LOG_RES,
This kind of transaction could be rolled by multiple times.
Such as

#define        XFS_SYMLINK_LOG_COUNT                3
#define        XFS_REMOVE_LOG_COUNT                2
#define        XFS_LINK_LOG_COUNT                2
#define        XFS_RENAME_LOG_COUNT                2
#define        XFS_WRITE_LOG_COUNT                2
#define        XFS_WRITE_LOG_COUNT_REFLINK        8

A reflink transaction could be split into 8 sub-transactions.
If one sub-transaction need T bytes log space, we need
(1) reserve 8 * T log space on l_reserve_head,
(2) reserve T log space when transaction is rolled.

Refer to the code in xfs_trans_reserve()
---

        // rolled transaction

        if (tp->t_ticket != NULL) {
                        ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
                        error = xfs_log_regrant(mp, tp->t_ticket);
                } else {
                        error = xfs_log_reserve(mp,
                                                resp->tr_logres,
                                                resp->tr_logcount,
                                                &tp->t_ticket, XFS_TRANSACTION,
                                                permanent);
                }

---

The log space reserved here is always surplus.
There are two value in ticket, t_curr_res and t_unit_res.

xlog_cil_alloc_shadow_bufs() will adapt the t_curr_res
based on the real situation and then give the left back in 
xfs_log_commit_cil()
    -> xfs_log_ticket_regrant/ungrant()


The log space will given back after AIL commit with push the lsn tail.

lsn of a transaction

In software,
xfs_cil_ctx represents a transaction.
xfs_log_vec represents a log item and its space in memory. (li_lv and li_lv_shadow)
lsn                 represents cycle << 32 | log block number //>>

There are two lsn for a Transaction
(1) start lsn
xlog_cil_push_work()
    -> xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true)
    ---
                /* start_lsn is the first lsn written to. That's all we need. */
                if (!*start_lsn)
                        *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
    ---

(2) commit lsn
xlog_cil_push_work()
    -> xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);

The io completion callback of log block (iclog) is invoked in the order of lsn.
And the sign of successful committing of a transaction is the commit lsn (the log block) has been on disk

release of log space

The releasing of log space is through pushing the tail lsn forward.
The tail lsn is actually the minimum of start lsn of non-applied log.

The tail lsn could be modified in two ways,
(1) relog
xfs_trans_committed()

        // See it ? ctx>start_lsn is used here !!!

    -> xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort)
        -> xfs_log_item_batch_insert()
            -> xfs_trans_ail_update_bulk()
            ---
        for (i = 0; i < nr_items; i++) {
                struct xfs_log_item *lip = log_items[i];

                // Has been on AIL list, relog case !!!
                // It also inicates that the previous log has been on disk !!!

                if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
                        /* check if we really need to move the item */
                        if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
                                continue;


                        // relog would release its previous log space !!!

                        if (mlip == lip && !tail_lsn)
                                tail_lsn = lip->li_lsn;


                        // remove from the ail list and re-insert it later

                        xfs_ail_delete(ailp, lip);
                } else {
                        trace_xfs_ail_insert(lip, 0, lsn);
                }
                lip->li_lsn = lsn;
                list_add(&lip->li_ail, &tmp);
        }


        // all the log items uses the same lsn, namely, the start lsn of the transaction

        if (!list_empty(&tmp))
                xfs_ail_splice(ailp, cur, &tmp, lsn);

        xfs_ail_update_finish(ailp, tail_lsn);
            ---

(2) checkpoint
xfs_buf_ioend()
    -> xfs_buf_item_done()
        -> xfs_trans_ail_delete()
            -> xfs_ail_delete_one()
            ---

                // The minimum one is the head of the ailp->ail_head

                struct xfs_log_item        *mlip = xfs_ail_min(ailp);
                xfs_lsn_t                lsn = lip->li_lsn;

                xfs_ail_delete(ailp, lip);
                clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
                lip->li_lsn = 0;

                if (mlip == lip)
                        return lsn;
                return 0;
            ---
            -> xfs_ail_update_finish()

Why there is only one xfs_buf_ioend() here ?
Look at the definition of iop_push, only inode, dquot and buf define it.
This indicates that most of the transactions are made of xfs buf updating.

And inode updating would be applied on xfs buf finally.
Refer to
xfs_inode_item_push()
    -> xfs_iflush_cluster()
    -> xfs_buf_delwri_queue()

Xfs Inode

Inode management

The inode number if xfs is composed with 3 parts,

             AG number                Bn in AG             In in B
    |----------------|---------------|-------------|

Bn : block number in an AG
In : inode number in a block (a block could carry multiple inodes)

This inode number has telled us the position of the inode on disk.
xfs's inodes are dynamically allocated instead of preallocating in static
position like ext4.
How to allocate in dynamical way ?
Allocate a block in that AG !
^^^^^^^^^^^^^^^^^^^^^^^^^^^ Then create a xfs_inobt_rec as following,

struct xfs_inobt_rec {
        __be32     ir_startino;    //start ino num of this chunk
        __be32     ir_freecount; //number of free inodes
        __be64     ir_free; //bitmap

}

and insert it into the AG inode b+tree

xfs buf

xfs uses xfs_buf to manage its metadata instead of using vfs pagecache.

allocate

xfs_buf_get_map()
---
        error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
        if (!error)
                goto found;
        if (error != -ENOENT)
                return error;

        error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);

        // sema_init(&bp->b_sema, 0); /* held, no waiters */


        error = xfs_buf_allocate_memory(new_bp, flags);
        ---

        /*
         * for buffers that are contained within a single page, just allocate
         * the memory from the heap - there's no need for the complexity of
         * page arrays to keep allocation down to order 0.
         */

        size = BBTOB(bp->b_length);
        if (size < PAGE_SIZE) {
                int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
                bp->b_addr = kmem_alloc_io(size, align_mask,
                                             KM_NOFS | kmflag_mask);
                ...
                bp->b_offset = offset_in_page(bp->b_addr);
                bp->b_pages = bp->b_page_array;
                bp->b_pages[0] = kmem_to_page(bp->b_addr);
                bp->b_page_count = 1;
                bp->b_flags |= _XBF_KMEM;
                return 0;
        }

use_alloc_page:
        ...
        for (i = 0; i < bp->b_page_count; i++) {
                struct page        *page;
                uint                retries = 0;
retry:
                page = alloc_page(gfp_mask);
                ...
                nbytes = min_t(size_t, size, PAGE_SIZE - offset);
                size -= nbytes;
                bp->b_pages[i] = page;
                offset = 0;
        }

        ---

        // Do insert if new_bp is not NULL

        error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
        ...
found:
        if (!bp->b_addr) {
                error = _xfs_buf_map_pages(bp, flags);
                ...
        }
---

lookup

xfs_buf_find()
---
        pag = xfs_perag_get(btp->bt_mount,
                                xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));

        spin_lock(&pag->pag_buf_lock);
        bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
                                        xfs_buf_hash_params);
        if (bp) {
                atomic_inc(&bp->b_hold);
                goto found;
        }
        ...
found:
        spin_unlock(&pag->pag_buf_lock);
        xfs_perag_put(pag);

        // We need to return a locked xfs_buf here !!!

        if (!xfs_buf_trylock(bp)) {
                xfs_buf_lock(bp);
                XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
        }

---

read from disk

xfs_buf_read_map()
---
        error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
        ...
        if (!(bp->b_flags & XBF_DONE)) {
                /* Initiate the buffer read and wait. */
                bp->b_ops = ops;
                error = _xfs_buf_read(bp, flags);
                    -> xfs_buf_submit()
                        -> __xfs_buf_submit() // wait = !(bp->b_flags & XBF_ASYNC)
                        ---

                        /*
                            * Grab a reference so the buffer does not go away underneath us. For
                            * async buffers, I/O completion drops the callers reference, which
                            * could occur before submission returns.
                            */

                        xfs_buf_hold(bp);
                        ...
                        _xfs_buf_ioapply(bp);
                        ...
                        if (wait)
                                error = xfs_buf_iowait(bp);

                        xfs_buf_rele(bp);
                        ---

                /* Readahead iodone already dropped the buffer, so exit. */
                if (flags & XBF_ASYNC)
                        return 0;
        }
---


In non-readahead case, we have to wait the buffer to be read in.


When read completes, xfs would verify it.
xfs_buf_ioend()
---
        if (bp->b_flags & XBF_READ) {
                if (!bp->b_error && bp->b_ops)
                        bp->b_ops->verify_read(bp);
        }
---

log xfs log__xfs buf log

write to disk

The xfs buf being written to disk must have been checkpointed.
During the IO, the xfs_buf is locked. Nobody can touch it.
xfs_buf_delwri_submit_buffers()
    -> xfs_buf_lock()
    -> __xfs_buf_submit() // wait is false
         ---

        // buf has been locked, after unpin, nobody can pin it any more.
        // the modifications in buf should have been in log.

        if (bp->b_flags & XBF_WRITE)
                xfs_buf_wait_unpin(bp);
                ---
                if (atomic_read(&bp->b_pin_count) == 0)
                        return;

                add_wait_queue(&bp->b_waiters, &wait);
                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&bp->b_pin_count) == 0)
                                break;
                        io_schedule();
                }
                remove_wait_queue(&bp->b_waiters, &wait);
                set_current_state(TASK_RUNNING);
                ---
         ---

How to ensure the metadata to be on disk instead of disk caching, after release
the log space ?

xlog_sync()
---

        /*
         * Flush the data device before flushing the log to make sure all meta
         * data written back from the AIL actually made it to disk before
         * stamping the new log tail LSN into the log buffer.    For an external
         * log we need to issue the flush explicitly, and unfortunately
         * synchronously here; for an internal log we can simply use the block
         * layer state machine for preflushes.
         */

        if (log->l_targ != log->l_mp->m_ddev_targp || split) {
                xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
                need_flush = false;
        }

        xlog_verify_iclog(log, iclog, count);
        xlog_write_iclog(log, iclog, bno, count, need_flush);
---

reclaim

xfs_buf_rele()
---
        spin_lock(&bp->b_lock);
        release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
        if (!release) {
                if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
                        __xfs_buf_ioacct_dec(bp);
                goto out_unlock;
        }
        ...

        // insert into the lru list

        if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
                if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
                        bp->b_state &= ~XFS_BSTATE_DISPOSE;
                        atomic_inc(&bp->b_hold);
                }
                spin_unlock(&pag->pag_buf_lock);
        } 
---

xfs_buftarg_shrink_scan()
---
        freed = list_lru_shrink_walk(&btp->bt_lru, sc,
                                         xfs_buftarg_isolate, &dispose);

        while (!list_empty(&dispose)) {
                struct xfs_buf *bp;
                bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
                list_del_init(&bp->b_lru);
                xfs_buf_rele(bp);
        }
        freed = list_lru_shrink_walk(&btp->bt_lru, sc,
                                         xfs_buftarg_isolate, &dispose);

        while (!list_empty(&dispose)) {
                struct xfs_buf *bp;
                bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
                list_del_init(&bp->b_lru);
                xfs_buf_rele(bp);
        }
---

story of b_hold

The story of b_hold
[0] initial value is 1
        _xfs_buf_alloc()
        ---
                atomic_set(&bp->b_hold, 1);
        ---
[1] xfs_buf_find() get and lock
        ---
        spin_lock(&pag->pag_buf_lock);
        bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
                                        xfs_buf_hash_params);
        if (bp) {
                atomic_inc(&bp->b_hold);
                goto found;
        }
        ...
found:
        spin_unlock(&pag->pag_buf_lock);
        xfs_perag_put(pag);

        if (!xfs_buf_trylock(bp)) {
                if (flags & XBF_TRYLOCK) {
                        xfs_buf_rele(bp);
                        XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
                        return -EAGAIN;
                }
                xfs_buf_lock(bp);
                XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
        }

        ---

There are mainly two ways to release this reference
a. xfs_log_commit_cil()
         -> xfs_trans_free_items()
             -> iop_unlock()
                    xfs_buf_item_unlock()
                        -> xfs_buf_relse()

b. xfs_trans_brelse()
         -> xfs_trans_del_item()
         -> xfs_buf_relse()

     release a buf if didn't dirty it

[2] buf log item

The buf log item holds a reference of the xfs_buf
xfs_buf_item_init()
    -> xfs_buf_hold()

This reference will be released in xfs_buf_iodone()
xfs_buf_iodone()
---
        xfs_buf_rele(bp);
        spin_lock(&ailp->ail_lock);
        xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
        xfs_buf_item_free(BUF_ITEM(lip));
---

[3] delwri_queue
xfs_buf_delwri_queue()
---
        bp->b_flags |= _XBF_DELWRI_Q;
        if (list_empty(&bp->b_list)) {
                atomic_inc(&bp->b_hold);
                list_add_tail(&bp->b_list, list);
        }
---

get lock in xfs_buf_delwri_submit_buffers()
xfs_buf_iodone_callbacks()
    -> xfs_buf_ioend()
        -> xfs_buf_relse()
             unlock and release ref

xfs bmap

xfs bmap unwritten

allocated block extent has two state in xfs

Normal, XFS_EXT_NORM

Unwritten, XFS_EXT_UNWRITTEN

The meaning of the state is as the name shows
When the block extent is newly allocated, it is unwritten.

xfs_bmapi_allocate()
---

    if (bma->flags & XFS_BMAPI_PREALLOC)
        bma->got.br_state = XFS_EXT_UNWRITTEN;

    if (bma->wasdel)
        error = xfs_bmap_add_extent_delay_real(bma, whichfork);
---

The two main allocation paths,

writepages

xfs_map_blocks()
  -> xfs_convert_blocks()
    -> xfs_bmapi_convert_delalloc()
    ---
    bma.flags = XFS_BMAPI_PREALLOC;
    ...
    error = xfs_bmapi_allocate(&bma);
    ---

fallocate

xfs_file_fallocate()
---
        if (!xfs_is_always_cow_inode(ip)) {
            error = xfs_alloc_file_space(ip, offset, len,
                             XFS_BMAPI_PREALLOC);
        }
---

The unwritten state can influence the xfs in following ways,

Read, just return zero

iomap_readpage_actor()
---
    if (iomap_block_needs_zeroing(inode, iomap, pos)) {
        zero_user(page, poff, plen);
        iomap_set_range_uptodate(page, poff, plen);
        goto done;
    }
---
iomap_block_needs_zeroing()
---
    return iomap->type != IOMAP_MAPPED ||
        (iomap->flags & IOMAP_F_NEW) ||
        pos >= i_size_read(inode);
---

Write, needn't to do read

__iomap_write_begin()
---

    // iomap->type != IOMAP_MAPPED means needs zeroing

   if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
        if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
            return -EIO;
        zero_user_segments(page, poff, from, to, poff + plen);
    } else {
        int status = iomap_read_page_sync(block_start, page,
                poff, plen, srcmap);
        if (status)
            return status;
    }
---

Discard

xfs_bmap_del_extent_real()
---
    if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
        if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
            xfs_refcount_decrease_extent(tp, del);
        } else {
            __xfs_bmap_add_free(tp, del->br_startblock,
                    del->br_blockcount, NULL,
                    (bflags & XFS_BMAPI_NODISCARD) ||
                    del->br_state == XFS_EXT_UNWRITTEN);
        }
    }

    // If the extent is UNWRITTEN, the skip_discard is true

---

After write IO completes, xfs will convert unwritten to normal state

xfs_end_bio()
  -> queue_work i_ioend_work

xfs_end_io()
  -> xfs_end_ioend()
    -> xfs_iomap_write_unwritten()
    ---
        error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
                    XFS_BMAPI_CONVERT, resblks, &imap,
                    &nimaps);
    ---
       -> xfs_bmapi_convert_unwritten()