XFS

Contents: block map, per-AG metadata buffer, log, AG free space management,
AG free inode management, B-tree, reflink

block map

enum {
    NODE_SIZE    = 256,
    KEYS_PER_NODE    = NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
    RECS_PER_LEAF    = (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
                sizeof(struct xfs_iext_rec),
};

On a 64-bit system, KEYS_PER_NODE is 256 / (8 + 8) = 16 and RECS_PER_LEAF is
(256 - 2 * 8) / 16 = 15. The diagram below is simplified to two keys per node
and one record per leaf.

           0   2048
           +--+--+--+--+
           |K0|K1|P0|P1|
           +--+--+--+--+
     0   1024     /   \2048 3072
     +--+--+--+--+     +--+--+--+--+
     |K0|K1|P0|P1|     |K0|K1|P0|P1|
     +--+--+--+--+     +--+--+--+--+
    0      /     \1024
    +--+-+-+     +--+-+-+
    |R0|p|n| <-> |R0|p|n|
    +--+-+-+     +--+-+-+
            /
            | startoff    (offset in the file)
    record <  startblock  (filesystem block number on disk)
            | length
            | flags
            \


Each KEY is the smallest startoff value in the subtree below the corresponding
pointer, i.e. the keys are file offsets.
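As a minimal illustration of that key rule (a hypothetical helper, not the
kernel's lookup code), descending the tree means picking the child behind the
last key that is <= the target file offset:
---
    #include <stdint.h>

    /*
     * Pick the child whose subtree covers 'offset': the last key <= offset.
     * Purely illustrative; the kernel's xfs_iext_tree code differs in detail.
     */
    static int node_pick_child(const uint64_t *keys, int nr_keys, uint64_t offset)
    {
        int i;

        for (i = 1; i < nr_keys; i++)
            if (keys[i] > offset)
                break;
        return i - 1;
    }
---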

allocate block

How are blocks allocated for a regular file ?
Basic principles

For large files, we try to allocate extents initially near the inode
and afterwards near the existing block in the file which is closest to
the offset in the file for which we are allocating space.

That implies blocks will be allocated near the last block in the file
for sequential writers and near blocks in the middle of the file for
processes writing into holes.

Files and directories are not limited to allocating space within a single
allocation group.
xfs_bmap_btalloc
(tp->t_firstblock is NULLFSBLOCK, ignore filestream)

    //start block of new extent
    ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);

    xfs_bmap_adjacent

    1. If allocating at eof, and there's a previous real block,
       try to use its last block as our starting point.

       ---
        if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
            !isnullstartblock(ap->prev.br_startblock) &&
                ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
                ap->prev.br_startblock)) {
            ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
            /*
             * Adjust for the gap between prevp and us.
             */
            adjust = ap->offset -
                (ap->prev.br_startoff + ap->prev.br_blockcount);
            if (adjust &&
                ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
                ap->blkno += adjust;
        }
        ---

    2. If not at eof, then compare the two neighbor blocks.
       Figure out whether either one gives us a good starting point,
       and pick the better one.


    xfs_bmap_btalloc_nullfb
      -> args->type = XFS_ALLOCTYPE_START_BNO 

    if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) {
        if (!ap->offset) {
            args.alignment = stripe_align;
            atype = args.type;
            isaligned = 1;
            if (blen > args.alignment && blen <= args.maxlen)
                args.minlen = blen - args.alignment;
            args.minalignslop = 0;
        } else {

            /*
             * First try an exact bno allocation.
             * If it fails then do a near or start bno
             * allocation with alignment turned on.
             */

            atype = args.type;
            tryagain = 1;
            args.type = XFS_ALLOCTYPE_THIS_BNO;
            args.alignment = 1;
            ...
        }
    } else {
        args.alignment = 1;
        args.minalignslop = 0;
    }


The allocation fallback sequence in xfs_bmap_btalloc, for example:

    XFS_ALLOCTYPE_THIS_BNO
            |
            v
    XFS_ALLOCTYPE_NEAR_BNO/START_BNO
         alignment = 1
            |
            v
         alignment = 0
            |
            v
     XFS_ALLOCTYPE_FIRST_AG
     loop all of the ags
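
A condensed sketch of that ladder; try_alloc() here is a stand-in for the real
xfs_alloc_vextent() call and everything below is illustrative only:
---
    enum alloctype { THIS_BNO, NEAR_BNO, START_BNO, FIRST_AG };

    /* Stand-in for xfs_alloc_vextent(); always fails here so the whole
     * ladder is visible. Not a kernel API. */
    static int try_alloc(enum alloctype type, int alignment)
    {
        (void)type;
        (void)alignment;
        return -1;
    }

    static int allocate_with_fallback(enum alloctype atype, int stripe_align)
    {
        if (try_alloc(THIS_BNO, 1) == 0)            /* exact bno first */
            return 0;
        if (try_alloc(atype, stripe_align) == 0)    /* NEAR/START bno, stripe aligned */
            return 0;
        if (try_alloc(atype, 1) == 0)               /* retry without alignment */
            return 0;
        return try_alloc(FIRST_AG, 1);              /* last resort: loop all the AGs */
    }
---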


xfs_alloc_fix_freelist will do the following things:
1. check whether there is a contiguous free extent large enough for the allocation.
2. check whether there is enough free space left for the rest of the allocation:
   updating the bnobt and cntbt also needs free blocks.
3. fix up the agfl.

Before xfs_alloc_ag_vextent is invoked, xfs_alloc_fix_freelist is called to
check cases 1 and 2 and ensure there is enough free space.

However, even if there is enough free space, the allocation can still fail for
XFS_ALLOCTYPE_NEAR_BNO and XFS_ALLOCTYPE_THIS_BNO. In that case,
args->agbno will be NULLAGBLOCK.

Look at the code of xfs_alloc_vextent
---
    switch (type) {
    case XFS_ALLOCTYPE_THIS_AG:
    case XFS_ALLOCTYPE_NEAR_BNO:
    case XFS_ALLOCTYPE_THIS_BNO:
        args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
        args->pag = xfs_perag_get(mp, args->agno);
        error = xfs_alloc_fix_freelist(args, 0);
        ...
        if (!args->agbp)
            break;
        args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
        if ((error = xfs_alloc_ag_vextent(args)))
            goto error0;
        break;
    case XFS_ALLOCTYPE_START_BNO:

        /*
         * Try near allocation first, then anywhere-in-ag after
         * the first a.g. fails.
         */

        args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
        args->type = XFS_ALLOCTYPE_NEAR_BNO;
    case XFS_ALLOCTYPE_FIRST_AG:
        if (type == XFS_ALLOCTYPE_FIRST_AG) {
            args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
            args->type = XFS_ALLOCTYPE_THIS_AG;
            sagno = 0;
            flags = 0;
        } else {
            args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
            flags = XFS_ALLOC_FLAG_TRYLOCK;
        }
        for (;;) {
            args->pag = xfs_perag_get(mp, args->agno);
            error = xfs_alloc_fix_freelist(args, flags);
            if (args->agbp) {
                if ((error = xfs_alloc_ag_vextent(args)))
                    goto error0;
                break;
            }
            if (args->agno == sagno &&
                type == XFS_ALLOCTYPE_START_BNO)
                args->type = XFS_ALLOCTYPE_THIS_AG;

            /*
             * Reached the starting a.g., must either be done
             * or switch to non-trylock mode.
             */

            if (args->agno == sagno) {
                if (flags == 0) {
                    args->agbno = NULLAGBLOCK;
                    trace_xfs_alloc_vextent_allfailed(args);
                    break;
                }

                flags = 0;
                if (type == XFS_ALLOCTYPE_START_BNO) {
                    args->agbno = XFS_FSB_TO_AGBNO(mp,
                        args->fsbno);
                    args->type = XFS_ALLOCTYPE_NEAR_BNO;
                }
            }
            xfs_perag_put(args->pag);
        }
        break;
    default:
        ASSERT(0);
        /* NOTREACHED */
    }

    // Finally, whether the allocation succeeded or not

    if (args->agbno == NULLAGBLOCK)
        args->fsbno = NULLFSBLOCK;
    else {
        args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
    }

---
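
The XFS_FSB_TO_AGNO / XFS_FSB_TO_AGBNO / XFS_AGB_TO_FSB conversions used above
just split or join the AG number and the block offset within the AG. A
standalone sketch of the same arithmetic (agblklog corresponds to sb_agblklog):
---
    #include <stdint.h>

    /* fsbno = (agno << agblklog) | agbno */
    static inline uint64_t agb_to_fsb(unsigned agblklog, uint32_t agno, uint32_t agbno)
    {
        return ((uint64_t)agno << agblklog) | agbno;
    }

    static inline uint32_t fsb_to_agno(unsigned agblklog, uint64_t fsbno)
    {
        return (uint32_t)(fsbno >> agblklog);
    }

    static inline uint32_t fsb_to_agbno(unsigned agblklog, uint64_t fsbno)
    {
        return (uint32_t)(fsbno & ((1ULL << agblklog) - 1));
    }
---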

speculative preallocation

Preallocated blocks are normally reclaimed on file close, inode reclaim, unmount or
in the background once file write activity subsides.
xfs_icache_free_eofblocks
  -> __xfs_icache_free_eofblocks
    -> xfs_inode_ag_iterator_tag
      -> xfs_inode_ag_walk //iterate radix tree pag->pag_ici_root
        -> xfs_inode_free_eofblocks
          -> xfs_free_eofblocks // under XFS_IOLOCK_EXCL
            -> xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
                    XFS_ISIZE(ip), XFS_BMAPI_NODISCARD)

Free any blocks beyond EOF.

Besides the background work that triggers xfs_icache_free_eofblocks, we also
truncate the blocks beyond EOF when the file is closed.
To avoid applications that write and close a file repeatedly causing too much
fragmentation, there is a heuristic to detect that pattern.
xfs_release
---
    if (xfs_can_free_eofblocks(ip, false)) {

        /*
         * Check if the inode is being opened, written and closed
         * frequently and we have delayed allocation blocks outstanding
         * (e.g. streaming writes from the NFS server), truncating the
         * blocks past EOF will cause fragmentation to occur.
         *
         * In this case don't do the truncation, but we have to be
         * careful how we detect this case. Blocks beyond EOF show up as
         * i_delayed_blks even when the inode is clean, so we need to
         * truncate them away first before checking for a dirty release.
         * Hence on the first dirty close we will still remove the
         * speculative allocation, but after that we will leave it in
         * place.
         */

        if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
            return 0;
        /*
         * If we can't get the iolock just skip truncating the blocks
         * past EOF because we could deadlock with the mmap_sem
         * otherwise. We'll get another chance to drop them once the
         * last reference to the inode is dropped, so we'll never leak
         * blocks permanently.
         */
        if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
            error = xfs_free_eofblocks(ip);
            xfs_iunlock(ip, XFS_IOLOCK_EXCL);
            if (error)
                return error;
        }

        /* delalloc blocks after truncation means it really is dirty */
        if (ip->i_delayed_blks)
            xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
    }

---
Reference for dynamic speculative preallocation
link https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=055388a3188f56676c21e92962fc366ac8b5cb72

xfs: dynamic speculative EOF preallocation
Currently the size of the speculative preallocation during delayed
allocation is fixed by either the allocsize mount option or a
default size. We are seeing a lot of cases where we need to
recommend using the allocsize mount option to prevent fragmentation
when buffered writes land in the same AG.

Rather than using a fixed preallocation size by default (up to 64k),
make it dynamic by basing it on the current inode size. That way the
EOF preallocation will increase as the file size increases.  Hence
for streaming writes we are much more likely to get large
preallocations exactly when we need it to reduce fragmentation.

For default settings, the size of the initial extents is determined
by the number of parallel writers and the amount of memory in the
machine. For 4GB RAM and 4 concurrent 32GB file writes:

EXT: FILE-OFFSET           BLOCK-RANGE          AG AG-OFFSET                 TOTAL
   0: [0..1048575]:         1048672..2097247      0 (1048672..2097247)      1048576
   1: [1048576..2097151]:   5242976..6291551      0 (5242976..6291551)      1048576
   2: [2097152..4194303]:   12583008..14680159    0 (12583008..14680159)    2097152
   3: [4194304..8388607]:   25165920..29360223    0 (25165920..29360223)    4194304
   4: [8388608..16777215]:  58720352..67108959    0 (58720352..67108959)    8388608
   5: [16777216..33554423]: 117440584..134217791  0 (117440584..134217791) 16777208
   6: [33554424..50331511]: 184549056..201326143  0 (184549056..201326143) 16777088
   7: [50331512..67108599]: 251657408..268434495  0 (251657408..268434495) 16777088

and for 16 concurrent 16GB file writes:

 EXT: FILE-OFFSET           BLOCK-RANGE          AG AG-OFFSET                 TOTAL
   0: [0..262143]:          2490472..2752615      0 (2490472..2752615)       262144
   1: [262144..524287]:     6291560..6553703      0 (6291560..6553703)       262144
   2: [524288..1048575]:    13631592..14155879    0 (13631592..14155879)     524288
   3: [1048576..2097151]:   30408808..31457383    0 (30408808..31457383)    1048576
   4: [2097152..4194303]:   52428904..54526055    0 (52428904..54526055)    2097152
   5: [4194304..8388607]:   104857704..109052007  0 (104857704..109052007)  4194304
   6: [8388608..16777215]:  209715304..218103911  0 (209715304..218103911)  8388608
   7: [16777216..33554423]: 452984848..469762055  0 (452984848..469762055) 16777208

Because it is hard to take back speculative preallocation, cases
where there are large slow growing log files on a nearly full
filesystem may cause premature ENOSPC. Hence as the filesystem nears
full, the maximum dynamic prealloc size is reduced according to this
table (based on 4k block size):

freespace       max prealloc size
  >5%             full extent (8GB)
  4-5%             2GB (8GB >> 2)
  3-4%             1GB (8GB >> 3)
  2-3%           512MB (8GB >> 4)
  1-2%           256MB (8GB >> 5)
  <1%            128MB (8GB >> 6)

This should reduce the amount of space held in speculative
preallocation for such cases.

The allocsize mount option turns off the dynamic behaviour and fixes
the prealloc size to whatever the mount option specifies.
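
The throttle table maps directly onto a right-shift of the maximum prealloc
size. A hypothetical helper expressing the table above (not the kernel's
xfs_iomap code):
---
    /* Right-shift applied to the max prealloc size for a given free-space
     * percentage, following the table above. Illustrative only. */
    static unsigned int prealloc_shift(unsigned int free_pct)
    {
        if (free_pct >= 5)
            return 0;           /* full extent */
        if (free_pct < 1)
            return 6;           /* 8GB >> 6 == 128MB */
        return 6 - free_pct;    /* 1-2% -> 5, ..., 4-5% -> 2 */
    }
---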

per-ag metadata buffer



xfs_perag is maintained in a radix tree on xfs_mount->m_perag_tree

xfs_buf_find

pag->pag_buf_hash, protected by pag->pag_buf_lock, maintains the xfs_bufs of that AG

xfs_buf_t looks like XFS's own buffering mechanism.

xfs_buf_t -> xfs_buf_map [block num, size]
          -> b_pages[]   associated pages    (_xfs_buf_alloc)
          -> b_addr       virtual address in kernel (_xfs_buf_map_pages)

read in

xfs_buf_read_map
  -> xfs_buf_get_maps
  -> if !bp->b_flags & XBF_DONE
     _xfs_buf_read
       -> xfs_buf_submit_wait
         -> _xfs_buf_ioapply
         ---

    /* we only use the buffer cache for meta-data */

    op_flags |= REQ_META;

    /*
     * Walk all the vectors issuing IO on them. Set up the initial offset
     * into the buffer and the desired IO size before we start -
     * _xfs_buf_ioapply_vec() will modify them appropriately for each
     * subsequent call.
     */
    offset = bp->b_offset;
    size = BBTOB(bp->b_io_length);
    blk_start_plug(&plug);
    for (i = 0; i < bp->b_map_count; i++) {
        xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
        if (bp->b_error)
            break;
        if (size <= 0)
            break;    /* all done */
    }
    blk_finish_plug(&plug);
         ---
How is an xfs_buf_t converted to a bio ?
xfs_buf_ioapply_map
---
    sector_t    sector =  bp->b_maps[map].bm_bn;
    ...
    size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
    ...
next_chunk:
    atomic_inc(&bp->b_io_remaining);
    nr_pages = min(total_nr_pages, BIO_MAX_PAGES);

    bio = bio_alloc(GFP_NOIO, nr_pages);
    bio_set_dev(bio, bp->b_target->bt_bdev);
    bio->bi_iter.bi_sector = sector;
    bio->bi_end_io = xfs_buf_bio_end_io;
    bio->bi_private = bp;
    bio_set_op_attrs(bio, op, op_flags);

    for (; size && nr_pages; nr_pages--, page_index++) {
        int    rbytes, nbytes = PAGE_SIZE - offset;

        if (nbytes > size)
            nbytes = size;

        rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
                      offset);
        if (rbytes < nbytes)
            break;

        offset = 0;
        sector += BTOBB(nbytes);
        size -= nbytes;
        total_nr_pages--;
    }

    if (likely(bio->bi_iter.bi_size)) {
        if (xfs_buf_is_vmapped(bp)) {
            flush_kernel_vmap_range(bp->b_addr,
                        xfs_buf_vmap_len(bp));
        }
        submit_bio(bio);
        if (size)
            goto next_chunk;
    }
---

xfs buf lock

Lock and unlock scenarios.

b_hold

xfs_buf->b_hold is the reference count that decides the lifetime of an xfs_buf. Who holds and releases it ?

lifecycle of buf log item

When an xfs_buf is joined to a transaction, its buf log item holds a reference to it.

_xfs_trans_bjoin
  -> xfs_buf_item_init
    -> xfs_buf_hold(bp)
  -> xfs_trans_add_item

It will be released when the log item is freed.
xfs_buf_item_put
  -> xfs_buf_item_relse
    -> xfs_buf_rele(bp)
    -> xfs_buf_item_free
This introduces another topic: the xfs_buf_log_item's bli_refcount, i.e. who
holds it, and which paths may release a buf log item.

xfs_trans_binval

Quote from comment of xfs_trans_binval.

Invalidate a buffer that is being used within a transaction.
Typically this is because the blocks in the buffer are being freed, so we
need to prevent it from being written out when we're done.  Allowing it
to be written again might overwrite data in the free blocks if they are
reallocated to a file.
free an unused block of the btree
xfs_btree_free_block
  -> cur->bc_ops->free_block(cur, bp)

    // At this point, the buf is locked by us, but it could be pinned by others,
    // and the buf may be under IO to the log or to disk.

  -> xfs_trans_binval(cur->bc_tp, bp)

Process of freeing a block

xfs_bmbt_ops.free_block
xfs_bmbt_free_block
  -> xfs_bmap_add_free
    -> __xfs_bmap_add_free
      -> xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list)

xfs_extent_free_defer_type.finish_item

xfs_extent_free_finish_item
  -> xfs_trans_free_extent
    -> __xfs_free_extent
      -> xfs_free_extent_fix_freelist
      -> xfs_free_ag_extent
      -> xfs_extent_busy_insert

xfs_trans_binval prevents a metadata buffer whose block has been returned to free space from being written to disk.
A btree update removes a block from the btree.

xfs_btree_free_block
  -> cur->bc_ops->free_block(cur, bp)
    -> xfs_bmbt_free_block
      -> xfs_bmap_add_free
        -> xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
  -> xfs_trans_binval(cur->bc_tp, bp)


        [1]                         [2]                       [3]                 [4]
EFI log -> EFD log                  -> agl, agfl, cntbt bnobt -> remove from busy  -> flush update on
           Update free space bt        components buffer log     extent list          agl, agfl, cntbt   
           bnobt & cntbt               EFD log                                        bnobt to disk
           insert to busy extent list                            
           [5]                                                   
\___ ___/\________________________ ____________________________/ 
    V                             V                              
  Trans0                        Trans1                           
\____________________________ _________________________________/\_________ ________/\______ ______/
                             V                                            V                V
                 Context of fs operations                           log IO complete   AIL push context
                                                                     context
[1] xfs_trans_roll
[2] commit log
[3] log IO complete
[4] insert log item into AIL
[5] xfs_extent_free_finish_item
      -> xfs_trans_free_extent
        -> __xfs_free_extent
          -> xfs_free_ag_extent
          -> xfs_extent_busy_insert

When xfs_btree_free_block is invoked, the update of the btree has been
completed; IOW, nobody can reach the block through the btree anymore.

Moreover, when we update the btree, what preserves the serialisation ?

xfs_bmapi_write is invoked under xfs_ilock(ip, XFS_ILOCK_EXCL)

If the block used to be a block in a block map btree,
 - none of the updates saved in the buffer will be flushed to disk
 - none of the log records will be replayed during recovery
 - no one can reach it through the block map btree again

Someone may still look up the buffer of the freed block.
xfs_buf_find
---
    if (bp->b_flags & XBF_STALE) {
        bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
        bp->b_ops = NULL; 
    }
---

log

AIL

Active Item List, an LSN-sorted doubly linked list.
Items are inserted into this list during log buffer IO completion, after which
they are unpinned and can be written to disk.
CIL
Committed Item List, this list tracks log items that have been committed and have
formatted memory buffers attached to them. It tracks objects in transaction
commit order.



XFS
          changes on inode                 
                 | 
-----------------^------------------------------------------------------------------------------------------------------------
XFS log          | format[1]
                 v
           lvs on lip->li_lv                xlog_cil_push                                     xfsaild_push_item
           lip is pinned                    take lvs down from cil->xc_cil                    li_ops->push
           under XFS_ILOCK_EXCL             under down_write cil->xc_ctx_lock                 push the modifications to bp
                 down_read cil->xc_ctx_lock                                                   under XFS_ILOCK_SHARED
           insert lip on cil->xc_cil        write lvs to iclog[2] .---> xlog_cil_committed[3]       ||
           under spin cil->xc_cil_lock      write commit record  /      insert to ail->ail_head     ||  xfs_iflush_done
                                                   ||           /       unpin                       ||   ^ xfs_iunlock
                                                   \/          /                                    \/   |
--------------------------------------------------------------------------------------------------------------------------------
XFS buf                                      submit to disk                                    submit to disk
--------------------------------------------------------------------------------------------------------------------------------


[1]: xfs_log_commit_cil
[2]: xlog_write
[3]: commit record's IO completion callback, see xlog_cil_push

lv   : xfs_log_vec
lip  : xfs_log_item
cil  : xfs_cil
bp   : xfs_buf

log formatting

Why do we need the log formatting ?

XFS logging is a combination of logical and physical logging. Some objects,
such as inodes and dquots, are logged in logical format where the details
logged are made up of the changes to in-core structures rather than on-disk
structures. Other objects - typically buffers - have their physical changes
logged. The reason for these differences is to reduce the amount of log space
required for objects that are frequently logged. Some parts of inodes are more
frequently logged than others, and inodes are typically more frequently logged
than any other object (except maybe the superblock buffer) so keeping the
amount of metadata logged low is of prime importance.
xfs_trans_commit
  -> __xfs_trans_commit
    -> xfs_log_commit_cil
     -> xlog_cil_insert_items // down_read cil->xc_ctx_lock
       -> xlog_cil_insert_format_items
         -> iterate xfs_trans->t_items 
            lip->li_ops->iop_format// if xfs_log_item is set XFS_LI_DIRTY

            xfs_inode_item_format.

            Why is there no lock held here ? [1]
       -> acquire cil->xc_cil_lock
          iterate tp->t_items and move the dirty ones to cil->xc_cil
          Note:
          list_move_tail(&lip->li_cil, &cil->xc_cil);
          

[1]
For example:
  xfs_fs_commit_blocks
  ---
    
    xfs_ilock(ip, XFS_ILOCK_EXCL);
    
    xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
    xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

    xfs_setattr_time(ip, iattr);
    if (update_isize) {
        i_size_write(inode, iattr->ia_size);
        ip->i_d.di_size = iattr->ia_size;
    }

    xfs_trans_set_sync(tp);
    error = xfs_trans_commit(tp);
  ---
Where is the XFS_ILOCK_EXCL released ?
xfs_trans_commit
  -> __xfs_trans_commit
    -> xfs_trans_free_items
      -> iterate the tp->t_items
         detach item
               clear_bit(XFS_LI_DIRTY, &lip->li_flags);
            list_del_init(&lip->li_trans);
         unlock item
            lip->li_ops->iop_unlock
            xfs_inode_item_unlock
              
              -> xfs_iunlock(ip, iip->ili_lock_flags)
              
So we know that while we do log formatting, the inode is locked with XFS_ILOCK_EXCL.
The formatted buffer contains all the changes to the log item. This enables us to relog the item in memory and write it out asynchronously, without needing to relock the object that was modified, at the time it gets written into the log.
To achieve this, we also need a one-off xfs_log_vec each time. Only then can we do the formatting and the pushing to the log separately.
xfs_trans_commit
  -> __xfs_trans_commit
    -> xfs_log_commit_cil
      -> xlog_cil_alloc_shadow_bufs // allocate shadow buffer outside of
                                       cil->xc_ctx_lock to avoid deadlock when memory is low.
                                       more details, please refer to comment of
                                       xlog_cil_alloc_shadow_bufs.

      -> down_read(&cil->xc_ctx_lock)
      -> xlog_cil_insert_items
        -> xlog_cil_insert_format_items
          ---
    list_for_each_entry(lip, &tp->t_items, li_trans) {
        ...
        if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
            ...
        } else {
            /* switch to shadow buffer! */
            lv = shadow;
            lv->lv_item = lip;
            ...
        }
        ...
        lip->li_ops->iop_format(lip, lv);
    }
          ---
Regarding lip->li_lv, we have 3 cases, judging from the code above:
1. lip->li_lv is NULL (the item has not been logged since the last checkpoint): use the shadow buffer.
2. lip->li_lv exists and is large enough (shadow->lv_size <= lip->li_lv->lv_size): reuse it in place, the common overwrite case.
3. lip->li_lv exists but is too small: switch to the shadow buffer.

checkpoint

checkpoint of the log.

xlog_cil_push
    
    //down_write cil->xc_ctx_lock: 
    
    list_add(&ctx->committing, &cil->xc_committing) //under cil->xc_push_lock
       
    CIL Head
       |
       V
    Log Item <-> log vector 1    -> memory buffer
       |                -> vector array
       V
    Log Item <-> log vector 2    -> memory buffer
       |                -> vector array
       V
    ......
       |
       V
    Log Item <-> log vector N-1    -> memory buffer
       |                -> vector array
       V
    Log Item <-> log vector N    -> memory buffer
                    -> vector array

    
     - take the item down from the cil->xc_cil list.
     - take the active lv down from the item->li_lv.
    

    And after the flush the CIL head is empty, and the checkpoint context log
    vector list would look like:

    Checkpoint Context
       |
       V
    log vector 1    -> memory buffer
       |        -> vector array
       |        -> Log Item
       V
    log vector 2    -> memory buffer
       |        -> vector array
       |        -> Log Item
       V
    ......
       |
       V
    log vector N-1    -> memory buffer
       |        -> vector array
       |        -> Log Item
       V
    log vector N    -> memory buffer
            -> vector array
            -> Log Item


    new_ctx->sequence = ctx->sequence + 1;
    new_ctx->cil = cil;
    cil->xc_ctx = new_ctx;

    
    spin_lock(&cil->xc_push_lock);
    cil->xc_current_sequence = new_ctx->sequence;
    spin_unlock(&cil->xc_push_lock);

    //up_write cil->xc_ctx_lock

    
    xlog_write
    // write log vectors into log



    // There could be multiple context running concurrently here.
    // IOW, the checkpoints could be written out of order.
    // But the commit record must be in order strictly.

    spin_lock(&cil->xc_push_lock);
    list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
        /*
         * Higher sequences will wait for this one so skip them.
         * Don't wait for our own sequence, either.
         */
        if (new_ctx->sequence >= ctx->sequence)
            continue;
        if (!new_ctx->commit_lsn) {
            xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
            goto restart;
        }
    }
    spin_unlock(&cil->xc_push_lock);



    // commit the record

    xfs_log_done
      -> xlog_commit_record
        -> xlog_write // XLOG_REG_TYPE_COMMIT
    

    // attach checkpoint context to commit record log buffer

    ctx->log_cb.cb_func = xlog_cil_committed;
    ctx->log_cb.cb_arg = ctx;
    error = xfs_log_notify(commit_iclog, &ctx->log_cb);


    /*
     * now the checkpoint commit is complete and we've attached the
     * callbacks to the iclog we can assign the commit LSN to the context
     * and wake up anyone who is waiting for the commit to complete.
     */

    spin_lock(&cil->xc_push_lock);
    ctx->commit_lsn = commit_lsn;
    wake_up_all(&cil->xc_commit_wait);
    spin_unlock(&cil->xc_push_lock);



    // The commit maybe pushed to disk here.

    return xfs_log_release_iclog(log->l_mp, commit_iclog);
How do we ensure the log lvs are already on disk when the commit record is written ?
IOW, the log IOs may be in a different iclog from the commit record.
 - the iclog on which the log IOs are written may be synced to disk after the one
   on which the commit record is written.
 - the IOs could be reordered in the block layer or even in the storage device.
Don't we need to consider the reordering above ?

There are two points here:
1. the log callback mechanism ensures that when xlog_cil_committed is invoked,
   the previous log IOs have been completed (callbacks of different iclogs
   must be invoked in order of lsn).
2. do we really need to ensure the log IOs are on disk when we write the commit record ?
   Every iclog has a crc calculated by xlog_cksum;
   it helps us to recognize a corrupted log.

in core log


iclog

The iclog is maintained as a ring in log->l_iclog.

                           ---
                         /     \
         log->l_iclog-> |       |
                         \     /
                           ---
Get an available iclog space
We will try to get an available iclog from the log->l_iclog ring.
If it is not an ACTIVE one, all the iclog buffers have been exhausted and we have to wait.
See xlog_state_get_iclog_space
---
    // under log->l_icloglock
    iclog = log->l_iclog;
    if (iclog->ic_state != XLOG_STATE_ACTIVE) {
        XFS_STATS_INC(log->l_mp, xs_log_noiclogs);

        /* Wait for log writes to have flushed */
        xlog_wait(&log->l_flush_wait, &log->l_icloglock);
        goto restart;
    }
---
Note: xlog_state_get_iclog_space not only returns an available iclog, but also reserves space in it.
---

   // under log->l_icloglock
   // Has confirmed that this iclog has at least 2 * sizeof (xlog_op_header_t) available space.

   if (len <= iclog->ic_size - iclog->ic_offset) {

   // if the available space in this iclog is enough to carry the write, reserve the space.

      *continued_write = 0;
      iclog->ic_offset += len; // reserve our requested space
   } else {

   // otherwise, invoke xlog_state_switch_iclogs to switch this iclog from ACTIVE to WANT_SYNC.
   // Then no one can get it again. We hold a reference to it, so it will not be synced yet.

      *continued_write = 1;
      xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
   }
---
   when continued_write is true, the iclog->ic_offset will be modified by
   xlog_write_copy_finish
     -> xlog_state_finish_copy

   then xlog_state_release_iclog will be invoked and the iclog will be synced.

So we know one iclog can be used by multiple writers.

iclog block number

The iclogs' bps are allocated in xlog_alloc_log via xfs_buf_get_uncached, so no
block number for the IO is attached there. Where does it come from ?
xlog_state_get_iclog_space
---
   atomic_inc(&iclog->ic_refcnt);   /* prevents sync */
   log_offset = iclog->ic_offset;

    // ic_offset is the current number of bytes written to in this iclog.
    // if ic_offset is zero, this is the first write to this iclog.

   if (log_offset == 0) {
      ticket->t_curr_res -= log->l_iclog_hsize;
      xlog_tic_add_region(ticket,
                log->l_iclog_hsize,
                XLOG_REG_TYPE_LRHEADER);
      head->h_cycle = cpu_to_be32(log->l_curr_cycle);
      head->h_lsn = cpu_to_be64(
         xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
      ASSERT(log->l_curr_block >= 0);
   }
---
The l_curr_block is advanced in xlog_state_switch_iclogs
---
   log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
   ...
   if (log->l_curr_block >= log->l_logBBsize) {
      log->l_curr_block -= log->l_logBBsize;
      ASSERT(log->l_curr_block >= 0);
      smp_wmb();
      log->l_curr_cycle++;
      if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
         log->l_curr_cycle++;
   }
---
Finally, the block number will be filled into the bp in xlog_sync with XFS_BUF_SET_ADDR
sync an iclog
Every time we want to flush out an in-core log, its state is switched to WANT_SYNC. At the same time, the iclog ring is advanced.
xlog_state_switch_iclogs 
---
    log->l_iclog = iclog->ic_next;
---
Then when do we want to sync an iclog ? An iclog is synced to disk when its reference count drops to zero. The reference is released by xlog_state_release_iclog
---
    if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
        return 0;
        ...
    if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
        /* update tail before writing to iclog */
        xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
        sync++;
        iclog->ic_state = XLOG_STATE_SYNCING;
        iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
        xlog_verify_tail_lsn(log, iclog, tail_lsn);
        /* cycle incremented when incrementing curr_block */
    }
    spin_unlock(&log->l_icloglock);
    ...
    if (sync)
        return xlog_sync(log, iclog);
---
One question: do we really need to hold the global log->l_icloglock for every iclog ?

Submit the iclog to disk
xlog_state_release_iclog
  -> xlog_sync
    -> bp = iclog->ic_bp;
      XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
      XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
    -> bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
      -> xlog_bdstrat
        -> xfs_buf_lock(bp)
          -> xfs_buf_submit

completion path
xfs_buf_bio_end_io
  -> xfs_buf_ioend_async
    -> queue work xfs_buf_ioend_work
xfs_buf_ioend_work
  -> xfs_buf_ioend
    -> bp->b_iodone
       xlog_iodone
         -> xlog_state_done_syncing
           -> set iclog->ic_state to XLOG_STATE_DONE_SYNC  //under log->l_icloglock (a global one ?)
           -> xlog_state_do_callback
         -> xfs_buf_unlock

do callbacks
The callbacks must be performed in order of lsn.
The work here includes not only invoking the attached callbacks, but also xlog_state_clean_log. The following code segment ensures the order:
xlog_state_do_callback
---
            if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
                ...
                lowest_lsn = xlog_get_lowest_lsn(log);
                if (lowest_lsn &&
                    XFS_LSN_CMP(lowest_lsn,
                        be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
                    iclog = iclog->ic_next;
                    continue;
                }
---

iclog ring and physical log space

iclog ring:

The biggest total iclog size is 8 * 256K == 2M.
But this is not the real physical log space.
The physical log space is bounded by:
   max: XFS_MAX_LOG_BLOCKS blocks (1024 * 1024) or XFS_MAX_LOG_BYTES (2 * 1024 * 1024 * 1024, i.e. 2G)
   min: XFS_MIN_LOG_BLOCKS blocks (512) or XFS_MIN_LOG_BYTES (10 * 1024 * 1024, i.e. 10M)
iclog ring:
                            <--.
                           ---  \
                       O /     \
         log->l_iclog-> |       |
                         \ N   /
                      \    ---
               rotate  '-->

rotate: it is done by xlog_state_switch_iclogs
O     : older
N     : newer

Where is the iclog space allocated ?
xlog_write
  -> xlog_state_get_iclog_space
    -> get an XLOG_STATE_ACTIVE iclog and reserve space in it.
    ---
        //under log->l_icloglock
        iclog->ic_offset += len;
    ---
    if the log->l_iclog is not ACTIVE, wait for it.
      -> xlog_wait on log->l_flush_wait

Where is the iclog space freed ?

xlog_iodone
  -> xlog_state_done_syncing
    -> xlog_state_do_callback
      -> xlog_state_clean_log
    -> wake_up_all(&log->l_flush_wait);

In conclusion, at any moment there can be at most 2M of log IO in flight.
      .-->
     /   ---                     
       /     \                   
      | iclog |                  
       \     /                   
         ---  /                  
          <--'                   
|------------------------------------------------|
           physical log space

The iclog ring will roll forward in xlog_state_switch_iclogs.
A key aspect of the XFS transaction subsystem is that most transactions are asynchronous. That is,
they don't commit to disk until either a log buffer is filled (a log buffer can
hold multiple transactions) or a synchronous operation forces the log buffers
holding the transactions to disk. This means that XFS is doing aggregation of
transactions in memory - batching them, if you like - to minimise the impact
of the log IO on transaction throughput.

The limitation on asynchronous transaction throughput is the number and size of
log buffers made available by the log manager. By default there are 8 log
buffers available and the size of each is 32kB - the size can be increased up
to 256kB by use of a mount option.

Effectively, this gives us the maximum bound of outstanding metadata changes
that can be made to the filesystem at any point in time - if all the log
buffers are full and under IO, then no more transactions can be committed until
the current batch completes. It is now common for a single current CPU core to
be able to issue enough transactions to keep the log buffers full and under
IO permanently. Hence the XFS journalling subsystem can be considered to be IO
bound.
[Life Cycle of an iclog]
XLOG_STATE_WANT_SYNC
  xlog_state_switch_iclogs
  ---
    iclog->ic_state = XLOG_STATE_WANT_SYNC;
    ...
    /* roll log?: ic_offset changed later */
    log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
    ...
    if (log->l_curr_block >= log->l_logBBsize) {
        log->l_curr_block -= log->l_logBBsize;
        smp_wmb();
        log->l_curr_cycle++;
    }
    log->l_iclog = iclog->ic_next;
  ---

XLOG_STATE_SYNCING
  xlog_state_release_iclog
    -> if XLOG_STATE_WANT_SYNC
       set SYNCING
    -> xlog_sync


XLOG_STATE_DONE_SYNC
  xlog_state_done_syncing


XLOG_STATE_DO_CALLBACK
  xlog_state_do_callback

XLOG_STATE_CALLBACK
  xlog_state_do_callback
  //has been the lowest lsn and ready to do callback.

XLOG_STATE_DIRTY
  xlog_state_do_callback
  //after complete the callbacks

XLOG_STATE_ACTIVE
  xlog_state_do_callback
    -> xlog_state_clean_log

iclog format

Every iclog has its header.

[1][2]    [3]  
|--Header--|----Payload---|

[1] iclog->ic_data = bp->b_addr
[2] ic_data is xlog_in_core_2_t
  typedef union xlog_in_core2 {
         xlog_rec_header_t    hic_header;
      xlog_rec_ext_header_t    hic_xheader;
      char            hic_sector[XLOG_HEADER_SIZE];
  } xlog_in_core_2_t;
  #define ic_header    ic_data->hic_header
[3] iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;

(xlog_alloc_log)

The first u32 of each log sector must contain the cycle number.
Since log item buffers are formatted without regard to this requirement,
the original contents of the first four bytes of each sector in the log
are copied into the corresponding element of this array. After that, the
first four bytes of those sectors are stamped with the cycle number. This
process is reversed at recovery time. If there are more sectors in this
log record than there are slots in this array, the cycle data continues
for as many sectors are needed; each sector is formatted as type xlog_rec_ext_header.

xlog_sync
  -> xlog_pack_data
  ---
    cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);

    dp = iclog->ic_datap;
    for (i = 0; i < BTOBB(size); i++) {
        if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
            break;
        iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
        *(__be32 *)dp = cycle_lsn;
        dp += BBSIZE;
    }
  ---
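
At recovery time this stamping is reversed (cf. xlog_unpack_data). A standalone
sketch of the reversal:
---
    #include <stdint.h>

    #define BBSIZE 512    /* basic block (sector) size */

    /* Put the saved first word of each sector back in place, undoing the
     * cycle-number stamping shown above. Illustrative only. */
    static void unpack_cycle_data(char *dp, int nsectors, const uint32_t *saved)
    {
        int i;

        for (i = 0; i < nsectors; i++) {
            *(uint32_t *)dp = saved[i];
            dp += BBSIZE;
        }
    }
---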

The cycle starts from 1 (xlog_alloc_log: log->l_curr_cycle = 1)

       tail      head
       (c 200)   (c 200)
        |         |
        v         v
   |-----------------------------|
        \____ ____/
             v
     non-checkpointed log

non-checkpointed: the metadata associated with the log has not been flushed to
                  disk
We should try to do recovery from the tail.
This tail is recorded in the iclog header, xlog_rec_header_t.h_tail_lsn.

xlog_state_release_iclog
---
    if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
        /* update tail before writing to iclog */
        xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
        sync++;
        iclog->ic_state = XLOG_STATE_SYNCING;
        iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
        xlog_verify_tail_lsn(log, iclog, tail_lsn);
        /* cycle incremented when incrementing curr_block */
    }
---

This is a perfect position to update the tail lsn of the log.
It ensures the tail of the log is updated whenever we overwrite the
checkpointed area of the log (the h_tail_lsn is in the iclog which is about to
be flushed to the log).


Unlike jbd2, which has a superblock for the journal, xfs records the tail of the
log in every LR. This saves some extra IO outside of the LR.
But the trouble is how to find the tail during recovery.

Even if we could search the log for the xlog_rec_header_t's magic
0xFEEDbabe, there could be multiple xlog_rec_header_t structures found, and
they may have different tail lsns. What we need is the last one.
How do we find it ?

Consider that we have a cycle number recorded at the head of every sector.

                 End
                  |
                  v
|200|200|200|200|200|199|199|199|199|

                                 End
                                  |
                                  v
|200|200|200|200|200|200|200|200|200|


What we do is search backward from the End for the xlog_rec_header_t's
magic.

xlog_find_tail
  -> xlog_find_head
  -> xlog_rseek_logrec_hdr

flush to disk

After the commit record IO completes, the log items associated with the lvs in the ctx are added to the AIL and, at the same time, unpinned.

xlog_cil_committed (completion callback of commit record)
  -> xfs_trans_committed_bulk //ctx->lv_chain (xlog_cil_push put the lvs on it)
    -> iterate the lvs
      -> lip->li_ops->iop_committed()
      -> xfs_log_item_batch_insert
        -> insert the log items onto xfs_ail->ail_head sorted by lsn.
        -> lip->li_ops->iop_unpin()

Why do we need this pin/unpin here ?
A log item cannot be flushed to disk before its log IO has completed.
The pin/unpin pair achieves this.
The log item is pinned in
xlog_cil_insert_format_items
  -> lip->li_ops->iop_format()
  -> xfs_cil_prepare_item
    -> lv->lv_item->li_ops->iop_pin()

This is done under item lock.
For inode, it is xfs_ilock, for example:
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

Then unlocked by xfs_inode_item_unlock

In addition, a log item can be relogged while it is on the CIL or even the AIL.
Then it will not be pushed to disk until the pin count drops to zero.

Once a log item is on the AIL, it can be pushed to disk.
xfsaild
  -> xfsaild_push
    -> xfsaild_push_item
      -> lip->li_ops->iop_push
         xfs_inode_item_push
           -> if xfs_ipincount(ip) > 0 //it could be relogged.
              return XFS_ITEM_PINNED
           -> xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)
           -> xfs_iflush
             -> xfs_imap_to_bp  // get the buffer containing the on-disk inode.
             -> xfs_iflush_int  // flush the dirty part of inode into on-disk inode.
                                // the on-disk inode is on
                                // the xfs_buf of the inode
               -> xfs_buf_attach_iodone //xfs_iflush_done
           -> xfs_buf_delwri_queue
              insert the bp into xfs_ail->ail_buf_list
    -> xfs_buf_delwri_submit_nowait
      -> xfs_buf_delwri_submit_buffers
      ---
    list_sort(NULL, buffer_list, xfs_buf_cmp);

    blk_start_plug(&plug);
    list_for_each_entry_safe(bp, n, buffer_list, b_list) {
        if (!wait_list) {
            if (xfs_buf_ispinned(bp)) {
                pinned++;
                continue;
            }
            if (!xfs_buf_trylock(bp))
                continue;
        } else {
            xfs_buf_lock(bp);
        }
        ...
        bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
        bp->b_flags |= XBF_WRITE | XBF_ASYNC;
        if (wait_list) {
            xfs_buf_hold(bp);
            list_move_tail(&bp->b_list, wait_list);
        } else
            list_del_init(&bp->b_list);

        xfs_buf_submit(bp);
    }
    blk_finish_plug(&plug);
      ---
When the IO is completed:
xfs_buf_bio_end_io
  -> xfs_buf_ioend_async
    -> queue work xfs_buf_ioend_work
xfs_buf_ioend_work
  -> xfs_buf_ioend
    -> bp->b_iodone
       xfs_iflush_done // attached by xfs_iflush_int
       ---
    list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) {
        if (lip->li_cb != xfs_iflush_done)
            continue;

        list_move_tail(&blip->li_bio_list, &tmp);
        ...
    }
    ...
    list_for_each_entry_safe(blip, n, &tmp, li_bio_list) {
        list_del_init(&blip->li_bio_list);
        iip = INODE_ITEM(blip);
        iip->ili_logged = 0;
        iip->ili_last_fields = 0;
        xfs_ifunlock(iip->ili_inode); 
    }
       ---

LSN

Where does the lsn come from ?

xlog_state_get_iclog_space
---

    // Under log->l_icloglock
    // On the 1st write to an iclog, figure out lsn. 

    if (log_offset == 0) {
        ticket->t_curr_res -= log->l_iclog_hsize;
        xlog_tic_add_region(ticket,
                    log->l_iclog_hsize,
                    XLOG_REG_TYPE_LRHEADER);
        head->h_cycle = cpu_to_be32(log->l_curr_cycle);
        head->h_lsn = cpu_to_be64(
            xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
    }
---

log->l_curr_block and log->l_curr_cycle
xlog_state_switch_iclogs //Under log->l_icloglock
---
    log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
    ...
    if (log->l_curr_block >= log->l_logBBsize) {
        log->l_curr_block -= log->l_logBBsize;
        ASSERT(log->l_curr_block >= 0);
        smp_wmb();
        log->l_curr_cycle++;
        if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
            log->l_curr_cycle++;
    }
---
So we know the lsn is (cycle << 32 | block).
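
The packing matches the kernel's xlog_assign_lsn / CYCLE_LSN / BLOCK_LSN
helpers; a standalone sketch:
---
    #include <stdint.h>

    /* cycle in the high 32 bits, block number in the low 32 bits */
    static inline uint64_t assign_lsn(uint32_t cycle, uint32_t block)
    {
        return ((uint64_t)cycle << 32) | block;
    }

    #define CYCLE_LSN(lsn)  ((uint32_t)((lsn) >> 32))
    #define BLOCK_LSN(lsn)  ((uint32_t)(lsn))
---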
Log space and cycle

             current
                |
                v
  |------------------------------|   cycle 200


    current  original
  - ->|         | - - - - - - - -
      v         v
  |------------------------------|   cycle 201

order of AIL

An xfs_cil_ctx contains, among other things, the checkpoint's start lsn.

The ctx->start_lsn here is used as the lsn when inserting the log items into the AIL.
xlog_cil_committed
  -> xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
                    ctx->start_lsn, abort)
    -> xfs_trans_ail_update_bulk
    ---
        for (i = 0; i < nr_items; i++) {
            struct xfs_log_item *lip = log_items[i];

            // if the log item has been on AIL, we may need to reposition it.
            // this is for the relogging case.

            if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
                /* check if we really need to move the item */
                if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
                    continue;

                xfs_ail_delete(ailp, lip);
                if (mlip == lip)
                    mlip_changed = 1;
            }

            // a log item's lsn is set here.

            lip->li_lsn = lsn;
            list_add(&lip->li_ail, &tmp);
        }

        // Queue on the AIL list

        if (!list_empty(&tmp))
            xfs_ail_splice(ailp, cur, &tmp, lsn);

        // If the minimum lsn of the AIL list is changed, it indicates the log
        // space is moved forward, try to wake up the waiters.

        if (mlip_changed) {
            ...
            xfs_log_space_wake(ailp->ail_mount);
        }
    ---
Question:
The AIL is sorted by lsn. What about the order of pushing ?
Look at the two helper interfaces in xfsaild_push:
 - xfs_trans_ail_cursor_first
 - xfs_trans_ail_cursor_next
And also the ailp->ail_last_pushed_lsn

Basically, the entries in the AIL list are pushed in order of lsn.

The lsn should be monotonically increasing, but the pushing of log items on the AIL
list may be out of order.
We know that the callback of the commit record, xlog_cil_committed, is invoked in
the order of lsn. But note that this is the lsn of the commit record, not
of the log lvs. The log lvs and the commit record may have different lsns, and
other items may be inserted between them.

Why do we need the ail_last_pushed_lsn ?
The AIL list is sorted by lsn, which combines the log-space block and cycle. It is not
the block number of the real IO under the log item, so it is meaningless to
push the items continuously; ail_last_pushed_lsn records where the next scan resumes.

log space


left log space

Log space

       tail      head
       (c 200)   (c 200)
        |         |
        v         v
   |-----------------------------|
        \____ ____/
             v
            used

                 head     tail
                 (c 201)  (c 200)
                  |        |
                  v        v
   |-----------------------------|
    \______ ______/        \__ __/
           v                  v
          used               used

Both the head and the tail are pushed forward.
 - pushing the Tail means free
 - pushing the Head means allocate
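
A sketch of how the free space falls out of the (cycle, block) head/tail pairs,
in the spirit of xlog_space_left() (standalone illustration, not the kernel
function):
---
    #include <stdint.h>

    /* Free space (in basic blocks) between head and tail;
     * log_bbsize is the log size in basic blocks. */
    static int space_left(int log_bbsize,
                          uint32_t head_cycle, uint32_t head_block,
                          uint32_t tail_cycle, uint32_t tail_block)
    {
        if (tail_cycle == head_cycle)           /* head has not wrapped past the tail */
            return log_bbsize - (int)(head_block - tail_block);
        if (tail_cycle + 1 == head_cycle)       /* head wrapped once */
            return (int)(tail_block - head_block);
        return -1;                              /* corruption: head overran the tail */
    }
---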

log space accounting


Log format

Log space:

<          reg         > <          reg         > <          reg         >

Log format of a transaction[0]:

      < start-oph >< oph >< trans-hdr >< oph >< reg1 >< oph >...< commit-oph >
      [1]                                                         [2]

[0]
   The transaction here is not the one in xfs_trans_commit; it is the ctx in xlog_cil_push.
   It can include the lvs of a couple of xfs_trans:

   xfs_trans->t_items - li - li - li
                         \    \    \
                          lv   lv   lv

   ctx->lv_chain - lv -lv -lv

[1]
   xlog_cil_push
     -> xlog_write
       -> xlog_write_start_rec
          clear XLOG_TIC_INITED of ctx->ticket

[2]
   xlog_cil_push
     -> xfs_log_done //ctx->ticket->t_flags & XLOG_TIC_INITED == 0
       -> xlog_commit_record


Actual log space accounting.

log space reserve

The log reservation steps:

Let's look at some examples:
            bash-1440  [005] ....   101.856925: xlog_ticket_alloc: input 254080 calc 267016 cnt 2
            bash-1440  [005] ....   101.857868: xlog_ungrant_log_space: curr 256644 unit 267016 cnt 1
            bash-1440  [005] ....   101.858341: xlog_ticket_alloc: input 326016 calc 340524 cnt 8
            bash-1440  [005] ....   101.858373: xlog_regrant_reserve_log_space: curr 340524 unit 340524 cnt 7
            bash-1440  [005] ....   101.858381: xlog_ungrant_log_space: curr 340524 unit 340524 cnt 6
     kworker/4:1-186   [004] ....   105.789587: xlog_ticket_alloc: input 0 calc 9268 cnt 1
     kworker/4:1-186   [004] ...1   105.789720: xlog_verify_iclog_lsn.isra.20: iclog b 120 c 1 tail b 120 c 1
     kworker/4:1-186   [004] ....   105.789730: xlog_ungrant_log_space: curr 8704 unit 9268 cnt 0
              cp-1546  [003] ....   110.256017: xlog_ticket_alloc: input 254080 calc 267016 cnt 2
              cp-1546  [003] ....   110.256160: xlog_ungrant_log_space: curr 256624 unit 267016 cnt 1
              cp-1546  [003] ....   110.256640: xlog_ticket_alloc: input 178936 calc 190824 cnt 8
              cp-1546  [003] ....   110.257872: xlog_ungrant_log_space: curr 190012 unit 190824 cnt 7
              cp-1546  [003] ....   110.258000: xlog_ticket_alloc: input 760 calc 10028 cnt 0
     kworker/6:1-68    [006] ....   110.258781: xlog_ungrant_log_space: curr 10028 unit 10028 cnt 0
              cp-1546  [003] ....   110.259266: xlog_ticket_alloc: input 5752 calc 15020 cnt 0
              cp-1546  [003] ....   110.259303: xlog_ungrant_log_space: curr 15020 unit 15020 cnt 0
              cp-1546  [003] ....   110.259326: xlog_ticket_alloc: input 178936 calc 190824 cnt 8
              cp-1546  [003] ....   110.259358: xlog_regrant_reserve_log_space: curr 190720 unit 190824 cnt 7
              cp-1546  [003] ....   110.259402: xlog_regrant_reserve_log_space: curr 190620 unit 190824 cnt 6
              cp-1546  [003] ....   110.259429: xlog_regrant_reserve_log_space: curr 190768 unit 190824 cnt 5
              cp-1546  [003] ....   110.259433: xlog_ungrant_log_space: curr 190824 unit 190824 cnt 4
              cp-1546  [003] ....   110.259441: xlog_ticket_alloc: input 5752 calc 15020 cnt 0
              cp-1546  [003] ....   110.259448: xlog_ungrant_log_space: curr 15020 unit 15020 cnt 0
              cp-1546  [003] ....   110.259448: xlog_ungrant_log_space: curr 15020 unit 15020 cnt 0
     kworker/4:1-186   [004] ....   136.508683: xlog_ticket_alloc: input 0 calc 9268 cnt 1
     kworker/4:1-186   [004] ...1   136.508693: xlog_verify_iclog_lsn.isra.20: iclog b 128 c 1 tail b 120 c 1
     kworker/4:1-186   [004] ....   136.508702: xlog_ungrant_log_space: curr 8704 unit 9268 cnt 0

     kworker/4:2-370   [004] ....   197.948672: xlog_ticket_alloc: input 4224 calc 13492 cnt 1
     kworker/4:2-370   [004] ....   197.948727: xlog_ungrant_log_space: curr 3792 unit 13492 cnt 0

Most of the time, the reserved log space turns out to be more than what is actually used.

intent log


Log Items

    Transaction Headers
    Intent to Free an Extent                (EFI) \
    Completion of Intent to Free an Extent  (EFD)  |
    Reverse Mapping Updates Intent          (RUI)  |
    Completion of Reverse Mapping Updates   (RUD)  |
    Reference Count Updates Intent          (CUI)   > Intent Log
    Completion of Reference Count Updates   (CUD)  |
    File Block Mapping Intent               (BUI)  |
    Completion of File Block Mapping Updates(BUD) /
    Inode Updates
    Inode Data Log Item
    Buffer Log Item
    Buffer Data Log Item
    Update Quota File
    Quota Update Data Log Item
    Disable Quota Log Item
    Inode Creation Log Item

+--------------+
|xfs defer ops |
+--------------+--------+
|       xfs log         |
+-----------------------+----------------+
|                xfs buf                 |
-----------------------------------------+
               submit_bio

deferred operations

What is the deferred operation mechanism for ?

The deferred operations in XFS are a kind of intent logging mechanism.
Intent logging means the log records the intent of an operation.
If a failure occurs, then when the system recovers, it can use the intent log to
detect which operations were still in progress at the time of the failure, and use it
to help recover, usually by either undoing a partially completed operation
or by redoing one that still needs to be completed.
                     XFS_DEFER_OPS_TYPE_REFCOUNT
                     |       XFS_DEFER_OPS_TYPE_RMAP,
                     |       |
xfs_trans->t_dfops - dfp0 - dfp1
                     |
                     |
                     dfp_work - ri0 - ri1 - ri2
                                type = XFS_REFCOUNT_INCREASE
                                startblock
                                blockcount
dfp  xfs_defer_pending
ri   xfs_refcount_intent
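
A hypothetical sketch of the intent/done pairing this implements; log_intent(),
do_work() and log_done() are stand-ins, not kernel APIs:
---
    struct intent {
        unsigned long startblock;
        unsigned long blockcount;
    };

    /* Stand-ins for logging an intent item (e.g. EFI), doing the actual
     * work, and logging the matching done item (e.g. EFD). */
    static void log_intent(struct intent *i) { (void)i; }
    static void do_work(struct intent *i)    { (void)i; /* e.g. xfs_free_ag_extent */ }
    static void log_done(struct intent *i)   { (void)i; }

    /* During recovery, an intent without a matching done item marks an
     * unfinished operation that must be replayed or cancelled. */
    static void finish_deferred(struct intent *i)
    {
        log_intent(i);  /* logged in one transaction (xfs_defer_create_intents) */
        /* the transaction rolls here (xfs_defer_trans_roll) */
        do_work(i);     /* finished in the following transaction (finish_item) */
        log_done(i);
    }
---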
defer operations callbacks
All of the defer operation callbacks are stored in defer_op_types[].
Take the refcount deferred operation as an example.
Code path

xfs_refcount_increase_extent
  -> __xfs_refcount_add
    -> xfs_defer_add

xfs_defer_finish
  -> xfs_defer_finish_noroll
    -> xfs_defer_create_intents

        //Create intent log for the items linked in this transaction.

       ---
        list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
            dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
                    dfp->dfp_count) [1];
            list_sort(tp->t_mountp, &dfp->dfp_work,
                    dfp->dfp_type->diff_items [2]);
            list_for_each(li, &dfp->dfp_work)
                dfp->dfp_type->log_item(tp, dfp->dfp_intent, li) [3];
        }
       ---
    -> xfs_defer_trans_roll //commit the logs to cil
    ->
    ---

        //Create an intent-done log

        dfp->dfp_done = dfp->dfp_type->create_done [4](*tp, dfp->dfp_intent,
                dfp->dfp_count);
        cleanup_fn = dfp->dfp_type->finish_cleanup;

        /* Finish the work items. */
        state = NULL;
        list_for_each_safe(li, n, &dfp->dfp_work) {
            list_del(li);
            dfp->dfp_count--;
            error = dfp->dfp_type->finish_item [5](*tp, li,
                    dfp->dfp_done, &state);
            ...
            }
        if (cleanup_fn)
            cleanup_fn [6](*tp, state, error);
    ---

intent log recover

The basic principle is: during recovery, an intent item without a matching done
item marks an unfinished operation; the recovery code recreates the deferred
work from the intent and finishes it.

relog


What is relogging ?

XFS allows multiple separate modifications to a single object to be carried
in the log at any given time. This allows the log to avoid needing to flush
each change to disk before recording a new change to the object.


    lvs on lip          take down the lvs     commit record is completed     AIL handling
    lip is pined  ->    write to iclog    ->  insert to AIL, unpin        -> if not pinned, flush to disk
                        submit to disk                                        
    (format)            (xlog_write)          (xlog_cil_committed)
                        \____________________ ______________________/
                                             V
                                      Log IO is ongoing

The relogging here is used to implement long-running, multiple-commit transactions.
The multiple-commit part is obvious here.
But what does long-running mean?
IMO, it means logging the same object repeatedly over a long period.
We always need to roll the transaction with xfs_trans_roll to move log items forward in the log, to avoid being blocked by our own log items when the log wraps around.
xfs_trans_roll commits the previous transaction to the log and allocates a new one for the next step.
But if we relog repeatedly, won't the log item be pinned again so that it cannot be
flushed to disk, never removed from the AIL list, and its log space never freed?
We might then hang forever waiting for log space.

Refer to the ordering of the AIL:
No such issue. When a log item is relogged, it gets a new lsn and is
repositioned in the AIL based on that new lsn. The original log space will
eventually be released.
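
A minimal sketch of that repositioning, assuming a toy AIL kept as a list sorted by lsn (the real AIL is more elaborate, but the tail-advancing effect is the same):

---
#include <stdio.h>

/* Toy AIL: items sorted ascending by lsn; the head defines the log tail. */
struct item { long lsn; struct item *next; };

static struct item *ail_insert(struct item *head, struct item *it)
{
    struct item **pp = &head;
    while (*pp && (*pp)->lsn <= it->lsn)
        pp = &(*pp)->next;
    it->next = *pp;
    *pp = it;
    return head;
}

static struct item *ail_remove(struct item *head, struct item *it)
{
    struct item **pp = &head;
    while (*pp && *pp != it)
        pp = &(*pp)->next;
    if (*pp)
        *pp = it->next;
    return head;
}

int main(void)
{
    struct item a = { 100, NULL }, b = { 200, NULL };
    struct item *ail = NULL;

    ail = ail_insert(ail, &a);
    ail = ail_insert(ail, &b);
    printf("log tail lsn: %ld\n", ail->lsn);   /* 100 */

    /* relog item a: it moves forward to a new lsn ... */
    ail = ail_remove(ail, &a);
    a.lsn = 300;
    ail = ail_insert(ail, &a);

    /* ... so the tail advances and the lsn-100 log space can be reused */
    printf("log tail lsn: %ld\n", ail->lsn);   /* 200 */
    return 0;
}
---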

Delayed log


We have two points about the delayed log: the delayed log on the CIL (below), and
the delayed metadata writeout on the AIL (next).

Delayed log on CIL

Delayed metadata writeout on AIL
xfs_log_reserve/regrant
  -> xlog_grant_push_ail

  // push on the buffer cache code if we ever use more 
  // than 75% of the on-disk log space

    -> xfs_ail_push
      -> wake_up_process(ailp->ail_task)

xfsaild
  -> xfsaild_push
  ---
    while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
        lock_result = xfsaild_push_item(ailp, lip);
        switch (lock_result) {
        case XFS_ITEM_SUCCESS:
            ailp->ail_last_pushed_lsn = lsn;
            break;

        case XFS_ITEM_FLUSHING:
            flushing++;
            ailp->ail_last_pushed_lsn = lsn;
            break;

        case XFS_ITEM_PINNED:
            stuck++;
            ailp->ail_log_flush++;
            break;
        case XFS_ITEM_LOCKED:
            stuck++;
            break;
        default:
            ASSERT(0);
            break;
        }

        count++;

        if (stuck > 100)
            break;

        lip = xfs_trans_ail_cursor_next(ailp, &cur);
        if (lip == NULL)
            break;
        lsn = lip->li_lsn;
    }
    xfs_trans_ail_cursor_done(&cur);
    spin_unlock(&ailp->ail_lock);


    //accumulate the IOs and write out together

    if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
        ailp->ail_log_flush++;
  ---

push log


cil->xc_push_seq // The seq that needs to be pushed.

Every time xlog_cil_push runs, it will do the following things,
[1] under down_write of cil->xc_ctx_lock
[2] under spin_lock of cil->xc_push_lock

miscs


In one transaction, there can be only one modification record per log item.

The "one transaction" here is one xfs_cil_ctx in xlog_cil_push.

It will take all of the lvs down from the log items queued on cil->xc_cil
under down_write of cil->xc_ctx_lock.

xfs_log_commit_cil
  -> xlog_cil_insert_items // under down_read of cil->xc_ctx_lock
    -> xlog_cil_insert_format_items

If lip->lv is not NULL, format the log vector in place. Otherwise, use the shadow lv,
and that lv will be written to the log in another transaction.
ctx
xlog_cil_push writes the lvs to the log in units of a ctx.
Each ctx has a commit record.

When the iclog holding a ctx's commit record has been written to the log,
xlog_cil_committed will be invoked.

xlog_cil_committed
  -> xfs_trans_committed_bulk
    -> xfs_log_item_batch_insert
    ---
    for (i = 0; i < nr_items; i++) {
        struct xfs_log_item *lip = log_items[i];

        lip->li_ops->iop_unpin(lip, 0);
    }
    ---

AG free space management


AG Free Space B+trees

The XFS filesystem tracks free space in an allocation group using two B+trees. One B+tree tracks space by block number, the second by the size of the free extent. This scheme allows XFS to quickly find free space near a given block or of a given size.
All block numbers, indexes, and counts are AG relative.

agf->agf_roots[] -.
                  |
                  v
           +-----------+
           |   Header  |        
           +--+--+--+--+        key:[startblock,blockcount]
           |K1|K2|K3|K4|        1:[12,16] 2:[184586,3] 3:[225579,1] 4:[511629,1]
           +--+--+--+--+        
           |P1|P2|P3|P4|        1:2 2:83347 3:6 4:4
           +--+--+--+--+        
            |   |
       .----'   '-------.  
       v                v
       2              83347
 +-----------+    +-----------+               
 |   Header  |    |   Header  |               
 +--+--+--+--+    +--+--+--+--+               
 |R0|R1|R2|R3|    |R0|R1|R2|R3|
 +--+--+--+--+    +--+--+--+--+
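
To make the two query styles concrete, here is a user-space sketch over a flat, hand-sorted record set (made-up numbers; the real structures are B+trees, but the lookup semantics match):

---
#include <stdio.h>

/* One free extent: AG-relative start block and length. */
struct free_rec { unsigned bno; unsigned len; };

/* The same records, viewed two ways (kept sorted by hand here):   */
/* bnobt order: by startblock -- "find free space near block X"    */
static struct free_rec by_bno[] = { {12,16}, {40,2}, {90,8} };
/* cntbt order: by (length, startblock) -- "find extent >= N"      */
static struct free_rec by_cnt[] = { {40,2}, {90,8}, {12,16} };

#define NRECS 3

/* cntbt-style lookup: smallest record with len >= want */
static struct free_rec *lookup_ge_size(unsigned want)
{
    for (int i = 0; i < NRECS; i++)
        if (by_cnt[i].len >= want)
            return &by_cnt[i];
    return NULL;    /* no extent big enough */
}

/* bnobt-style lookup: record whose start is closest to a target block */
static struct free_rec *lookup_near_bno(unsigned target)
{
    struct free_rec *best = &by_bno[0];
    for (int i = 1; i < NRECS; i++) {
        unsigned d_best = best->bno > target ?
                best->bno - target : target - best->bno;
        unsigned d_cur = by_bno[i].bno > target ?
                by_bno[i].bno - target : target - by_bno[i].bno;
        if (d_cur < d_best)
            best = &by_bno[i];
    }
    return best;
}

int main(void)
{
    struct free_rec *r = lookup_ge_size(5);
    printf("size >= 5   -> [%u, %u]\n", r->bno, r->len);  /* [90, 8] */
    r = lookup_near_bno(38);
    printf("near bno 38 -> [%u, %u]\n", r->bno, r->len);  /* [40, 2] */
    return 0;
}
---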

xfs_alloc_ag_vextent_size

Allocate a variable extent anywhere in the allocation group agno.
Extent's length (returned in len) will be between minlen and maxlen.

xfs_alloc_ag_vextent_size
---

     //Allocate and initialize a cursor for the by-size btree.

    cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
        args->agno, XFS_BTNUM_CNT);


    //Look for an entry >= maxlen+alignment-1 blocks.
    //The result will be the nearest one. 

    if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
            args->maxlen + args->alignment - 1, &i)))
        goto error0;

    if (!i) {
        ...
    } else {
        /*
         * Search for a non-busy extent that is large enough.
         */
        for (;;) {
            error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
            busy = xfs_alloc_compute_aligned(args, fbno, flen,
                    &rbno, &rlen, &busy_gen);

            //xfs_extent_busy_trim
            //regarding to busy extent, please refer to Busy extent


            if (rlen >= args->maxlen)
                break;


            // shift right to the next record

            error = xfs_btree_increment(cnt_cur, 0, &i);
            if (i == 0) {
                /*
                 * Our only valid extents must have been busy.
                 * Make it unbusy by forcing the log out and
                 * retrying.
                 */
                xfs_btree_del_cursor(cnt_cur,
                             XFS_BTREE_NOERROR);
                trace_xfs_alloc_size_busy(args);
                xfs_extent_busy_flush(args->mp,
                            args->pag, busy_gen);
                goto restart;
            }
        }
    }
    ...

    // Allocate and initialize a cursor for the by-block tree.

    bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
        args->agno, XFS_BTNUM_BNO);
    if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
            rbno, rlen, XFSA_FIXUP_CNT_OK)))
        goto error0;
    xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
    xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
    cnt_cur = bno_cur = NULL;
    args->len = rlen;
    args->agbno = rbno;
---

xfs_alloc_fixup_trees

Update the bnobt and cntbt, remove the allocated record from the two trees.



       fb
  |---------| freespace record we get from freespace btree
  |f0|---|f1|    result record we need

The steps to fix the cntbt and bnobt (a worked example follows below):
 a. delete fb from the cntbt
 b. insert f0 and f1 into the cntbt
 c. in the bnobt, update fb to f0 (cut fb down to f0)
 d. insert f1 into the bnobt
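
A worked example of the split arithmetic, with made-up numbers (fb is the record found in the free space btrees, [rbno, rlen] the aligned piece we actually allocate, matching their roles in xfs_alloc_fixup_trees):

---
#include <stdio.h>

int main(void)
{
    /* freespace record found in the btree: fb = [fbno, flen] */
    unsigned fbno = 100, flen = 20;
    /* the piece actually allocated out of it: [rbno, rlen] */
    unsigned rbno = 104, rlen = 8;

    /* f0 = the leftover in front of the allocated piece */
    unsigned f0bno = fbno, f0len = rbno - fbno;
    /* f1 = the leftover behind the allocated piece */
    unsigned f1bno = rbno + rlen;
    unsigned f1len = (fbno + flen) - f1bno;

    printf("fb = [%u, %u]\n", fbno, flen);   /* [100, 20] */
    printf("f0 = [%u, %u]\n", f0bno, f0len); /* [100, 4]  */
    printf("f1 = [%u, %u]\n", f1bno, f1len); /* [112, 8]  */
    return 0;
}
---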

AG free list

The AG Free List is located in the 4th sector of each AG and is known as the AGFL. It is an array of AG-relative block pointers that provides reserved space for growing the free space B+trees.


(bmap    refcount    rmap    inode)    data block
 the above 4 btrees themselves need
 blocks to construct the btree
                            |  xfs_alloc_vextent
                            v
                     bnobt     cntbt
                 the above 2 btrees' blocks
                 are allocated from the agfl
                            |  xfs_alloc_fix_freelist 
                            v
                           agfl
                     
During operations on the free space btrees (bnobt and cntbt), new free blocks may
be needed; if no free blocks are available, we can deadlock, especially when freeing
space. The agfl provides a reserve of blocks for exactly this case.

The agfl has two basic interfaces: xfs_alloc_get_freelist and xfs_alloc_put_freelist.
Is xfs_alloc_fix_freelist exclusive ?
The lock of the agf buffer ensures the exclusivity.
It is taken by
xfs_alloc_read_agf
  -> xfs_read_agf
    -> xfs_trans_read_buf
      -> xfs_trans_read_buf_map
        -> xfs_buf_read_map
          -> xfs_buf_get_map
            -> xfs_buf_find
              -> xfs_buf_lock

If the bp has already been joined to the transaction, it will not be locked
again before the transaction is committed or rolled.

This is done by,
xfs_trans_read_buf_map
  -> xfs_trans_buf_item_match

Where is it unlocked ?
Every time we allocate or free, the agf, or at least agf->agf_freeblks,
is modified. So it must be logged, and the lock is released
after the buffer has been formatted into the log at commit time.

In fact, the whole free-space allocation and free paths run under the lock of
the agf's bp.

defer agfl block frees
f8f2835a9cf300079835e1adb1d90f85033be04c (xfs: defer agfl block frees when dfops is available)


The AGFL fixup code executes before every block allocation/free and rectifies the
AGFL based on the current, dynamic allocation requirements of the fs. The AGFL must
hold a minimum number of blocks to satisfy a worst case split of the free space
btrees caused by the impending allocation operation.

Since the AGFL caches individual blocks, AGFL reduction typically involves multiple,
single block frees. We've had reports of transaction overrun problems during certain
workloads that boil down to AGFL reduction freeing multiple blocks and consuming more
space in the log than was reserved for the transaction (note, it is tic->t_unit_res).

One way to address this problem is to release surplus blocks from the AGFL immediately
but defer the free of those blocks (similar to how file-mapped blocks are unmapped from
the file in one transaction and freed via a deferred operation) until the transaction is
rolled.

Let's look at how to achieve this,

__xfs_trans_commit
  -> xfs_defer_finish_noroll
  ---
    while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
        /* log intents and pull in intake items */
        xfs_defer_create_intents(*tp);  [1]
        list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);

        // Roll the transaction.
        // The transaction will re-get a log space reservation.

        error = xfs_defer_trans_roll(tp); [2]
        ...
                                                    [3]
        dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, dfp->dfp_count);
        cleanup_fn = dfp->dfp_type->finish_cleanup;
        /* Finish the work items. */
        state = NULL;
        list_for_each_safe(li, n, &dfp->dfp_work) {
            list_del(li);
            dfp->dfp_count--;

            // Do the real work behind the intent

                                     [4]
            error = dfp->dfp_type->finish_item(*tp, li, dfp->dfp_done, &state);
            ...
        }
        ...
    }
  ---

Suppose there are 4 intent log items in this transaction and all of them are btree
updates. If all of the operations were logged in one transaction commit,
tic->t_unit_res might be used up. But when they are handled as deferred work, things are
different.

 a. create the intent logs for all of them
 b. roll the transaction
 c. create the intent-done log for the 1st one
 d. do the real work, such as updating the btree and agf
 e. roll the transaction
 ...

We can see that every time the transaction is rolled, it commits just one btree
update operation, so the log space needed is predictable.

Busy extent

Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that have been freed but whose transactions aren't committed to disk yet.


xlog_cil_committed
---
    xfs_extent_busy_sort(&ctx->busy_extents);
    xfs_extent_busy_clear(mp, &ctx->busy_extents,
                 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
---

Think of the following operation sequence:
 1. free extent E_a from F_a in transaction T_a
 2. allocate extent E_a to F_b in transaction T_b again.
 3. write to E_a of F_b.

The issue here is: if the data written to E_a reaches disk before the logs of T_a
and T_b, and power is lost at that moment, then after the crash we cannot replay the
log that frees extent E_a from F_a, yet the blocks of extent E_a have already
been corrupted.
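
XFS's answer is the busy-extent table introduced above. A toy model of the rule it enforces (hypothetical names; the real implementation is a per-AG rbtree with busy generations):

---
#include <stdio.h>
#include <stdbool.h>

/* Toy busy-extent table: extents freed in transactions whose log
 * has not reached disk yet. */
struct busy { unsigned bno, len; bool cleared; };

static struct busy busy_tab[8];
static int nbusy;

static void extent_freed(unsigned bno, unsigned len)
{
    busy_tab[nbusy].bno = bno;
    busy_tab[nbusy].len = len;
    busy_tab[nbusy].cleared = false;
    nbusy++;
}

/* xlog_cil_committed analogue: the free is now stable in the log */
static void log_committed(void)
{
    for (int i = 0; i < nbusy; i++)
        busy_tab[i].cleared = true;
}

static bool can_reuse(unsigned bno, unsigned len)
{
    for (int i = 0; i < nbusy; i++) {
        struct busy *b = &busy_tab[i];
        if (!b->cleared && bno < b->bno + b->len && b->bno < bno + len)
            return false;   /* overlaps a still-busy extent */
    }
    return true;
}

int main(void)
{
    extent_freed(500, 16);              /* T_a frees E_a             */
    printf("%d\n", can_reuse(504, 4));  /* 0: would corrupt recovery */
    log_committed();                    /* T_a's log reaches disk    */
    printf("%d\n", can_reuse(504, 4));  /* 1: safe to reallocate     */
    return 0;
}
---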
How does ext4 handle this ?
ext4_free_blocks
---

    /*
     * We need to make sure we don't reuse the freed block until after the
     * transaction is committed. We make an exception if the inode is to be
     * written in writeback mode since writeback mode has weak data
     * consistency guarantees.
     */

    if (ext4_handle_valid(handle) &&
        ((flags & EXT4_FREE_BLOCKS_METADATA) ||
         !ext4_should_writeback_data(inode))) {

        // if order or journal mode

        ...
        mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
        ext4_mb_free_metadata(handle, &e4b, new_entry);
        ---
            spin_lock(&sbi->s_md_lock);

            list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);

            sbi->s_mb_free_pending += clusters;
            spin_unlock(&sbi->s_md_lock);
        ---
    }
---

ext4_journal_commit_callback
 -> ext4_process_freed_data
   -> ext4_mb_load_buddy
     -> ext4_mb_load_buddy_gfp
       -> find_or_create_page
       -> ext4_mb_init_cache // if not Uptodate

        // struct inode *inode = sbi->s_buddy_cache;


jbd2_journal_commit_transaction
---
    if (journal->j_commit_callback)
        journal->j_commit_callback(journal, commit_transaction);
---

Details

                                                         _
xfs_allocbt_init_cursor <- xfs_alloc_ag_vextent_exact     |
                           xfs_alloc_ag_vextent_near       >  xfs_alloc_ag_vextent  <- xfs_alloc_fix_freelist [1]
                           xfs_alloc_ag_vextent_size     _|                            xfs_alloc_vextent         <- xfs_bmap_extents_to_btree
                           xfs_free_ag_extent                                                                       xfs_bmap_local_to_extents_empty
                           xfs_trim_extents                                                                         xfs_bmap_btalloc
                           xfs_getfsmap_datadev_bnobt_query                                                         xfs_bmbt_alloc_block
                                                                                                                    xfs_ialloc_ag_alloc
                                                                                                                    __xfs_inobt_alloc_block          <- xfs_inobt_alloc_block
                                                                                                                    xfs_refcountbt_alloc_block          xfs_finobt_alloc_block
                                                                                                                    
[1]  xfs_alloc_fix_freelist is used to reserve free blocks for splitting the cntbt or bnobt,
     because we may need extra free blocks to split the bnobt or cntbt even when freeing blocks;
     if we didn't reserve these blocks, we could fall into a deadlock.

Another path to allocate a block is to get it from the freelist directly

xfs_alloc_get_freelist <- xfs_alloc_ag_vextent_small
                          xfs_alloc_fix_freelist
                          xfs_allocbt_alloc_block
                          xfs_rmapbt_alloc_block
                          
The blocks in the freelist are also allocated by xfs_alloc_ag_vextent


Look at xfs_alloc_vextent,

THIS_AG
NEAR_BNO
THIS_BNO
    we are forced to allocate blocks from a single AG

START_BNO
FIRST_AG
    we could rotate through the allocation groups to look for a winner


Let's look at the allocate policy of following cases,

[1] xfs_bmap_extents_to_btree
---
    if (tp->t_firstblock == NULLFSBLOCK) {
        args.type = XFS_ALLOCTYPE_START_BNO;
        args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
    } else if (tp->t_flags & XFS_TRANS_LOWMODE) {
        args.type = XFS_ALLOCTYPE_START_BNO;
        args.fsbno = tp->t_firstblock;
    } else {
        args.type = XFS_ALLOCTYPE_NEAR_BNO;
        args.fsbno = tp->t_firstblock;
    }

inherit from the inode
---
[2] xfs_bmap_local_to_extents_empty
---
    if (tp->t_firstblock == NULLFSBLOCK) {
        args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
        args.type = XFS_ALLOCTYPE_START_BNO;
    } else {
        args.fsbno = tp->t_firstblock;
        args.type = XFS_ALLOCTYPE_NEAR_BNO;
    }
    ...
    /* Can't fail, the space was reserved. */
    ASSERT(args.fsbno != NULLFSBLOCK);

inherit from the inode
---
[3] xfs_bmap_btalloc
---
it could fallback to START_BNO and FIRST_AG
---
[4] xfs_bmbt_alloc_block
---
    if (args.fsbno == NULLFSBLOCK) {
        args.fsbno = be64_to_cpu(start->l);
        args.type = XFS_ALLOCTYPE_START_BNO;
        args.minleft = args.tp->t_blk_res;
    } else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) {
        args.type = XFS_ALLOCTYPE_START_BNO;
    } else {
        args.type = XFS_ALLOCTYPE_NEAR_BNO;
    }

The root node's block is

#define	XFS_BNO_BLOCK(mp)	((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
#define	XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))

It is not related to the inode, but to the AG
---
[5] xfs_ialloc_ag_alloc
----
Allocate new inodes in the allocation group specified by agbp.
----
[6] __xfs_inobt_alloc_block
---
    args.tp = cur->bc_tp;
    args.mp = cur->bc_mp;
    args.oinfo = XFS_RMAP_OINFO_INOBT;
    args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
    args.minlen = 1;
    args.maxlen = 1;
    args.prod = 1;
    args.type = XFS_ALLOCTYPE_NEAR_BNO;
    args.resv = resv;

    error = xfs_alloc_vextent(&args);

---
[7] xfs_refcountbt_alloc_block
---
    args.type = XFS_ALLOCTYPE_NEAR_BNO;
    args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
            xfs_refc_block(args.mp));
    args.oinfo = XFS_RMAP_OINFO_REFC;
    args.minlen = args.maxlen = args.prod = 1;
    args.resv = XFS_AG_RESV_METADATA;

    error = xfs_alloc_vextent(&args);

---


How does xfs_alloc_vextent know the AG from which the caller wants to allocate blocks ?
xfs_alloc_vextent
---
    case XFS_ALLOCTYPE_THIS_AG:
    case XFS_ALLOCTYPE_NEAR_BNO:
    case XFS_ALLOCTYPE_THIS_BNO:
        /*
         * These three force us into a single a.g.
         */
        args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
        args->pag = xfs_perag_get(mp, args->agno);
        error = xfs_alloc_fix_freelist(args, 0);
        ...
        case XFS_ALLOCTYPE_START_BNO:
        /*
         * Try near allocation first, then anywhere-in-ag after
         * the first a.g. fails.
         */
        ...
        args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
        args->type = XFS_ALLOCTYPE_NEAR_BNO;
        /* FALLTHROUGH */
    case XFS_ALLOCTYPE_FIRST_AG:
        /*
         * Rotate through the allocation groups looking for a winner.
         */
        if (type == XFS_ALLOCTYPE_FIRST_AG) {
            /*
             * Start with allocation group given by bno.
             */
            args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
            args->type = XFS_ALLOCTYPE_THIS_AG;
            sagno = 0;
            flags = 0;
        } else {
            /*
             * Start with the given allocation group.
             */
            args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
            flags = XFS_ALLOC_FLAG_TRYLOCK;
        }
        /*
         * Loop over allocation groups twice; first time with
         * trylock set, second time without.
         */
        for (;;) {
            args->pag = xfs_perag_get(mp, args->agno);
            error = xfs_alloc_fix_freelist(args, flags);
            if (error) {
                trace_xfs_alloc_vextent_nofix(args);
                goto error0;
            }
            /*

---

So the key here is args->fsbno.

But where does the initial value come from ?



How to select an AG for an inode ?

xfs_dialloc
  -> xfs_ialloc_ag_select
---

    // For a directory, spread them over all of the AGs

    if (S_ISDIR(mode))
        pagno = xfs_ialloc_next_ag(mp);
    else {

    // Otherwise, inherit the ag of its parent

        pagno = XFS_INO_TO_AGNO(mp, parent);
        if (pagno >= agcount)
            pagno = 0;
    }
---
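
A compact sketch of this policy (toy inode-number layout; xfs_ialloc_next_ag is modeled by a simple rotor):

---
#include <stdio.h>
#include <stdbool.h>

/* Toy version of xfs_ialloc_ag_select's starting-AG choice. */
static unsigned agcount = 4;
static unsigned agirotor;       /* stand-in for the mount's AG rotor */

static unsigned ino_to_agno(unsigned long long ino)
{
    /* assume 16 inodes per AG in this toy layout */
    return (unsigned)(ino / 16) % agcount;
}

static unsigned select_ag(bool is_dir, unsigned long long parent_ino)
{
    if (is_dir) {               /* spread directories over all AGs */
        unsigned ag = agirotor;
        agirotor = (agirotor + 1) % agcount;
        return ag;
    }
    return ino_to_agno(parent_ino);   /* files stay near the parent */
}

int main(void)
{
    printf("dir  -> AG %u\n", select_ag(true, 0));    /* 0 */
    printf("dir  -> AG %u\n", select_ag(true, 0));    /* 1 */
    printf("file -> AG %u\n", select_ag(false, 35));  /* parent's AG: 2 */
    return 0;
}
---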

How to select ag for a file ?

xfs_bmap_btalloc
---
    fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
                            ap->tp->t_firstblock);
    if (nullfb) {
        if (xfs_alloc_is_userdata(ap->datatype) &&
            xfs_inode_is_filestream(ap->ip)) {
            ag = xfs_filestream_lookup_ag(ap->ip);
            ag = (ag != NULLAGNUMBER) ? ag : 0;
            ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
        } else {

            //initial ag is near its inode

            ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
        }
    } else
        ap->blkno = ap->tp->t_firstblock;

            //then it will follow the existing extent of the file


    xfs_bmap_adjacent(ap);
    ...
    args.fsbno = ap->blkno;
    ...
---



How do we know whether a block allocation is for userdata or metadata ?


xfs_bmap_btalloc 
  <- xfs_bmap_alloc
    <- xfs_bmapi_allocate
      <- xfs_bmapi_convert_delalloc
          xfs_bmapi_allocate           <- xfs_attr_rmtval_set                                                                           
                                          xfs_da_grow_inode_int [M] (directory)                                                 
                                          xfs_alloc_file_space [?]                                                              
                                          xfs_dquot_disk_alloc [M]                                                              
                                          xfs_iomap_write_direct
                                          xfs_iomap_write_unwritten                                                             
                                          xfs_reflink_allocate_cow                                                              
                                          xfs_growfs_rt_alloc [M]                                                               
                                          xfs_symlink [M]
                                                                                            
xfs_bmapi_allocate
---
    if (!(bma->flags & XFS_BMAPI_METADATA)) {
        bma->datatype = XFS_ALLOC_NOBUSY;
        if (whichfork == XFS_DATA_FORK) {
            if (bma->offset == 0)
                bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
            else
                bma->datatype |= XFS_ALLOC_USERDATA;
        }
        if (bma->flags & XFS_BMAPI_ZERO)
            bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
    }
---



Inode space allocation



xfs_ialloc will return a pointer to an incore inode if the Space Manager has an
available inode on the free list. Otherwise, it will do an allocation and replenish
the freelist.
Since we can only do one allocation per transaction without deadlocks, 
we will need to commit the current transaction and start a new one.  We will then
need to call xfs_ialloc again to get the inode.

xfs_dialloc
---
	if (*IO_agbp) {
		/*
		 * If the caller passes in a pointer to the AGI buffer,
		 * continue where we left off before.  In this case, we
		 * know that the allocation group has free inodes.
		 */
		agbp = *IO_agbp;
		goto out_alloc;
	}

	/*
	 * We do not have an agbp, so select an initial allocation
	 * group for inode allocation.
	 */
	start_agno = xfs_ialloc_ag_select(tp, parent, mode);
	if (start_agno == NULLAGNUMBER) {
		*inop = NULLFSINO;
		return 0;
	}
	...
	/*
	 * Loop until we find an allocation group that either has free inodes
	 * or in which we can allocate some inodes.  Iterate through the
	 * allocation groups upward, wrapping at the end.
	 */
	agno = start_agno;
	for (;;) {
		pag = xfs_perag_get(mp, agno);
		if (!pag->pagi_inodeok) {
			xfs_ialloc_next_ag(mp);
			goto nextag;
		}

		if (!pag->pagi_init) {
			error = xfs_ialloc_pagi_init(mp, tp, agno);
			if (error)
				goto out_error;
		}

		/*
		 * Do a first racy fast path check if this AG is usable.
		 */
		if (!pag->pagi_freecount && !okalloc)
			goto nextag;
	
		/*
		 * Then read in the AGI buffer and recheck with the AGI buffer
		 * lock held.
		 */
		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
		if (error)
			goto out_error;


		if (pag->pagi_freecount) {
			xfs_perag_put(pag);
			goto out_alloc;
		}

		if (!okalloc)
			goto nextag_relse_buffer;


		error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
		...
		if (ialloced) {
			ASSERT(pag->pagi_freecount > 0);
			xfs_perag_put(pag);

			*IO_agbp = agbp;
			*inop = NULLFSINO;
			return 0;
		}

nextag_relse_buffer:
		xfs_trans_brelse(tp, agbp);
nextag:
		xfs_perag_put(pag);
		if (++agno == mp->m_sb.sb_agcount)
			agno = 0;
		if (agno == start_agno) {
			*inop = NULLFSINO;
			return noroom ? -ENOSPC : 0;
		}
	}

out_alloc:
	*IO_agbp = NULL;
	return xfs_dialloc_ag(tp, agbp, parent, inop);

---



AG free inode management


Large Number of files

Inode allocation

xfs_dialloc
---
    xfs_ialloc_ag_select //select an AG

    for (;;) {
        xfs_ialloc_read_agi(mp, tp, agno, &agbp);
        if (pag->pagi_freecount) {
            xfs_perag_put(pag);
            goto out_alloc;
        }
        xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
        if (ialloced) {

            /*
             * We successfully allocated some inodes, return
             * the current context to the caller so that it
             * can commit the current transaction and call
             * us again where we left off.
             */

            ASSERT(pag->pagi_freecount > 0);
            xfs_perag_put(pag);

            *IO_agbp = agbp;
            *inop = NULLFSINO;
            return 0;
        }
    }

out_alloc:
    *IO_agbp = NULL;
    return xfs_dialloc_ag(tp, agbp, parent, inop);
---

First, we need to allocate a chunk of inodes (64) and insert it into the inode
allocation btree. Then xfs_dialloc_ag is invoked to allocate an inode from
the btree.

As allocations and frees repeat, holes appear in the chunks.
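
A sketch of allocating from such a chunk, assuming the 64-bit free-mask idea of the inobt record (the names here are made up):

---
#include <stdio.h>
#include <stdint.h>

/* Toy inobt record: a chunk of 64 inodes with one free-bit per inode. */
struct chunk_rec {
    uint64_t startino;
    uint64_t free;      /* bit i set => startino + i is free */
};

static long alloc_from_chunk(struct chunk_rec *rec)
{
    if (!rec->free)
        return -1;                        /* chunk exhausted */
    int bit = __builtin_ctzll(rec->free); /* lowest free inode (gcc/clang) */
    rec->free &= ~(1ULL << bit);
    return (long)(rec->startino + bit);
}

int main(void)
{
    struct chunk_rec rec = { .startino = 128, .free = ~0ULL };

    printf("%ld\n", alloc_from_chunk(&rec));  /* 128 */
    printf("%ld\n", alloc_from_chunk(&rec));  /* 129 */
    /* freeing inode 128 punches a hole back into the chunk */
    rec.free |= 1ULL << 0;
    printf("%ld\n", alloc_from_chunk(&rec));  /* 128 again */
    return 0;
}
---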

Policy to select AG

The policy to select an AG:

For large files, we try to
allocate extents initially near the inode and afterwards
near the existing block in the file which is closest to
the offset in the file for which we are allocating space

That implies blocks will be allocated near the last
block in the file for sequential writers and near blocks
in the middle of the file for processes writing into
holes

Files and directories are not limited to allocating
space within a single allocation group, however

B Tree


xfs_btree_ops

xfs_btree_lookup

xfs_btree_lookup will iterate the btree and fill in the cursor's path as follows:

    /* initialise start pointer from cursor */
    cur->bc_ops->init_ptr_from_cur(cur, &ptr);
    pp = &ptr;
    for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
        error = xfs_btree_lookup_get_block(cur, level, pp, &block);
        if (diff == 0) {
            /*
             * If we already had a key match at a higher level, we
             * know we need to use the first entry in this block.
             */
            keyno = 1;
        } else {
            int    high;    /* high entry number */
            int    low;    /* low entry number */

            /* Set low and high entry numbers, 1-based. */
            low = 1;
            high = xfs_btree_get_numrecs(block);
            ...
            while (low <= high) {
                union xfs_btree_key    key;
                union xfs_btree_key    *kp;

                XFS_BTREE_STATS_INC(cur, compare);

                /* keyno is average of low and high. */
                keyno = (low + high) >> 1;

                /* Get current search key */
                kp = xfs_lookup_get_search_key(cur, level,
                        keyno, block, &key);

                /*
                 * Compute difference to get next direction:
                 *  - less than, move right
                 *  - greater than, move left
                 *  - equal, we're done
                 */
                diff = cur->bc_ops->key_diff(cur, kp);
                if (diff < 0)
                    low = keyno + 1;
                else if (diff > 0)
                    high = keyno - 1;
                else
                    break;
            }
        }

        Except for the diff == 0 case, the search loop exits because low > high.

        In the last iteration, low == high == keyno, and then
        if diff < 0, low = keyno + 1 (exit with keyno == high),
        if diff > 0, high = keyno - 1 (exit with keyno == low).

        So on exit low == high + 1, and keyno is the last entry probed.

        Take the bnobt as example, the final result of loop above is:
        
        KEY   low     keyno   high
        PTR   ptr_l   ptr_k   ptr_h
        BNO   bno_l   bno_k   bno_h    (start block number of the range)
                   ^        ^
                  [1]      [2]

        diff = bno_k - bno_want

        if diff > 0, bno_want is at [1], it is included in ptr_l, res = keyno - 1
        if diff < 0, bno_want is at [2], it is included in ptr_k, res = keyno

        if (level > 0) {
            if (diff > 0 && --keyno < 1)
                keyno = 1;
            ...
            cur->bc_ptrs[level] = keyno;
        }
    }

    For an intermediate node, it must include the key we want.
    But for a leaf node, things are different.

    low      keyno      high
         ^           ^
         '           '
      diff > 0    diff < 0

    LE    diff > 0, res = keyno - 1
          diff < 0, res = keyno
    
    EQ    diff > 0, res = keyno
          diff < 0, res = keyno + 1
          (EQ and GE return same records, but the stat is different)
    GE    diff > 0, res = keyno
          diff < 0, res = keyno + 1
          In case keyno + 1 exceeds the current node,
          
                    [K1][K2][K3][K4]
                    [P1][P2][P3][P4]
                         |   |
                     .---'   '----------.
                     v                  v
              [R1][R2][R3][R4]   [R1][R2][R3][R4]
                           ^      
                           keyno
          we need to invoke xfs_btree_increment to shift the path to the right sibling leaf
    

    if (dir != XFS_LOOKUP_LE && diff < 0) {
        keyno++;
        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
        if (dir == XFS_LOOKUP_GE &&
            keyno > xfs_btree_get_numrecs(block) &&
            !xfs_btree_ptr_is_null(cur, &ptr)) {
            int    i;

            cur->bc_ptrs[0] = keyno;
            error = xfs_btree_increment(cur, 0, &i);
            ...
            *stat = 1;
            return 0;
        }
    } else if (dir == XFS_LOOKUP_LE && diff > 0)
        keyno--;
    cur->bc_ptrs[0] = keyno;

    /* Return if we succeeded or not. */
    if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
        *stat = 0;
    else if (dir != XFS_LOOKUP_EQ || diff == 0)
        *stat = 1;
    else
        *stat = 0;
    return 0;
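
The LE/EQ/GE semantics can be condensed into a standalone binary search over a sorted array (0-based here, unlike the kernel's 1-based keyno; diff keeps the key_diff() sign convention, key minus wanted value):

---
#include <stdio.h>

enum dir { LOOKUP_LE, LOOKUP_EQ, LOOKUP_GE };

/* Returns the index of the match per `dir`, or -1 if none. */
static int lookup(const int *keys, int n, int want, enum dir dir)
{
    int low = 0, high = n - 1, keyno = 0;
    long diff = 1;

    while (low <= high) {
        keyno = (low + high) / 2;
        diff = (long)keys[keyno] - want;   /* key_diff() convention */
        if (diff < 0)
            low = keyno + 1;
        else if (diff > 0)
            high = keyno - 1;
        else
            break;
    }
    if (diff == 0)
        return keyno;
    if (dir == LOOKUP_LE) {
        keyno = (diff > 0) ? keyno - 1 : keyno;
        return keyno >= 0 ? keyno : -1;
    }
    if (dir == LOOKUP_GE) {
        keyno = (diff > 0) ? keyno : keyno + 1;
        return keyno < n ? keyno : -1;   /* the kernel shifts to the
                                            right sibling here */
    }
    return -1;                           /* LOOKUP_EQ, no exact match */
}

int main(void)
{
    int keys[] = { 10, 20, 30, 40 };

    printf("LE 25 -> idx %d\n", lookup(keys, 4, 25, LOOKUP_LE)); /* 1  */
    printf("GE 25 -> idx %d\n", lookup(keys, 4, 25, LOOKUP_GE)); /* 2  */
    printf("EQ 25 -> idx %d\n", lookup(keys, 4, 25, LOOKUP_EQ)); /* -1 */
    return 0;
}
---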

reflink

To support the sharing of file data blocks (reflink), there are two new components: the per-AG reference count (refcount) btree, and a copy-on-write (COW) extent fork in the in-core inode.

work process

Write to a reflinked range (page cache)

xfs_file_iomap_begin_delay
  -> xfs_iext_lookup_extent // XFS_DATA_FORK
  ---

    // We only support COW on real extent.
    // Before this we have checked the got.br_startoff <= offset_fsb
    // The 'got' is from the in-core data extent list fork.

    if (xfs_is_reflink_inode(ip) && ((flags & IOMAP_WRITE) ||
         got.br_state != XFS_EXT_UNWRITTEN)) {

        // We may get an extent bigger than what we want.
        // We must trim it because we may do COW on that range.

        xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
        error = xfs_reflink_reserve_cow(ip, &got);
          -> xfs_iext_lookup_extent //XFS_COW_FORK

                // Is it already in the COW in-core extent list ?

          if (!eof && got.br_startoff <= imap->br_startoff) {
              xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
              return 0;
          }

          //Trim the mapping to the nearest shared extent boundary.

          -> xfs_reflink_trim_around_shared
            -> xfs_reflink_find_shared

          ->  xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, 
                  imap->br_blockcount, 0, &got, &icur, eof);

          // The process looks similar to normal delayed allocation.
          // - Insert the written shared extent into COW in-core extent fork.
                ---
                got->br_startoff = aoff;
                got->br_startblock = nullstartblock(indlen);
                got->br_blockcount = alen;
                got->br_state = XFS_EXT_NORM;

                xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
                ---
          // - And also reserve space for it with both xfs_mod_fdblocks and quota.

    }
  ---
Writeback to disk
xfs_map_blocks
---
    if (xfs_inode_has_cow_data(ip) &&
        xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
        cow_fsb = imap.br_startoff;
    if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
        wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        ...
        whichfork = XFS_COW_FORK;
        wpc->io_type = XFS_IO_COW;
        goto allocate_blocks;
        ...
allocate_blocks:
    error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
            &wpc->cow_seq);

xfs_iomap_write_allocate
  -> xfs_bmapi_write
    -> xfs_bmapi_allocate
---
    error = xfs_bmap_alloc(bma); //Allocate blocks
    ...

    // For the COW in-core extents list, there is no XFS_IFBROOT on it.
    // So finally, bma->cur will be NULL for XFS_BMAPI_COWFORK and
    // xfs_bmap_add_extent_hole_real will not update on-disk bmbtree 
    // due to a NULL bma->cur.

    if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur)
        bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
    ..
    bma->got.br_startoff = bma->offset;
    bma->got.br_startblock = bma->blkno;
    bma->got.br_blockcount = bma->length;
    bma->got.br_state = XFS_EXT_NORM;
    ...
    if (bma->wasdel)
        error = xfs_bmap_add_extent_delay_real(bma, whichfork);
    else
        error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
                whichfork, &bma->icur, &bma->cur, &bma->got,
                &bma->logflags, bma->flags);
---
Write complete
xfs_reflink_end_cow
---
    if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
        goto out_cancel;

    /* Walk backwards until we're out of the I/O range... */
    while (got.br_startoff + got.br_blockcount > offset_fsb) {
        del = got;
        xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);

        /* Extent delete may have bumped ext forward */
        if (!del.br_blockcount)
            goto prev_extent;

        if (!xfs_bmap_is_real_extent(&got))
            goto prev_extent;

        // Step 1 Unmap the old blocks in the data fork.

        rlen = del.br_blockcount;
        error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
        ...

        /* Trim the extent to whatever got unmapped. */

        if (rlen) {
            xfs_trim_extent(&del, del.br_startoff + rlen,
                del.br_blockcount - rlen);
        }
        ...

        // Step 2. Map the new blocks into the data fork.

        error = xfs_bmap_map_extent(tp, ip, &del);
        ...

        // Step 3. Remove the mapping from the CoW fork.

        xfs_bmap_del_extent_cow(ip, &icur, &got, &del);

        error = xfs_defer_finish(&tp);
        ...
        if (!xfs_iext_get_extent(ifp, &icur, &got))
            break;
        continue;
prev_extent:
        if (!xfs_iext_prev_extent(ifp, &icur, &got))
            break;
    }
---

orphan record

Let's look at the following scenario: when writing the data of a range of shared blocks out to disk, we need to do COW, but what if a poweroff occurs in the middle ?
For example,

writeback
    allocate blocks (update agf, agfl, bnobt, cntbt ...)
    update in-core COW extent fork
                                    
    write data out            <-.  /\  /\  .-- POWEROFF!!!
                                 \/  \/  \/
    complete,
      unmap old blocks from data fork
      map new blocks into data fork
      remove the extent from in-core COW fork.
Then the allocated extents will be leaked. How to handle this ?
xfs_bmapi_write
---
            error = xfs_bmapi_allocate(&bma);
            ...

            /*
             * If this is a CoW allocation, record the data in
             * the refcount btree for orphan recovery.
             */

            if (whichfork == XFS_COW_FORK) {
                error = xfs_refcount_alloc_cow_extent(tp,
                        bma.blkno, bma.length);
            }
---

Extents that are being used to stage a copy on write are stored
in the refcount btree with a refcount of 1 and the upper bit set
on the startblock.  This speeds up mount time deletion of stale
staging extents because they're all at the right side of the tree.

xfs_refcount_finish_one
  -> __xfs_refcount_cow_alloc
    -> xfs_refcount_adjust_cow
---

    agbno += XFS_REFC_COW_START;

---
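
A sketch of why that works, based only on the "upper bit set on the startblock" rule quoted above (XFS_REFC_COW_START modeled as the high bit of a 32-bit agbno):

---
#include <stdio.h>
#include <stdint.h>

/* Staging (CoW) extents live in the same refcount btree but with the
 * upper bit of the AG block number set, so every staging key sorts
 * after every regular shared-extent key. */
#define REFC_COW_START  ((uint32_t)1 << 31)

static uint32_t refc_key(uint32_t agbno, int is_cow_staging)
{
    return is_cow_staging ? agbno + REFC_COW_START : agbno;
}

int main(void)
{
    uint32_t shared  = refc_key(123456, 0);
    uint32_t staging = refc_key(42, 1);

    /* any staging key compares greater than any regular key ...   */
    printf("%u < %u : %d\n", shared, staging, shared < staging);
    /* ... so mount-time cleanup only has to walk the right edge of
     * the tree to find leftover CoW reservations. */
    return 0;
}
---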

Find and remove leftover CoW reservations.

xfs_mountfs
  -> xfs_reflink_recover_cow
    -> xfs_refcount_recover_cow_leftovers
Then we could get,
writeback
    allocate blocks (update agf, agfl, bnobt, cntbt ...)
    update in-core COW extent fork
    record COW allocation in refcount tree (deferred operation)
                                    
    write data out            <-.  /\  /\  .-- POWEROFF!!!
                                 \/  \/  \/
    complete,
      unmap old blocks from data fork
      map new blocks into data fork
      remove the extent from in-core COW fork.


Even though it is a deferred operation, the CUI log is in the same
transaction as the block allocation logs.
If a poweroff comes before the CUD and the
real refcount operations, the CUI can be replayed.

speculative preallocation on COW

Darrick Wong points out that

There are two forks in play here
 - the data fork that's written to disk
 - the CoW fork that only exists in memory
We don't coordinate extent overlap between the two forks; indeed, we use
this lack of overlap-coordination as part of speculative preallocation
to defragment shared files.
Let's first look at where the speculative preallocation happens. When we want to allocate this much:
cowextsz = 10

   012345678901234567890123456789
D: SSSSSSSSSSSSRRRRRRRRRRRRRRRRRR
C: -----AAAAAAA------------------
   ^         ^         ^
we actually allocate this much:
cowextsz = 10

   012345678901234567890123456789
D: SSSSSSSSSSSSRRRRRRRRRRRRRRRRRR
C: AAAAAAAAAAAAAAAAAAAA----------
   ^         ^         ^
When the IO on blocks 5-13 completes, xfs_reflink_end_cow remaps the blocks as:
   012345678901234567890123456789
D: SSSSSAAAAAAAAARRRRRRRRRRRRRRRR
C: UUUUU---------UUUUUU----------
   ^         ^         ^
If we then issue a write IO on block 16, the COW fork will be used.
xfs_file_iomap_begin_delay
---
    eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
    if (eof)
        got.br_startoff = end_fsb; /* fake hole until the end */

    if (got.br_startoff <= offset_fsb) {
        if (xfs_is_reflink_inode(ip) &&
            ((flags & IOMAP_WRITE) ||
             got.br_state != XFS_EXT_UNWRITTEN)) {
            xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
            error = xfs_reflink_reserve_cow(ip, &got);

            // Try to find the got->br_startoff in COW fork, if there,
            // return and use it directly

        }
        goto done;
    }

---
It'll convert block 16 in the COW fork before issuing the write IO:
   012345678901234567890123456789
D: SSSSSAAAAAAAAARRRRRRRRRRARRRRR
C: UUUUU---------UUAUUU----------
   ^         ^         ^
and then remap it when it's done:
   012345678901234567890123456789
D: SSSSSAAAAAAAAARRARRRRRRRARRRRR
C: UUUUU---------UU-UUU----------
   ^         ^         ^
Therefore, in the normal case, the whole range will be remapped into contiguous blocks. This is the defragmentation.

COW Unwritten

The preallocated extents are first set to the UNWRITTEN state.

xfs_bmapi_allocate
---
    error = xfs_bmap_alloc(bma);
    ...
    bma->got.br_startoff = bma->offset;
    bma->got.br_startblock = bma->blkno;
    bma->got.br_blockcount = bma->length;
    bma->got.br_state = XFS_EXT_NORM;
    if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
        (bma->flags & XFS_BMAPI_PREALLOC))
        bma->got.br_state = XFS_EXT_UNWRITTEN;

    if (bma->wasdel)
        error = xfs_bmap_add_extent_delay_real(bma, whichfork);
    else
        error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
                whichfork, &bma->icur, &bma->cur, &bma->got,
                &bma->logflags, bma->flags);
---
Why do we need this ?
Consider a write IO on a range:
cowextsz 10

   012345678901234567890123456789
D: RRRRRRRRRRRRRRRSSSSSSSSSSSSSSS
C: ------------------------------
   ^         ^         ^

direct write on blocks 10 ~ 19:
Two IOs will be generated. Because part of the range is a COW mapping, dio->flags gains IOMAP_DIO_COW.
iomap_dio_bio_actor
---
    if (iomap->flags & IOMAP_F_SHARED)
        dio->flags |= IOMAP_DIO_COW
---
When IO completes,
xfs_dio_write_end_io
  -> xfs_reflink_end_cow //IOMAP_DIO_COW

At that moment, we will try to remap all of the blocks covered by this IO
from the COW fork into the data fork.

However, blocks 10 ~ 14 are newly allocated and not yet written.
If power is lost, we would expose stale data.
To avoid this, the preallocated blocks are first put into the Unwritten state.
cowextsz 10

   012345678901234567890123456789
D: RRRRRRRRRRRRRRRSSSSSSSSSSSSSSS
C: ----------UUUUUUUUUU----------
   ^         ^         ^
And only the blocks actually being COWed are converted to the normal state.
xfs_file_iomap_begin
  -> xfs_reflink_allocate_cow
    -> xfs_reflink_convert_cow_extent

cowextsz 10

   012345678901234567890123456789
D: RRRRRRRRRRRRRRRSSSSSSSSSSSSSSS
C: ----------UUUUUAAAAA----------
   ^         ^         ^
Then only the blocks that are not Unwritten are remapped from the COW fork into the data fork.
xfs_reflink_end_cow
---

        /*
         * Only remap real extent that contain data.  With AIO
         * speculatively preallocations can leak into the range we
         * are called upon, and we need to skip them.
         */

        if (!xfs_bmap_is_real_extent(&got))
            goto prev_extent;
---
Here is the test code and the resulting trace log.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/stat.h>


int main()
{
    int fd_in, fd_out, len, ret;
    unsigned long long off_in, off_out;
    char *buf;
    int i;

    buf = aligned_alloc(4096, 4096);
    if (!buf) {
        printf("allocate buffer failed %d\n", errno);
        return -1;
    }

    fd_out = open("./cfr_out", O_DIRECT | O_RDWR | O_CREAT, 0644);
    if (fd_out < 0) {
        printf("open cfr_out failed %d\n", errno);
        return -1;
    }

    /*
     * Set file cfr_out to 0xa, 0xb, 0xc, 0xd
     */
    memset(buf, 0xa, 4096);
    for (i=0; i<2048; i++) {
        ret = write(fd_out, buf, 4096);
    }

    memset(buf, 0xb, 4096);
    for (i=0; i<2048; i++) {
        ret = write(fd_out, buf, 4096);
    }

    memset(buf, 0xc, 4096);
    for (i=0; i<2048; i++) {
        ret = write(fd_out, buf, 4096);
    }

    memset(buf, 0xd, 4096);
    for (i=0; i<2048; i++) {
        ret = write(fd_out, buf, 4096);
    }

    fd_in = open("./cfr_in", O_DIRECT | O_RDWR | O_CREAT, 0644);
    if (fd_in < 0) {
        printf("open cfr_in failed %d\n", errno);
        return -1;
    }

    /*
     * Set file cfr_dst to 0
     */
    memset(buf, 0, 4096);
    for (i=0; i<2048; i++) {
        write(fd_in, buf, 4096);
    }

    free(buf);
    /*
     * cowextsz 0x20000
     * 
     * cfr_in                    |------|              0x800000
     *
     * cfr_out |-----------------|------------------|  0x2000000
     *                           ^
     *                           |
     *                       0x1010000
     */
    off_out = 0x1010000;
    off_in = 0;
    len = 0x100000; //1M
#if 1
    /* 326 is __NR_copy_file_range on x86_64 */
    ret = syscall(326, fd_in, &off_in, fd_out, &off_out, len, 0);
    if (ret < 0) {
        printf("copy_file_range failed %d\n", errno);
        return ret;
    }
#endif
    buf = aligned_alloc(4096, 0x20000);
    if (!buf) {
        printf("allocate 1M buffer failed %d\n", errno);
        return -1;
    }
    /*
     *                 0x1010000 0x1110000
     *                     |   .---'
     *                     v   v
     * cfr_out  |----------|sss|-----|    32M
     *                   |   |  
     *                    IO
     *                   0x20000 (128K)
     */
    memset(buf, 0xf, 0x20000);
    /*
      * Write to 0x1000000
     */
    ret = pwrite(fd_out, buf, 0x20000, 0x1000000);
    if (ret != 0x20000)
        printf("pwrite failed %d\n", errno);

    free(buf);
    close(fd_in);
    close(fd_out);
    return 0;
}
gcc -D_GNU_SOURCE  copy_file_range.c -o cfr

root@will-ThinkCentre-M93p:/home/will/Desktop/xfs/mount# hexdump -C cfr_out 
00000000  0a 0a 0a 0a 0a 0a 0a 0a  0a 0a 0a 0a 0a 0a 0a 0a  |................|
*
00800000  0b 0b 0b 0b 0b 0b 0b 0b  0b 0b 0b 0b 0b 0b 0b 0b  |................|
*
01000000  0f 0f 0f 0f 0f 0f 0f 0f  0f 0f 0f 0f 0f 0f 0f 0f  |................|
*
01020000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
01110000  0c 0c 0c 0c 0c 0c 0c 0c  0c 0c 0c 0c 0c 0c 0c 0c  |................|
*
01800000  0d 0d 0d 0d 0d 0d 0d 0d  0d 0d 0d 0d 0d 0d 0d 0d  |................|
*
02000000

cfr-1755  [001] ....  3442.372122: xfs_reflink_allocate_cow: 1 o 1000 b 1013 l 10 s 0
cfr-1755  [001] ....  3442.372147: xfs_reflink_allocate_cow: 2 o 1000 b 1013 l 10 s 0 f 0 sh 0
cfr-1755  [001] ....  3442.372157: xfs_reflink_allocate_cow: 1 o 1010 b 2013 l 10 s 0
cfr-1755  [001] ....  3442.372160: xfs_reflink_allocate_cow: 2 o 1010 b 2013 l 10 s 0 f 0 sh 1
cfr-1755  [001] ....  3442.372161: xfs_reflink_allocate_cow: cowextsz 20
cfr-1755  [001] ....  3442.372166: xfs_reflink_allocate_cow: 3 o 1010 b 2013 l 10 s 0
cfr-1755  [001] ....  3442.372177: xfs_bmapi_write: o 1000 b 1023 l 20
cfr-1755  [001] ....  3442.372178: xfs_reflink_allocate_cow: 4 o 1010 b 1033 l 10 s 1

o - offset
b - block
l - length
f - found
sh- shared

Talking


concurrent read write

https://www.spinics.net/lists/linux-xfs/msg26459.html

xfs_file_buffered_aio_read() is the only call site I know of
that calls generic_file_read_iter() with the i_rwsem read side held.

xfs_file_buffered_aio_read
---
    if (iocb->ki_flags & IOCB_NOWAIT) {
        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
            return -EAGAIN;
    } else {
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
    }
    ret = generic_file_read_iter(iocb, to);
    xfs_iunlock(ip, XFS_IOLOCK_SHARED);

    return ret;
---

xfs_file_buffered_aio_write
---
write_retry:
    iolock = XFS_IOLOCK_EXCL;
    xfs_ilock(ip, iolock);

    ret = xfs_file_aio_write_checks(iocb, from, &iolock);
    if (ret)
        goto out;

    /* We can write back this queue in page reclaim */
    current->backing_dev_info = inode_to_bdi(inode);

    ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
    if (likely(ret >= 0))
        iocb->ki_pos += ret;

    ...
    current->backing_dev_info = NULL;
out:
    if (iolock)
        xfs_iunlock(ip, iolock);

    if (ret > 0) {
        XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
        /* Handle various SYNC-type writes */
        ret = generic_write_sync(iocb, ret);
    }
    return ret;
---

This lock is killing performance of multi-threaded buffered
read/write mixed workload on the same file [1].

My question is, is the purpose of this lock syncing
dio/buffered io?

Answer:
That's one part of it. The other is POSIX atomic write semantics.
https://pubs.opengroup.org/onlinepubs/009695399/functions/read.html

"I/O is intended to be atomic to ordinary files and pipes and FIFOs.
Atomic means that all the bytes from a single operation that started
out together end up together, without interleaving from other I/O
operations."

i.e. that independent read()s should see a write() as a single
atomic change. hence if you do a read() concurrently with a write(),
the read should either run to completion before the write, or the
write run to completion before the read().

XFS is the only linux filesystem that provides this behaviour.
(Databases like this behaviour very much....)

ext4 and other filesystems that have no buffered read side locking
provide serialisation at page level, which means an 8kB read racing
with an 8kB write can return 4k from the new write and 4k from the
yet-to-be-written part of the file. IOWs, the buffered read gets
torn.


using AIO+DIO on XFS will allow
both reads and writes to be issued concurrently and not block the
main thread of execution.
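
A user-space experiment for the torn-read behaviour described above (a hypothetical test program, not from the thread; on XFS the mismatch should never trigger, while page-level serialisation can let it trigger):

---
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <pthread.h>

#define SZ 8192   /* two pages: a torn read mixes their content */
static int fd;
static volatile int stop;

static void *writer(void *arg)
{
    char buf[SZ];
    (void)arg;
    for (char c = 'a'; !stop; c = (c == 'z') ? 'a' : c + 1) {
        memset(buf, c, SZ);
        if (pwrite(fd, buf, SZ, 0) != SZ)   /* one whole-range write() */
            break;
    }
    return NULL;
}

int main(void)
{
    char buf[SZ];
    pthread_t th;

    fd = open("torn_test", O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd < 0)
        return 1;
    memset(buf, 'a', SZ);
    pwrite(fd, buf, SZ, 0);

    pthread_create(&th, NULL, writer, NULL);
    for (int i = 0; i < 1000000; i++) {
        if (pread(fd, buf, SZ, 0) != SZ)
            break;
        if (buf[0] != buf[SZ - 1]) {   /* halves from different writes */
            printf("torn read: %c vs %c\n", buf[0], buf[SZ - 1]);
            break;
        }
    }
    stop = 1;
    pthread_join(th, NULL);
    close(fd);
    return 0;
}
---
gcc -O2 -pthread torn_read.c -o torn_read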

stripe alignment

Two important variables,
xfs_mount_t.m_dalign  (stripe unit)
            m_swidth  (stripe width)

How do these two work ?

xfs_bmap_btalloc
---
	/* stripe alignment for allocation is determined by mount parameters */
	stripe_align = 0;
	if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
		stripe_align = mp->m_swidth;
	else if (mp->m_dalign)
		stripe_align = mp->m_dalign;
	...

	/*
	 * If we are not low on available data blocks, and the
	 * underlying logical volume manager is a stripe, and
	 * the file offset is zero then try to allocate data
	 * blocks on stripe unit boundary.
	 * NOTE: ap->aeof is only set if the allocation length
	 * is >= the stripe unit and the allocation offset is
	 * at the end of file.
	 */

	if (!ap->dfops->dop_low && ap->aeof) {
		if (!ap->offset) {
			args.alignment = stripe_align;
			atype = args.type;
			isaligned = 1;
			/*
			 * Adjust for alignment
			 */
			if (blen > args.alignment && blen <= args.maxlen)
				args.minlen = blen - args.alignment;
			args.minalignslop = 0;
		} else {
			/*
			 * First try an exact bno allocation.
			 * If it fails then do a near or start bno
			 * allocation with alignment turned on.
			 */
			atype = args.type;
			tryagain = 1;
			args.type = XFS_ALLOCTYPE_THIS_BNO;
			args.alignment = 1;
			...
		}
	} else {
		args.alignment = 1;
		args.minalignslop = 0;
	}
	args.minleft = ap->minleft;
	args.wasdel = ap->wasdel;
	args.resv = XFS_AG_RESV_NONE;
	args.datatype = ap->datatype;
	if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
		args.ip = ap->ip;

	error = xfs_alloc_vextent(&args);
	if (error)
		return error;

	if (tryagain && args.fsbno == NULLFSBLOCK) {
		/*
		 * Exact allocation failed. Now try with alignment
		 * turned on.
		 */
		args.type = atype;
		args.fsbno = ap->blkno;
		args.alignment = stripe_align;
		args.minlen = nextminlen;
		args.minalignslop = 0;
		isaligned = 1;
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	}
	if (isaligned && args.fsbno == NULLFSBLOCK) {
		/*
		 * allocation failed, so turn off alignment and
		 * try again.
		 */
		args.type = atype;
		args.fsbno = ap->blkno;
		args.alignment = 0;
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	}
---


xfs_ialloc_ag_alloc
---
if (unlikely(args.fsbno == NULLFSBLOCK)) {
		/*
		 * Set the alignment for the allocation.
		 * If stripe alignment is turned on then align at stripe unit
		 * boundary.
		 * If the cluster size is smaller than a filesystem block
		 * then we're doing I/O for inodes in filesystem block size
		 * pieces, so don't need alignment anyway.
		 */
		isaligned = 0;
		if (args.mp->m_sinoalign) {
			ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
			args.alignment = args.mp->m_dalign;
			isaligned = 1;
		} else
			args.alignment = xfs_ialloc_cluster_alignment(args.mp);
		/*
		 * Need to figure out where to allocate the inode blocks.
		 * Ideally they should be spaced out through the a.g.
		 * For now, just allocate blocks up front.
		 */
		args.agbno = be32_to_cpu(agi->agi_root);
		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
		/*
		 * Allocate a fixed-size extent of inodes.
		 */
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
		args.prod = 1;
		/*
		 * Allow space for the inode btree to split.
		 */
		args.minleft = args.mp->m_in_maxlevels - 1;
		if ((error = xfs_alloc_vextent(&args)))
			return error;
	}
---

(delay allocation / extent reserve)
xfs_eof_alignment <- xfs_iomap_eof_align_last_fsb <- xfs_iomap_write_direct
                  <- xfs_file_iomap_begin_delay


xfs_preferred_iosize(xfs_mount_t *mp)
{
	if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
		return PAGE_SIZE;
	return (mp->m_swidth ?
		(mp->m_swidth << mp->m_sb.sb_blocklog) :
		((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
			(1 << (int)max(mp->m_readio_log, mp->m_writeio_log)) :
			PAGE_SIZE));
}

xfs_vn_getattr
---
	switch (inode->i_mode & S_IFMT) {
	case S_IFBLK:
	case S_IFCHR:
		stat->blksize = BLKDEV_IOSIZE;
		stat->rdev = inode->i_rdev;
		break;
	default:
		if (XFS_IS_REALTIME_INODE(ip)) {
			/*
			 * If the file blocks are being allocated from a
			 * realtime volume, then return the inode's realtime
			 * extent size or the realtime volume's extent size.
			 */
			stat->blksize =
				xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
		} else
			stat->blksize = xfs_preferred_iosize(mp);
		stat->rdev = 0;
		break;
	}

---


space reservation

xfs_mount->m_fdblocks is used to account free blocks.

xfs_mod_fdblocks operates on it:
---
	/*
	 * Taking blocks away, need to be more accurate the closer we
	 * are to zero.
	 *
	 * If the counter has a value of less than 2 * max batch size,
	 * then make everything serialise as we are real close to
	 * ENOSPC.
	 */
	if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
				     XFS_FDBLOCKS_BATCH) < 0)
		batch = 1;
	else
		batch = XFS_FDBLOCKS_BATCH;

	percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
	if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
				     XFS_FDBLOCKS_BATCH) >= 0) {
		/* we had space! */
		return 0;
	}

	/*
	 * lock up the sb for dipping into reserves before releasing the space
	 * that took us to ENOSPC.
	 */
	spin_lock(&mp->m_sb_lock);
	percpu_counter_add(&mp->m_fdblocks, -delta);
	if (!rsvd)
		goto fdblocks_enospc;

	lcounter = (long long)mp->m_resblks_avail + delta;
	if (lcounter >= 0) {
		mp->m_resblks_avail = lcounter;
		spin_unlock(&mp->m_sb_lock);
		return 0;
	}
	printk_once(KERN_WARNING
		"Filesystem \"%s\": reserve blocks depleted! "
		"Consider increasing reserve pool size.",
		mp->m_fsname);
fdblocks_enospc:
	spin_unlock(&mp->m_sb_lock);
	return -ENOSPC;
---
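
The batch logic is easier to see in a toy model of a per-cpu counter (hypothetical structure; per-cpu deltas fold into the shared count only when they exceed the batch, which is why the code above drops to batch = 1 near ENOSPC):

---
#include <stdio.h>
#include <stdlib.h>

/* Minimal model of percpu_counter_add_batch(): per-cpu deltas are
 * folded into the shared count only when they reach the batch. */
#define NR_CPUS 4

struct pcp_counter {
    long count;             /* shared, "expensive" to touch */
    long pcp[NR_CPUS];      /* cheap per-cpu scratch deltas */
};

static void add_batch(struct pcp_counter *c, int cpu, long delta, long batch)
{
    c->pcp[cpu] += delta;
    if (labs(c->pcp[cpu]) >= batch) {   /* fold into the shared count */
        c->count += c->pcp[cpu];
        c->pcp[cpu] = 0;
    }
}

static long sum(struct pcp_counter *c)  /* percpu_counter_sum() analogue */
{
    long s = c->count;
    for (int i = 0; i < NR_CPUS; i++)
        s += c->pcp[i];
    return s;
}

int main(void)
{
    struct pcp_counter fdblocks = { .count = 1000 };

    add_batch(&fdblocks, 0, -10, 32);   /* stays in the per-cpu slot */
    add_batch(&fdblocks, 1, -10, 32);
    printf("count=%ld sum=%ld\n", fdblocks.count, sum(&fdblocks));
    /* count=1000 (stale by 20), sum=980 (exact) */

    /* near ENOSPC the batch becomes 1, so every change hits the
     * shared count and it is exact before declaring -ENOSPC */
    add_batch(&fdblocks, 2, -5, 1);
    printf("count=%ld sum=%ld\n", fdblocks.count, sum(&fdblocks));
    return 0;
}
---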


statfs can retrieve this information, which is what df -h shows:
---
	icount = percpu_counter_sum(&mp->m_icount);
	ifree = percpu_counter_sum(&mp->m_ifree);
	fdblocks = percpu_counter_sum(&mp->m_fdblocks);

	spin_lock(&mp->m_sb_lock);
	statp->f_bsize = sbp->sb_blocksize;
	lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
	statp->f_blocks = sbp->sb_dblocks - lsize;
	spin_unlock(&mp->m_sb_lock);

	statp->f_bfree = fdblocks - mp->m_alloc_set_aside;

---



grow fs


xfs_growfs mountpoint -D size

NOTE: While an xfs filesystem can be grown while mounted, its size cannot be reduced.

xfs grow fs

------------------------
xfs_file_ioctl
---
	case XFS_IOC_FSGROWFSDATA: {
		xfs_growfs_data_t in;

		if (copy_from_user(&in, arg, sizeof(in)))
			return -EFAULT;

		error = mnt_want_write_file(filp);
		if (error)
			return error;
		error = xfs_growfs_data(mp, &in);
		mnt_drop_write_file(filp);
		return error;
	}
---

Let's look at the core part of growfs.

xfs_growfs_data_private
---

	// new size of the fs

	nb = in->newblocks;
	if (nb < mp->m_sb.sb_dblocks)
		return -EINVAL;
	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
		return error;


	// try to read in the nb - 1 block to verify the length is accessible

	error = xfs_buf_read_uncached(mp->m_ddev_targp,
				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
	if (error)
		return error;
	xfs_buf_relse(bp);

	new = nb;	/* use new as a temporary here */
	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
	nagcount = new + (nb_mod != 0);
	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
		nagcount--;
		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
		if (nb < mp->m_sb.sb_dblocks)
			return -EINVAL;
	}
	new = nb - mp->m_sb.sb_dblocks;
	oagcount = mp->m_sb.sb_agcount;


	/* allocate the new per-ag structures */

	if (nagcount > oagcount) {
		error = xfs_initialize_perag(mp, nagcount, &nagimax);
		if (error)
			return error;
	}

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	/*
	 * Write new AG headers to disk. Non-transactional, but need to be
	                                 ^^^^^^^^^^^^^^^^^
	 * written and completed prior to the growfs transaction being logged.
	 * To do this, we use a delayed write buffer list and wait for
	 * submission and IO completion of the list as a whole. This allows the
	 * IO subsystem to merge all the AG headers in a single AG into a single
	 * IO and hide most of the latency of the IO from us.
	 *
	 * This also means that if we get an error whilst building the buffer
	 * list to write, we can cancel the entire list without having written
	 * anything.
	 */

	INIT_LIST_HEAD(&id.buffer_list);
	for (id.agno = nagcount - 1;
	     id.agno >= oagcount;
	     id.agno--, new -= id.agsize) {

		if (id.agno == nagcount - 1)
			id.agsize = nb -
				(id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
		else
			id.agsize = mp->m_sb.sb_agblocks;

		error = xfs_ag_init_headers(mp, &id);
		if (error) {
			xfs_buf_delwri_cancel(&id.buffer_list);
			goto out_trans_cancel;
		}
	}
	error = xfs_buf_delwri_submit(&id.buffer_list);
	if (error)
		goto out_trans_cancel;

	xfs_trans_agblocks_delta(tp, id.nfree);


	/* If there are new blocks in the old last AG, extend it. */
	// at this point id.agno is the old last AG and 'new' holds the
	// leftover blocks that belong to it

	if (new) {
		error = xfs_ag_extend_space(mp, tp, &id, new);
		---


		/*
		 * Change the agi length.
		 */

		error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);

		agi = XFS_BUF_TO_AGI(bp);
		be32_add_cpu(&agi->agi_length, len);
		xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);

		/*
		 * Change agf length.
		 */

		error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);

		agf = XFS_BUF_TO_AGF(bp);
		be32_add_cpu(&agf->agf_length, len);
		xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);


		/*
		 * 	Free the new space.
		 */

		return  xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
					be32_to_cpu(agf->agf_length) - len),
				len, &XFS_RMAP_OINFO_SKIP_UPDATE,
				XFS_AG_RESV_NONE);
		---
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Update changed superblock fields transactionally. These are not
	 * seen by the rest of the world until the transaction commit applies
	 * them atomically to the superblock.
	 */

	if (nagcount > oagcount)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
	if (nb > mp->m_sb.sb_dblocks)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
				 nb - mp->m_sb.sb_dblocks);
	if (id.nfree)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
	xfs_trans_set_sync(tp);
	error = xfs_trans_commit(tp);

	// The whole growfs operation is committed as a single synchronous
	// transaction (xfs_trans_set_sync).

---
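To make the AG arithmetic above concrete, here is a toy userspace model
using plain / and % instead of do_div. All sizes are hypothetical, and the
old last AG is assumed to be full-size (its extension is shown separately
above):
---
#include <stdio.h>

#define XFS_MIN_AG_BLOCKS	64

int main(void)
{
	unsigned long long sb_dblocks = 4000;	/* current size, fsblocks */
	unsigned long long sb_agblocks = 1000;	/* blocks per full AG */
	unsigned long long nb = 6300;		/* requested new size */
	unsigned long long nb_mod, nagcount, oagcount, agno, agsize;

	nb_mod = nb % sb_agblocks;
	nagcount = nb / sb_agblocks + (nb_mod != 0);	/* round up */
	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
		nagcount--;			/* drop the runt last AG */
		nb = nagcount * sb_agblocks;
	}
	oagcount = sb_dblocks / sb_agblocks;
	printf("nagcount=%llu nb=%llu\n", nagcount, nb);

	/* new AG headers are written from the highest agno downwards */
	for (agno = nagcount - 1; agno >= oagcount; agno--) {
		agsize = (agno == nagcount - 1)
			? nb - agno * sb_agblocks	/* possibly short */
			: sb_agblocks;
		printf("new AG %llu: %llu blocks\n", agno, agsize);
	}
	return 0;
}
---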

Does the new size of the fs take effect before the growfs transaction is
fully committed to disk?
Two places affect the AG selection:
  • xfs_alloc_vextent loops over all AGs to find one with free space
    ---
    			if (++(args->agno) == mp->m_sb.sb_agcount) {
    				if (args->tp->t_firstblock != NULLFSBLOCK)
    					args->agno = sagno;
    				else
    					args->agno = 0;
    			}
    ---
    xfs_log_commit_cil
      -> xlog_cil_insert_items          // items are inserted into the CIL first
      -> xfs_trans_unreserve_and_mod_sb // only then are the in-core sb counters updated
      ---
      	/* calculate deltas */
    	if (tp->t_blk_res > 0)
    		blkdelta = tp->t_blk_res;
    	if ((tp->t_fdblocks_delta != 0) &&
    	    (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
    	     (tp->t_flags & XFS_TRANS_SB_DIRTY)))
    	        blkdelta += tp->t_fdblocks_delta;
    
    	...
    	/* apply the per-cpu counters */
    	if (blkdelta) {
    		error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
    		if (error)
    			goto out;
    	}
    	...
    	/* apply remaining deltas */
    	spin_lock(&mp->m_sb_lock);
    	...
    	if (tp->t_dblocks_delta != 0) {
    		error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
    		if (error)
    			goto out_undo_frextents;
    	}
    	if (tp->t_agcount_delta != 0) {
    		error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
    		if (error)
    			goto out_undo_dblocks;
    	}
    	...
      ---
    
    After xfs_trans_unreserve_and_mod_sb, the new size of the fs is visible to
    data block allocation, so xfs_alloc_vextent may hand out blocks from the
    newly added space.

    Because xfs_trans_unreserve_and_mod_sb runs after xlog_cil_insert_items,
    any later transaction that allocates data blocks is guaranteed to sit in
    the same log context as the growfs transaction or after it. So if the
    system crashes at this point, log recovery never replays a data block
    allocation that lies beyond the (old) fs size.
    
    
  • xfs_ialloc_next_ag picks the AG for a new directory inode
    ---
    	spin_lock(&mp->m_agirotor_lock);
    	agno = mp->m_agirotor;
    	if (++mp->m_agirotor >= mp->m_maxagi)
    		mp->m_agirotor = 0;
    	spin_unlock(&mp->m_agirotor_lock);
    ---
    
    m_maxagi is raised only after the growfs transaction has committed, back in
    xfs_growfs_data_private:
    ---
    	...
    	error = xfs_trans_commit(tp);
    	...
    	if (nagimax)
    		mp->m_maxagi = nagimax;
    ---
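
A toy model of the rotor makes the consequence visible: until m_maxagi is
raised after the commit, inode allocation simply never looks at the new AGs
(all names below are hypothetical):
---
#include <stdio.h>

static unsigned int agirotor;		/* models mp->m_agirotor */
static unsigned int maxagi = 4;		/* models mp->m_maxagi */

static unsigned int next_ag(void)
{
	unsigned int agno = agirotor;

	if (++agirotor >= maxagi)	/* wraps at m_maxagi, not sb_agcount */
		agirotor = 0;
	return agno;
}

int main(void)
{
	int i;

	for (i = 0; i < 6; i++)		/* prints 0 1 2 3 0 1 */
		printf("%u ", next_ag());
	printf("\n");

	maxagi = 6;			/* growfs committed: raise m_maxagi */
	for (i = 0; i < 6; i++)		/* prints 2 3 4 5 0 1 */
		printf("%u ", next_ag());
	printf("\n");
	return 0;
}
---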