block map
per-ag metadata buffer
log
AG free space management
AG free inode management
B Tree
reflink
Talking
enum {
	NODE_SIZE	= 256,
	KEYS_PER_NODE	= NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
	RECS_PER_LEAF	= (NODE_SIZE - (2 * sizeof(struct xfs_iext_leaf *))) /
				sizeof(struct xfs_iext_rec),
};
On a 64-bit system, KEYS_PER_NODE is 16 and RECS_PER_LEAF is 15; the diagram
below is simplified to two keys per node and one record per leaf.

                  0    2048
               +--+--+--+--+
               |K0|K1|P0|P1|
               +--+--+--+--+
                /          \
     0    1024              2048  3072
  +--+--+--+--+            +--+--+--+--+
  |K0|K1|P0|P1|            |K0|K1|P0|P1|
  +--+--+--+--+            +--+--+--+--+
    /         \
   0           1024
+--+-+-+     +--+-+-+
|R0|p|n| <-> |R0|p|n|
+--+-+-+     +--+-+-+

         / startoff   (offset in the file)
record  <  startblock (filesystem block number on disk)
         | length
         \ flags

The KEY of an upper-level entry is the smallest startoff of the lower-level
node it points to, i.e. the key is a file offset.
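A minimal standalone sketch (not the kernel code) that just evaluates the constants
above with 64-bit sizes; iext_rec is a stand-in for struct xfs_iext_rec and plain
pointers stand in for the leaf prev/next pointers.
---
#include <stdint.h>
#include <stdio.h>

/* stand-in for struct xfs_iext_rec: only its size matters here */
struct iext_rec { uint64_t lo, hi; };

enum {
	NODE_SIZE	= 256,
	KEYS_PER_NODE	= NODE_SIZE / (sizeof(uint64_t) + sizeof(void *)),
	RECS_PER_LEAF	= (NODE_SIZE - 2 * sizeof(void *)) / sizeof(struct iext_rec),
};

int main(void)
{
	/* on a 64-bit build this prints 16 and 15 */
	printf("KEYS_PER_NODE=%d RECS_PER_LEAF=%d\n", KEYS_PER_NODE, RECS_PER_LEAF);
	return 0;
}
---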
How to allocate blocks for regular file ?
Basic principles
For large files, we try to allocate extents initially near the inode
and afterwards near the existing block in the file which is closest to
the offset in the file for which we are allocating space.
That implies blocks will be allocated near the last block in the file
for sequential writers and near blocks in the middle of the file for
processes writing into holes.
Files and directories are not limited to allocating space within a single
allocation group.
xfs_bmap_btalloc
(tp->t_firstblock is NULLFSBLOCK, ignore filestream)
//start block of new extent
ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
xfs_bmap_adjacent
1. If allocating at eof, and there's a previous real block,
try to use its last block as our starting point.
---
if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
!isnullstartblock(ap->prev.br_startblock) &&
ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
ap->prev.br_startblock)) {
ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
/*
* Adjust for the gap between prevp and us.
*/
adjust = ap->offset -
(ap->prev.br_startoff + ap->prev.br_blockcount);
if (adjust &&
ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
ap->blkno += adjust;
}
---
2. If not at eof, then compare the two neighbor blocks.
Figure out whether either one gives us a good starting point,
and pick the better one.
xfs_bmap_btalloc_nullfb
-> args->type = XFS_ALLOCTYPE_START_BNO
if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) {
if (!ap->offset) {
args.alignment = stripe_align;
atype = args.type;
isaligned = 1;
if (blen > args.alignment && blen <= args.maxlen)
args.minlen = blen - args.alignment;
args.minalignslop = 0;
} else {
/*
* First try an exact bno allocation.
* If it fails then do a near or start bno
* allocation with alignment turned on.
*/
atype = args.type;
tryagain = 1;
args.type = XFS_ALLOCTYPE_THIS_BNO;
args.alignment = 1;
...
}
} else {
args.alignment = 1;
args.minalignslop = 0;
}
The allocation fallback sequence in xfs_bmap_btalloc looks like this, for example:
XFS_ALLOCTYPE_THIS_BNO
|
v
XFS_ALLOCTYPE_NEAR_BNO/START_BNO
alignment = 1
|
v
alignment = 0
|
v
XFS_ALLOCTYPE_FIRST_AG
loop all of the ags
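A minimal standalone sketch of that fallback ladder, not the kernel code:
alloc_this_bno/alloc_near_bno/alloc_first_ag are hypothetical stand-ins for
xfs_alloc_vextent called with the corresponding args.type, and alignment 0
means no alignment constraint, as in the diagram above.
---
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NULLFSBLOCK	((uint64_t)-1)

/* hypothetical stand-ins for xfs_alloc_vextent() with the given args.type */
static uint64_t alloc_this_bno(uint64_t target, int align) { (void)target; (void)align; return NULLFSBLOCK; }
static uint64_t alloc_near_bno(uint64_t target, int align) { (void)target; (void)align; return NULLFSBLOCK; }
static uint64_t alloc_first_ag(void)                       { return 12345; }

/* try each step in turn until one returns a valid block */
static uint64_t btalloc_fallback(uint64_t target, int stripe_align, bool at_eof)
{
	uint64_t fsbno = NULLFSBLOCK;

	if (at_eof && target)			/* exact block first (THIS_BNO) */
		fsbno = alloc_this_bno(target, 1);
	if (fsbno == NULLFSBLOCK)		/* NEAR/START_BNO, aligned */
		fsbno = alloc_near_bno(target, stripe_align);
	if (fsbno == NULLFSBLOCK)		/* retry with alignment 0 */
		fsbno = alloc_near_bno(target, 0);
	if (fsbno == NULLFSBLOCK)		/* last resort: loop over all AGs */
		fsbno = alloc_first_ag();
	return fsbno;
}

int main(void)
{
	printf("allocated fsbno %llu\n",
	       (unsigned long long)btalloc_fallback(1000, 8, true));
	return 0;
}
---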
xfs_alloc_fix_freelist does the following things:
1. check whether there is enough contiguous free space for the allocation.
2. check whether there is enough free space for the remaining allocations;
   updating the bnobt and cntbt also needs free blocks.
3. fix up the agfl.
Before xfs_alloc_ag_vextent is invoked, xfs_alloc_fix_freelist is called to
perform checks 1 and 2 and ensure there is enough free space.
However, even when there is enough free space, the allocation can still fail
for XFS_ALLOCTYPE_NEAR_BNO and XFS_ALLOCTYPE_THIS_BNO. In that case,
args->agbno will be NULLAGBLOCK.
Look at the code of xfs_alloc_vextent
---
switch (type) {
case XFS_ALLOCTYPE_THIS_AG:
case XFS_ALLOCTYPE_NEAR_BNO:
case XFS_ALLOCTYPE_THIS_BNO:
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
args->pag = xfs_perag_get(mp, args->agno);
error = xfs_alloc_fix_freelist(args, 0);
...
if (!args->agbp)
break;
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
if ((error = xfs_alloc_ag_vextent(args)))
goto error0;
break;
case XFS_ALLOCTYPE_START_BNO:
/*
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
case XFS_ALLOCTYPE_FIRST_AG:
if (type == XFS_ALLOCTYPE_FIRST_AG) {
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_THIS_AG;
sagno = 0;
flags = 0;
} else {
args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
flags = XFS_ALLOC_FLAG_TRYLOCK;
}
for (;;) {
args->pag = xfs_perag_get(mp, args->agno);
error = xfs_alloc_fix_freelist(args, flags);
if (args->agbp) {
if ((error = xfs_alloc_ag_vextent(args)))
goto error0;
break;
}
if (args->agno == sagno &&
type == XFS_ALLOCTYPE_START_BNO)
args->type = XFS_ALLOCTYPE_THIS_AG;
/*
* Reached the starting a.g., must either be done
* or switch to non-trylock mode.
*/
if (args->agno == sagno) {
if (flags == 0) {
args->agbno = NULLAGBLOCK;
trace_xfs_alloc_vextent_allfailed(args);
break;
}
flags = 0;
if (type == XFS_ALLOCTYPE_START_BNO) {
args->agbno = XFS_FSB_TO_AGBNO(mp,
args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
}
}
xfs_perag_put(args->pag);
}
break;
default:
ASSERT(0);
/* NOTREACHED */
}
// Finally, whether the allocation succeeded or not:
if (args->agbno == NULLAGBLOCK)
args->fsbno = NULLFSBLOCK;
else {
args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
}
---
Preallocated blocks are normally reclaimed on file close, inode reclaim, unmount,
or in the background once file write activity subsides. First, how is the
preallocation created?
xfs_bmap_btalloc
---
if (ap->flags & XFS_BMAPI_COWFORK)
align = xfs_get_cowextsz_hint(ap->ip);
else if (xfs_alloc_is_userdata(ap->datatype))
align = xfs_get_extsz_hint(ap->ip);
if (align) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 0, ap->eof, 0, ap->conv,
&ap->offset, &ap->length);
// ap->offset and ap->length will be adjusted here.
// ap->got and ap->prev are the next and previous extents, respectively.
// xfs_bmap_extsize_align ensures the aligned new extent does not overlap
// the original next and previous extents.
}
---
http://xfs.org/index.php/XFS_FAQ#Q:_What_is_speculative_preallocation.3F
The XFS speculative preallocation algorithm allocates extra blocks beyond end of file (EOF)
to minimize file fragmentation during buffered write workloads.
Workloads that benefit from this behaviour include slowly growing files, concurrent writers
and mixed reader/writer workloads. It also provides fragmentation resistance in situations where
memory pressure prevents adequate buffering of dirty data to allow formation of large contiguous
regions of data in memory.
T_a T_b T_c T_d T_e T_f T_a
| | | | | | |
v v v v v v v
|---|---|---|---|---|---|---|...
| |
'----------- -----------'
V
fragmentation for T_a
With EOF preallocation
T_a preallocates
^
.------- -------.
| |
|---|---|---|---|---|---|---|...
^
|
T_a writes
speculative preallocation has two styles,
1. fixed size with 'allocsize=' mount option
2. dynamic based on the current inode size
xfs_file_iomap_begin_delay
---
if (eof) {
prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
&icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
xfs_fileoff_t p_end_fsb;
end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
prealloc_blocks;
align = xfs_eof_alignment(ip, 0);
if (align)
p_end_fsb = roundup_64(p_end_fsb, align);
p_end_fsb = min(p_end_fsb, maxbytes_fsb);
ASSERT(p_end_fsb > offset_fsb);
prealloc_blocks = p_end_fsb - end_fsb;
}
}
retry:
error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
eof);
---
original file preallocated
____^____ ______^_______
/ \ / \
|_________|_ _|_ _ _ _ _ _ _ |
\_ _/
v
want write
When the page cache is flushed to disk, real blocks are allocated.
original file
____^____
/ \
|_________|__________________|
\________ _________/
v
more contiguous blocks
// A bigger delalloc extent is added; when the pages are written to disk and
// real blocks are allocated for them, the allocation covers a larger
// contiguous range, hence less fragmentation with multiple concurrent writers.
The preallocation is reclaimed in the background once file write activity subsides:
xfs_icache_free_eofblocks
-> __xfs_icache_free_eofblocks
-> xfs_inode_ag_iterator_tag
-> xfs_inode_ag_walk //iterate radix tree pag->pag_ici_root
-> xfs_inode_free_eofblocks
-> xfs_free_eofblocks // under XFS_IOLOCK_EXCL
-> xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
XFS_ISIZE(ip), XFS_BMAPI_NODISCARD)
Free any blocks beyond EOF.
When is xfs_icache_free_eofblocks triggered?
Blocks beyond EOF are also truncated when the file is closed.
xfs_file_buffered_aio_write
---
write_retry:
iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
...
ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
/*
* If we hit a space limit, try to free up some lingering preallocated
* space before returning an error. In the case of ENOSPC, first try to
* write back all dirty inodes to free up some of the excess reserved
* metadata space.
*/
if (ret == -EDQUOT && !enospc) {
xfs_iunlock(ip, iolock);
enospc = xfs_inode_free_quota_eofblocks(ip);
if (enospc)
goto write_retry;
enospc = xfs_inode_free_quota_cowblocks(ip);
if (enospc)
goto write_retry;
iolock = 0;
} else if (ret == -ENOSPC && !enospc) {
struct xfs_eofblocks eofb = {0};
enospc = 1;
xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
xfs_icache_free_eofblocks(ip->i_mount, &eofb);
xfs_icache_free_cowblocks(ip->i_mount, &eofb);
goto write_retry;
}
---
xfs_eofblocks_worker
---
xfs_icache_free_eofblocks(mp, NULL);
xfs_queue_eofblocks(mp);
---
To avoid applications that repeatedly write and close a file causing too much
fragmentation, there is a heuristic to detect that pattern.
xfs_release
---
if (xfs_can_free_eofblocks(ip, false)) {
/*
* Check if the inode is being opened, written and closed
* frequently and we have delayed allocation blocks outstanding
* (e.g. streaming writes from the NFS server), truncating the
* blocks past EOF will cause fragmentation to occur.
*
* In this case don't do the truncation, but we have to be
* careful how we detect this case. Blocks beyond EOF show up as
* i_delayed_blks even when the inode is clean, so we need to
* truncate them away first before checking for a dirty release.
* Hence on the first dirty close we will still remove the
* speculative allocation, but after that we will leave it in
* place.
*/
if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
return 0;
/*
* If we can't get the iolock just skip truncating the blocks
* past EOF because we could deadlock with the mmap_sem
* otherwise. We'll get another chance to drop them once the
* last reference to the inode is dropped, so we'll never leak
* blocks permanently.
*/
if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
error = xfs_free_eofblocks(ip);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
if (error)
return error;
}
/* delalloc blocks after truncation means it really is dirty */
if (ip->i_delayed_blks)
xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
}
---
Reference for dynamic speculative preallocation:
link https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=055388a3188f56676c21e92962fc366ac8b5cb72
xfs: dynamic speculative EOF preallocation
Currently the size of the speculative preallocation during delayed
allocation is fixed by either the allocsize mount option or a
default size. We are seeing a lot of cases where we need to
recommend using the allocsize mount option to prevent fragmentation
when buffered writes land in the same AG.
Rather than using a fixed preallocation size by default (up to 64k),
make it dynamic by basing it on the current inode size. That way the
EOF preallocation will increase as the file size increases. Hence
for streaming writes we are much more likely to get large
preallocations exactly when we need it to reduce fragmentation.
For default settings, the size of the initial extents is determined
by the number of parallel writers and the amount of memory in the
machine. For 4GB RAM and 4 concurrent 32GB file writes:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL
0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576
1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576
2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152
3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304
4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608
5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208
6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088
7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088
and for 16 concurrent 16GB file writes:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL
0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144
1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144
2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288
3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576
4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152
5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304
6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608
7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208
Because it is hard to take back speculative preallocation, cases
where there are large slow growing log files on a nearly full
filesystem may cause premature ENOSPC. Hence as the filesystem nears
full, the maximum dynamic prealloc size is reduced according to this
table (based on 4k block size):
freespace max prealloc size
>5% full extent (8GB)
4-5% 2GB (8GB >> 2)
3-4% 1GB (8GB >> 3)
2-3% 512MB (8GB >> 4)
1-2% 256MB (8GB >> 5)
<1% 128MB (8GB >> 6)
This should reduce the amount of space held in speculative
preallocation for such cases.
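A minimal standalone sketch of the throttle table above, assuming the default
8GB maximum at a 4k block size; it only models the idea of shifting the
prealloc size down as free space shrinks, it is not xfs_iomap_prealloc_size itself.
---
#include <stdint.h>
#include <stdio.h>

/* default maximum dynamic prealloc: 8GB expressed in 4k blocks */
#define MAX_PREALLOC_BLOCKS	(8ULL << 30 >> 12)

/* shift the prealloc size down as the filesystem fills up */
static uint64_t throttle_prealloc(uint64_t prealloc_blocks, unsigned freesp_pct)
{
	unsigned shift = 0;

	if (freesp_pct < 5)  shift = 2;		/* 4-5% free  -> 2GB cap   */
	if (freesp_pct < 4)  shift = 3;		/* 3-4% free  -> 1GB cap   */
	if (freesp_pct < 3)  shift = 4;		/* 2-3% free  -> 512MB cap */
	if (freesp_pct < 2)  shift = 5;		/* 1-2% free  -> 256MB cap */
	if (freesp_pct < 1)  shift = 6;		/* <1% free   -> 128MB cap */

	return prealloc_blocks >> shift;
}

int main(void)
{
	for (unsigned pct = 6; pct-- > 0; )
		printf("%u%% free -> %llu blocks\n", pct,
		       (unsigned long long)throttle_prealloc(MAX_PREALLOC_BLOCKS, pct));
	return 0;
}
---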
The allocsize mount option turns off the dynamic behaviour and fixes
the prealloc size to whatever the mount option specifies.
The xfs_perag structures are maintained in a radix tree at xfs_mount->m_perag_tree.
xfs_buf_find
pag->pag_buf_hash, protected by pag->pag_buf_lock, maintains the xfs_bufs.
xfs_buf_t is XFS's own buffer caching mechanism.
xfs_buf_t -> xfs_buf_map [block number, size]
          -> b_pages[]  associated pages (_xfs_buf_alloc)
          -> b_addr     virtual address in the kernel (_xfs_buf_map_pages)
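A simplified, compilable model of the relationships listed above; plain types
stand in for the kernel's, the field names follow struct xfs_buf/xfs_buf_map
but the layout is not the real one.
---
#include <stdint.h>

struct buf_map {                        /* cf. struct xfs_buf_map */
	uint64_t	 bm_bn;         /* start block number on the device */
	int		 bm_len;        /* length in basic (512-byte) blocks */
};

struct buf {                            /* cf. struct xfs_buf */
	struct buf_map	*b_maps;        /* one or more disk ranges backing the buffer */
	int		 b_map_count;
	void		**b_pages;      /* pages allocated in _xfs_buf_alloc() */
	void		*b_addr;        /* virtual address set up by _xfs_buf_map_pages() */
	int		 b_hold;        /* reference count (an atomic_t in the kernel) */
	int		 b_sema;        /* the buffer lock (a semaphore in the kernel) */
	unsigned int	 b_flags;       /* XBF_* state flags */
};
---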
xfs_buf_read_map
-> xfs_buf_get_maps
-> if !bp->b_flags & XBF_DONE
_xfs_buf_read
-> xfs_buf_submit_wait
-> _xfs_buf_ioapply
---
/* we only use the buffer cache for meta-data */
op_flags |= REQ_META;
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
* into the buffer and the desired IO size before we start -
* _xfs_buf_ioapply_vec() will modify them appropriately for each
* subsequent call.
*/
offset = bp->b_offset;
size = BBTOB(bp->b_io_length);
blk_start_plug(&plug);
for (i = 0; i < bp->b_map_count; i++) {
xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
if (bp->b_error)
break;
if (size <= 0)
break; /* all done */
}
blk_finish_plug(&plug);
---
How to convert a xfs_buf_t to bio ?
xfs_buf_ioapply_map
---
sector_t sector = bp->b_maps[map].bm_bn;
...
size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
...
next_chunk:
atomic_inc(&bp->b_io_remaining);
nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOIO, nr_pages);
bio_set_dev(bio, bp->b_target->bt_bdev);
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
bio_set_op_attrs(bio, op, op_flags);
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
if (nbytes > size)
nbytes = size;
rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
offset);
if (rbytes < nbytes)
break;
offset = 0;
sector += BTOBB(nbytes);
size -= nbytes;
total_nr_pages--;
}
if (likely(bio->bi_iter.bi_size)) {
if (xfs_buf_is_vmapped(bp)) {
flush_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
}
submit_bio(bio);
if (size)
goto next_chunk;
}
---
Lock and unlock scenarios.
LOCK:
xfs_buf_delwri_submit_buffers
---
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
if (!wait_list) {
if (xfs_buf_ispinned(bp)) {
pinned++;
continue;
}
if (!xfs_buf_trylock(bp))
continue;
} else {
xfs_buf_lock(bp);
}
...
bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
bp->b_flags |= XBF_WRITE;
if (wait_list) {
bp->b_flags &= ~XBF_ASYNC;
list_move_tail(&bp->b_list, wait_list);
} else {
bp->b_flags |= XBF_ASYNC;
list_del_init(&bp->b_list);
}
__xfs_buf_submit(bp, false);
}
blk_finish_plug(&plug);
---
UNLOCK:
For the XBF_ASYNC case, it will be unlocked by
xfs_buf_ioend
---
if (bp->b_iodone)
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
else
complete(&bp->b_iowait);
---
For the !XBF_ASYNC case,
xfs_buf_delwri_submit
---
xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
/* Wait for IO to complete. */
while (!list_empty(&wait_list)) {
bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
...
error2 = xfs_buf_iowait(bp);
xfs_buf_relse(bp);
}
---
LOCK:
xfs_buf_find
---
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
return -EAGAIN;
}
xfs_buf_lock(bp);
}
---
Or
xfs_buf_get_maps
-> _xfs_buf_alloc
-> sema_init(&bp->b_sema, 0); /* held, no waiters */
-> xfs_buf_find
---
spin_lock(&pag->pag_buf_lock);
bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
xfs_buf_hash_params);
if (bp) {
atomic_inc(&bp->b_hold);
goto found;
}
...
rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
*found_bp = new_bp;
return 0;
---
A newly allocated xfs_buf is returned locked and holding one reference.
xfs_buf_find
  <- xfs_buf_incore
  <- xfs_buf_get_map
       <- xfs_buf_read_map
            <- xfs_buf_readahead_map           [1]
            <- xfs_buf_read
                 <- xlog_recover_buffer_pass2  [2]
                 <- xlog_recover_inode_pass2   [3]
                 <- xfs_readlink_bmap_ilocked  [4]
            <- xfs_trans_read_buf_map          [5]
       <- xfs_buf_get
            <- xfs_update_secondary_sbs        [6]
            <- xfs_attr_rmtval_set itself      [7]
       <- xfs_trans_get_buf_map                [8]
UNLOCK:
[1] This is just for readahead.
xfs_buf_readahead_map
---
if (bdi_read_congested(target->bt_bdev->bd_bdi))
return;
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
---
Due to the XBF_ASYNC, it will be unlocked by xfs_buf_ioend
[2] unlocked by xlog_recover_buffer_pass2 itself.
[3] unlocked by xlog_recover_inode_pass2 itself.
For both cases [2] and [3], the bps will be submitted by xfs_buf_delwri_submit.
Look at xlog_do_recovery_pass and xlog_recover_process_ophdr.
[4] unlocked by xfs_readlink_bmap_ilocked itself.
[5] This is for update transactions.
The bp will be added to the transaction.
xfs_trans_read_buf_map
---
bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
...
if (tp) {
_xfs_trans_bjoin(tp, bp, 1);
}
---
So finally, the bp will be unlocked by xfs_buf_item_unlock
xfs_buf_item_unlock
---
bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
...
if (!hold)
xfs_buf_relse(bp);
---
XFS_BLI_HOLD is not the common case.
One usage of XFS_BLI_HOLD:
xfs_dir_ialloc
---
/*
* Normally, xfs_trans_commit releases all the locks.
* We call bhold to hang on to the ialloc_context across
* the commit. Holding this buffer prevents any other
* processes from doing any allocations in this
* allocation group.
*/
xfs_trans_bhold(tp, ialloc_context);
---
The xfs_buf_item_unlock is invoked in:
xfs_log_commit_cil
-> xlog_cil_insert_items
-> xfs_trans_free_items
-> lip->li_ops->iop_unlock
xfs_buf_item_unlock
In addition, the bp can be unlocked by xfs_trans_brelse if it is not dirty.
xfs_trans_brelse
---
/*
* If the buffer is dirty within this transaction, we can't
* release it until we commit.
*/
if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags))
return;
...
xfs_buf_relse(bp);
---
[6] unlocked by xfs_update_secondary_sbs itself.
[7] unlocked by xfs_attr_rmtval_set itself
[8] same with case [5]
xfs_buf->b_hold decides the lifetime of an xfs_buf. Who holds and frees it?
_xfs_buf_alloc sets b_hold to 1 for the caller,
_xfs_buf_alloc
-> atomic_set(&bp->b_hold, 1)
and when xfs_buf_find inserts the newly allocated xfs_buf into pag->pag_buf_hash,
no additional reference is taken for the cache, even though the buffer is
returned locked.
xfs_buf_get_maps
-> _xfs_buf_alloc
-> sema_init(&bp->b_sema, 0); /* held, no waiters */
-> xfs_buf_find
---
spin_lock(&pag->pag_buf_lock);
bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
xfs_buf_hash_params);
if (bp) {
atomic_inc(&bp->b_hold);
goto found;
}
...
rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
*found_bp = new_bp;
return 0;
---
One reference is taken when an existing buffer is found by xfs_buf_find, and
the buffer is also locked.
xfs_buf_find
---
spin_lock(&pag->pag_buf_lock);
bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
xfs_buf_hash_params);
if (bp) {
atomic_inc(&bp->b_hold);
goto found;
}
...
found:
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp); //If nowait, put the reference
return -EAGAIN;
}
xfs_buf_lock(bp);
}
...
---
Both the buf lock and the reference are released by xfs_buf_relse.
The typical caller of xfs_buf_relse for this case is xfs_buf_item_unlock.
One reference is taken when the xfs_buf is added to the delwri queue:
xfs_buf_delwri_queue
---
bp->b_flags |= _XBF_DELWRI_Q;
if (list_empty(&bp->b_list)) {
atomic_inc(&bp->b_hold);
list_add_tail(&bp->b_list, list);
}
---
For the async case, it will be released by the xfs_buf_ioend.
For the sync case, the caller will do it, such as,
xfs_buf_delwri_submit
---
xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
/* Wait for IO to complete. */
while (!list_empty(&wait_list)) {
bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
/*
* Wait on the locked buffer, check for errors and unlock and
* release the delwri queue reference.
*/
error2 = xfs_buf_iowait(bp);
xfs_buf_relse(bp);
// The xfs_buf_delwri_submit_buffers will lock the buf.
// We will release both the lock and reference here.
}
---
A special case here is:
__xfs_buf_submit
---
/*
* Grab a reference so the buffer does not go away underneath us. For
* async buffers, I/O completion drops the callers reference, which
* could occur before submission returns.
*/
xfs_buf_hold(bp);
...
_xfs_buf_ioapply(bp);
...
/*
* Release the hold that keeps the buffer referenced for the entire
* I/O. Note that if the buffer is async, it is not safe to reference
* after this release.
*/
xfs_buf_rele(bp);
---
The buf's log item will hold one reference.
xfs_buf_item_init
---
bp->b_log_item = bip;
xfs_buf_hold(bp);
---
This is the iodone() function for buffers which have been
logged. It is called when they are eventually flushed out.
xfs_buf_iodone
---
xfs_buf_rele(bp);
spin_lock(&ailp->ail_lock);
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
---
When bp->b_hold drops to its last reference, the buffer is first added to the
buftarg LRU list (which takes a new reference).
xfs_buf_rele
---
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
...
/* the last reference has been dropped ... */
if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
spin_unlock(&pag->pag_buf_lock);
---
Then the shrinker callback will release the reference again.
xfs_buftarg_shrink_scan
---
freed = list_lru_shrink_walk(&btp->bt_lru, sc,
xfs_buftarg_isolate, &dispose);
while (!list_empty(&dispose)) {
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
xfs_buf_rele(bp);
}
---
When an xfs_buf is modified within a transaction (joined to it), its buf log
item holds a reference.
_xfs_trans_bjoin
-> xfs_buf_item_init
-> xfs_buf_hold(bp)
-> xfs_trans_add_item
It will be released when the log item is freed.
xfs_buf_item_put
-> xfs_buf_item_relse
-> xfs_buf_rele(bp)
-> xfs_buf_item_free
This introduces another topic: the xfs_buf log item's bli_refcount.
A xfs_buf_log_item's bli_refcount reference is taken in _xfs_trans_bjoin; the
paths that can release a buf log item are shown below.
_xfs_trans_bjoin
---
atomic_inc(&bip->bli_refcount);
xfs_trans_add_item(tp, &bip->bli_item); // add the lp on tp->t_items
---
If the buf is dirty
xfs_log_commit_cil
-> xfs_trans_free_items
-> xfs_trans_del_item // clear XFS_LI_DIRTY and dequeue from tp->t_items
-> .iop_unlock
xfs_buf_item_unlock
-> xfs_buf_item_put
-> xfs_buf_relse
else
xfs_trans_brelse // if no XFS_LI_DIRTY set
-> xfs_buf_item_put
-> xfs_buf_relse
---
if (!atomic_dec_and_test(&bip->bli_refcount))
return false;
---
The buf log item is only released in the two cases above, under the buf's
lock.
xfs_log_commit_cil
-> xlog_cil_insert_items
-> xlog_cil_insert_format_items
-> lip->li_ops->iop_format(lip, lv)
-> xfs_cil_prepare_item
-> lv->lv_item->li_ops->iop_pin(lv->lv_item);
xfs_buf_item_pin
---
atomic_inc(&bip->bli_refcount);
atomic_inc(&bip->bli_buf->b_pin_count);
---
This reference will be released when unpin.
xlog_cil_committed
-> xfs_trans_committed_bulk
-> .iop_unpin
xfs_buf_item_unpin
---
freed = atomic_dec_and_test(&bip->bli_refcount);
---
In the normal case, the buf log item is not released here, even though
its refcount has reached zero.
[ 1803.455740] xfs_buf_item_free+0xe/0x40 [xfs]
[ 1803.455829] xfs_buf_do_callbacks+0x1c/0x70 [xfs]
[ 1803.455919] xfs_buf_iodone_callbacks+0xef/0x470 [xfs]
[ 1803.455933] process_one_work+0x1e9/0x610
[ 1803.455951] worker_thread+0x4c/0x430
[ 1803.455974] kthread+0x101/0x140
[ 1803.455994] ret_from_fork+0x24/0x30
[ 1805.943251] xfs_buf_item_free+0xe/0x40 [xfs]
[ 1805.943349] xfs_buf_item_put+0x4d/0x90 [xfs]
[ 1805.943449] xfs_trans_brelse+0x112/0x180 [xfs]
[ 1805.943523] xfs_btree_del_cursor+0x2b/0x70 [xfs]
[ 1805.943602] xfs_dialloc_ag+0x230/0x270 [xfs]
[ 1805.943694] xfs_dialloc+0x47/0x230 [xfs]
[ 1805.943794] xfs_ialloc+0x5d/0x560 [xfs]
[ 1805.943897] xfs_dir_ialloc+0x4f/0x1c0 [xfs]
[ 1805.943996] xfs_create+0x48e/0x740 [xfs]
[ 1805.944096] xfs_generic_create+0x20f/0x2d0 [xfs]
[ 1805.944128] lookup_open+0x677/0x750
[ 1805.944167] path_openat+0x2ff/0xbd0
[ 1805.944193] do_filp_open+0x8a/0xf0
[ 1805.944286] do_sys_open+0x11c/0x1f0
[ 1805.944307] do_syscall_64+0x54/0x190
[ 1805.944315] entry_SYSCALL_64_after_hwframe+0x49/0xbe
Quote from comment of xfs_trans_binval.
Invalidate a buffer that is being used within a transaction.
Typically this is because the blocks in the buffer are being freed, so we
need to prevent it from being written out when we're done. Allowing it
to be written again might overwrite data in the free blocks if they are
reallocated to a file.
Freeing an unused block of a btree:
xfs_btree_free_block
-> cur->bc_ops->free_block(cur, bp)
// At this point, the buf is locked by us, but it could still be pinned by
// others, and the buf may be under IO to the log or to disk.
-> xfs_trans_binval(cur->bc_tp, bp)
The process of freeing a block:
xfs_bmbt_ops.free_block
xfs_bmbt_free_block
-> xfs_bmap_add_free
-> __xfs_bmap_add_free
-> xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list)
xfs_extent_free_defer_type.finish_item
xfs_extent_free_finish_item
-> xfs_trans_free_extent
-> __xfs_free_extent
-> xfs_free_extent_fix_freelist
-> xfs_free_ag_extent
-> xfs_extent_busy_insert
xfs_trans_binval prevents a metadata buffer whose block has been released
back to the free space from being written to disk.
- bp->b_flags |= XBF_STALE;
- bip->bli_flags |= XFS_BLI_STALE;
xfs_buf_item_unlock
---
bool stale = bip->bli_flags & XFS_BLI_STALE;
...
released = xfs_buf_item_put(bip);
// the buffer will be locked until the final unpin.
if (hold || (stale && !released))
return;
---
What if others wait for this buffer on xfs_buf_find ?
We also set the XFS_BLF_CANCEL flag in the buf log format structure and log
the buf item. This will be used at recovery time to determine that copies
of the buffer in the log before this should not be replayed.
xfs_buf_item_format
-> xfs_buf_item_format_segment
---
if (bip->bli_flags & XFS_BLI_STALE) {
/*
* The buffer is stale, so all we need to log
* is the buf log format structure with the
* cancel flag in it.
*/
return;
}
---
This buffer will be logged and pinned, so others will not try to push it to
disk. And we hold the buffer lock, so nobody else can pin it again.
xfs_buf_item_unpin
---
freed = atomic_dec_and_test(&bip->bli_refcount);
if (atomic_dec_and_test(&bp->b_pin_count))
wake_up_all(&bp->b_waiters);
if (freed && stale) {
...
if (bip->bli_flags & XFS_BLI_STALE_INODE) {
xfs_buf_do_callbacks(bp);
bp->b_log_item = NULL;
list_del_init(&bp->b_li_list);
bp->b_iodone = NULL;
} else {
spin_lock(&ailp->ail_lock);
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
}
// unlock and release the reference.
// we lock the buffer before xfs_trans_binval
// during this, nobody could reuse it again.
xfs_buf_relse(bp);
}
---
A btree update removes a block from the btree.
xfs_btree_free_block
-> cur->bc_ops->free_block(cur, bp)
-> xfs_bmbt_free_block
-> xfs_bmap_add_free
-> xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
-> xfs_trans_binval(cur->bc_tp, bp)
    [1]                            [2]                    [3]                   [4]
EFI log   ->  EFD log          ->  agf, agfl, cntbt,  ->  remove from busy  ->  flush the updates on
              update free          bnobt buffer log       extent list           agf, agfl, cntbt,
              space bt                                                          bnobt to disk
              components
              (bnobt & cntbt)
              insert to busy
              extent list [5]

Trans0 covers the EFI log; Trans1 covers the EFD log and the free space
btree/buffer updates.
The first three steps run in the context of fs operations, "remove from busy
extent list" runs in the log IO completion context, and the final flush runs
in the AIL push context.
[1] xfs_trans_roll
[2] commit log
[3] log IO complete
[4] insert log item into AIL
[5] xfs_extent_free_finish_item
-> xfs_trans_free_extent
-> __xfs_free_extent
-> xfs_free_ag_extent
-> xfs_extent_busy_insert
When xfs_btree_free_block is invoked, the update on the btree has already been
completed; IOW, nobody can reach the block through the btree anymore.
Moreover, when we update the btree, what preserves the serialisation?
xfs_bmapi_write is invoked under xfs_ilock(ip, XFS_ILOCK_EXCL)
If the block used to be a block of a block map btree,
- none of the updates saved in the buffer will be flushed to disk
- the changes recorded in the log will not be replayed during recovery
- no one can reach it through the block map btree anymore
However, someone may still look up the buffer of the freed block:
xfs_buf_find
---
if (bp->b_flags & XBF_STALE) {
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
}
---
AIL
Active Item List, an LSN-sorted doubly linked list.
Items are inserted into this list during log buffer IO completion, after which
they are unpinned and can be written to disk.
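A minimal standalone sketch of the LSN-sorted insert the AIL relies on
(cf. xfs_ail_splice/xfs_trans_ail_update_bulk); ail_item is a hypothetical
stand-in for struct xfs_log_item.
---
#include <stdint.h>
#include <stdio.h>

struct ail_item {
	uint64_t	 li_lsn;
	struct ail_item	*prev, *next;
};

/* head is a circular list anchor; its li_lsn is unused */
static void ail_insert(struct ail_item *head, struct ail_item *item)
{
	struct ail_item *pos = head->prev;	/* walk backwards from the tail */

	while (pos != head && pos->li_lsn > item->li_lsn)
		pos = pos->prev;
	/* link item right after pos, keeping the list sorted by li_lsn */
	item->next = pos->next;
	item->prev = pos;
	pos->next->prev = item;
	pos->next = item;
}

int main(void)
{
	struct ail_item head = { 0, &head, &head };
	struct ail_item a = { .li_lsn = (200ULL << 32) | 64 };
	struct ail_item b = { .li_lsn = (200ULL << 32) | 16 };
	struct ail_item c = { .li_lsn = (201ULL << 32) | 8 };

	ail_insert(&head, &a);
	ail_insert(&head, &b);
	ail_insert(&head, &c);

	for (struct ail_item *p = head.next; p != &head; p = p->next)
		printf("lsn cycle=%llu block=%llu\n",
		       (unsigned long long)(p->li_lsn >> 32),
		       (unsigned long long)(p->li_lsn & 0xffffffff));
	return 0;
}
---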
CIL
Committed Item List: this list tracks log items that have been committed and
have a formatted memory buffer attached to them. It tracks objects in
transaction commit order.
XFS changes on an inode (under xfs_ilock, XFS_ILOCK_EXCL)
  -> format[1] the changes into lvs on lip->li_lv   (down_read cil->xc_ctx_lock)
     pin the lip and insert it on cil->xc_cil       (under spin cil->xc_cil_lock)
     xfs_iunlock
  -> xlog_cil_push                                  (down_write cil->xc_ctx_lock)
     take the lvs down from cil->xc_cil
     write the lvs to an iclog[2], then write the commit record
     submit the iclog to disk
  -> xlog_cil_committed[3]
     insert the lips on ail->ail_head and unpin them
  -> xfsaild_push_item
     li_ops->push writes the modifications into the bp (xfs_iflush_done on completion)
  -> the bp is submitted to disk

[1]: xfs_log_commit_cil
[2]: xlog_write
[3]: commit record's IO completion callback, see xlog_cil_push
lv : xfs_log_vec
lip: xfs_log_item
cil: xfs_cil
bp : xfs_buf
Why do we need the log format ?
XFS logging is a combination of logical and physical logging. Some objects,
such as inodes and dquots, are logged in logical format where the details
logged are made up of the changes to in-core structures rather than on-disk
structures. Other objects - typically buffers - have their physical changes
logged. The reason for these differences is to reduce the amount of log space
required for objects that are frequently logged. Some parts of inodes are more
frequently logged than others, and inodes are typically more frequently logged
than any other object (except maybe the superblock buffer) so keeping the
amount of metadata logged low is of prime importance.
xfs_trans_commit
-> __xfs_trans_commit
-> xfs_log_commit_cil
-> xlog_cil_insert_items // down_read cil->xc_ctx_lock
-> xlog_cil_insert_format_items
-> iterate xfs_trans->t_items
lip->li_ops->iop_format// if xfs_log_item is set XFS_LI_DIRTY
xfs_inode_item_format.
Why is no lock held here? [1]
-> acquire cil->xc_cil_lock
   iterate tp->t_items and move the dirty ones to cil->xc_cil
Note:
list_move_tail(&lip->li_cil, &cil->xc_cil);
[1]
For example:
xfs_fs_commit_blocks
---
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_setattr_time(ip, iattr);
if (update_isize) {
i_size_write(inode, iattr->ia_size);
ip->i_d.di_size = iattr->ia_size;
}
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
---
Where is the XFS_ILOCK_EXCL released?
xfs_trans_commit
-> __xfs_trans_commit
-> xfs_trans_free_items
-> iterate the tp->t_items
detach item
clear_bit(XFS_LI_DIRTY, &lip->li_flags);
list_del_init(&lip->li_trans);
unlock item
lip->li_ops->iop_unlock
xfs_inode_item_unlock
-> xfs_iunlock(ip, iip->ili_lock_flags)
clear_bit(XFS_LI_DIRTY, &lip->li_flags);
list_del_init(&lip->li_trans);
So we know that while we do the log formatting, the inode is locked with
XFS_ILOCK_EXCL.
The formatted buffer contains all the changes to the log item. This enables us
to relog the item in memory and write it out asynchronously without needing to
relock the object that was modified at the time it gets written into the log.
To achieve this, we also need a one-off xfs_log_vec every time. Only then can
we do the formatting and the pushing to the log separately.
xfs_trans_commit
-> __xfs_trans_commit
-> xfs_log_commit_cil
-> xlog_cil_alloc_shadow_bufs // allocate shadow buffer outside of
cil->xc_ctx_lock to avoid deadlock when memory is low.
more details, please refer to comment of
xlog_cil_alloc_shadow_bufs.
-> down_read(&cil->xc_ctx_lock)
-> xlog_cil_insert_items
-> xlog_cil_insert_format_items
---
list_for_each_entry(lip, &tp->t_items, li_trans) {
...
if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
...
} else {
/* switch to shadow buffer! */
lv = shadow;
lv->lv_item = lip;
...
}
...
lip->li_ops->iop_format(lip, lv);
}
---
Regarding lip->li_lv, there are several cases (see the log space accounting
section at the end); one of them is the lv being taken away by xlog_cil_push:
xlog_cil_push
---
down_write(&cil->xc_ctx_lock);
while (!list_empty(&cil->xc_cil)) {
struct xfs_log_item *item;
item = list_first_entry(&cil->xc_cil,
struct xfs_log_item, li_cil);
list_del_init(&item->li_cil);
if (!ctx->lv_chain)
ctx->lv_chain = item->li_lv;
else
lv->lv_next = item->li_lv;
lv = item->li_lv; // take down the active lv
item->li_lv = NULL;
num_iovecs += lv->lv_niovecs;
}
---
checkpoint of the log.
xlog_cil_push
//down_write cil->xc_ctx_lock:
list_add(&ctx->committing, &cil->xc_committing) //under cil->xc_push_lock
CIL Head
|
V
Log Item <-> log vector 1 -> memory buffer
| -> vector array
V
Log Item <-> log vector 2 -> memory buffer
| -> vector array
V
......
|
V
Log Item <-> log vector N-1 -> memory buffer
| -> vector array
V
Log Item <-> log vector N -> memory buffer
-> vector array
- take the item down from the cil->xc_cil list.
- take the active lv down from the item->li_lv.
And after the flush the CIL head is empty, and the checkpoint context log
vector list would look like:
Checkpoint Context
|
V
log vector 1 -> memory buffer
| -> vector array
| -> Log Item
V
log vector 2 -> memory buffer
| -> vector array
| -> Log Item
V
......
|
V
log vector N-1 -> memory buffer
| -> vector array
| -> Log Item
V
log vector N -> memory buffer
-> vector array
-> Log Item
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
spin_lock(&cil->xc_push_lock);
cil->xc_current_sequence = new_ctx->sequence;
spin_unlock(&cil->xc_push_lock);
//up_write cil->xc_ctx_lock
xlog_write
// write log vectors into log
// There could be multiple contexts running concurrently here.
// IOW, the checkpoints could be written out of order.
// But the commit records must be strictly in order.
spin_lock(&cil->xc_push_lock);
list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
/*
* Higher sequences will wait for this one so skip them.
* Don't wait for our own sequence, either.
*/
if (new_ctx->sequence >= ctx->sequence)
continue;
if (!new_ctx->commit_lsn) {
xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
goto restart;
}
}
spin_unlock(&cil->xc_push_lock);
// commit the record
xfs_log_done
-> xlog_commit_record
-> xlog_write // XLOG_REG_TYPE_COMMIT
// attach checkpoint context to commit record log buffer
ctx->log_cb.cb_func = xlog_cil_committed;
ctx->log_cb.cb_arg = ctx;
error = xfs_log_notify(commit_iclog, &ctx->log_cb);
/*
* now the checkpoint commit is complete and we've attached the
* callbacks to the iclog we can assign the commit LSN to the context
* and wake up anyone who is waiting for the commit to complete.
*/
spin_lock(&cil->xc_push_lock);
ctx->commit_lsn = commit_lsn;
wake_up_all(&cil->xc_commit_wait);
spin_unlock(&cil->xc_push_lock);
// The commit maybe pushed to disk here.
return xfs_log_release_iclog(log->l_mp, commit_iclog);
How do we ensure the log lvs are on disk when the commit record is written?
IOW, the log IOs may be in a different iclog from the commit record:
- the iclog holding the log lvs may be synced to disk after the one holding
  the commit record.
- the IOs could be reordered in the block layer or even in the storage device.
Don't we need to consider this reordering?
There are two points here:
1. the log callback mechanism ensures that when xlog_cil_committed is invoked,
   the previous log IOs have been completed (callbacks of different iclogs
   must be invoked in order of lsn).
2. do we really need to ensure the log IOs are on disk when the commit record
   is written? Every iclog has a crc calculated by xlog_cksum, which lets
   recovery recognize a corrupted log record.
The iclog is maintained as a ring in log->l_iclog.
---
/ \
log->l_iclog-> | |
\ /
---
Getting available iclog space
We try to get an available iclog from the log->l_iclog ring.
If it is not an active one, all in-core log buffers are in use and we have
to wait.
See xlog_state_get_iclog_space
---
// under log->l_icloglock
iclog = log->l_iclog;
if (iclog->ic_state != XLOG_STATE_ACTIVE) {
XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
/* Wait for log writes to have flushed */
xlog_wait(&log->l_flush_wait, &log->l_icloglock);
goto restart;
}
---
Note: xlog_state_get_iclog_space not only returns an available iclog, it also reserves space in it.
---
// under log->l_icloglock
// Has confirmed that this iclog has at least 2 * sizeof (xlog_op_header_t) available space.
if (len <= iclog->ic_size - iclog->ic_offset) {
// if the available space in this iclog is enough to carry, reserve the space.
*continued_write = 0;
iclog->ic_offset += len; // reserve our requested space
} else {
// otherwise, invoke xlog_state_switch_iclogs to switch this iclog from ACTIVE to WANT_SYNC.
// then, noone could get it again. We have hold a reference of it, so it will not be resynced.
*continued_write = 1;
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
}
---
when continued_write is true, the iclog->ic_offset will be modified by
xlog_write_copy_finish
-> xlog_state_finish_copy
then xlog_state_release_iclog will be invoked and iclog will be synced.
So we know, one iclog could be used by multiple users.
iclog block number
The iclogs' bps are allocated in xlog_alloc_log via xfs_buf_get_uncached,
so the buffer has no block number for IO. Where does it come from?
xlog_state_get_iclog_space
---
atomic_inc(&iclog->ic_refcnt); /* prevents sync */
log_offset = iclog->ic_offset;
// ic_offset is the current number of bytes written into this iclog.
// if ic_offset is zero, this is the first write to this iclog.
if (log_offset == 0) {
ticket->t_curr_res -= log->l_iclog_hsize;
xlog_tic_add_region(ticket,
log->l_iclog_hsize,
XLOG_REG_TYPE_LRHEADER);
head->h_cycle = cpu_to_be32(log->l_curr_cycle);
head->h_lsn = cpu_to_be64(
xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
ASSERT(log->l_curr_block >= 0);
}
---
l_curr_block is advanced in xlog_state_switch_iclogs
---
log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
...
if (log->l_curr_block >= log->l_logBBsize) {
log->l_curr_block -= log->l_logBBsize;
ASSERT(log->l_curr_block >= 0);
smp_wmb();
log->l_curr_cycle++;
if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
log->l_curr_cycle++;
}
---
Finally, the block number will be filled into the bp in xlog_sync with XFS_BUF_SET_ADDR
Syncing an iclog
Every time we want to flush out an in-core log buffer, its state is switched
to WANT_SYNC and the iclog ring is advanced.
xlog_state_switch_iclogs
---
log->l_iclog = iclog->ic_next;
---
When do we want to sync an iclog?
An iclog is synced to disk when its reference count drops to zero.
The reference is released by xlog_state_release_iclog
xlog_state_get_iclog_space
-> xlog_state_switch_iclogs
xlog_write
-> xlog_write_copy_finish
-> xlog_state_want_sync
-> xlog_state_switch_iclogs
xfs_log_force
---
if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
return 0;
...
if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
/* update tail before writing to iclog */
xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
sync++;
iclog->ic_state = XLOG_STATE_SYNCING;
iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
xlog_verify_tail_lsn(log, iclog, tail_lsn);
/* cycle incremented when incrementing curr_block */
}
spin_unlock(&log->l_icloglock);
...
if (sync)
return xlog_sync(log, iclog);
---
One question:
Do we really need to hold the (global) log->l_icloglock for every iclog?
Submit the iclog to disk
xlog_state_release_iclog
-> xlog_sync
-> bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
-> bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
-> xlog_bdstrat
-> xfs_buf_lock(bp)
-> xfs_buf_submit
completion path
xfs_buf_bio_end_io
-> xfs_buf_ioend_async
-> queue work xfs_buf_ioend_work
xfs_buf_ioend_work
-> xfs_buf_ioend
-> bp->b_iodone
xlog_iodone
-> xlog_state_done_syncing
-> set iclog->ic_state to XLOG_STATE_DONE_SYNC //under log->l_icloglock (a global one ?)
-> xlog_state_do_callback
-> xfs_buf_unlock
Doing the callbacks
The callbacks must be performed in order of lsn.
"Callbacks" here includes not only invoking the attached callbacks but also
xlog_state_clean_log, which resets:
- iclog->ic_state = XLOG_STATE_ACTIVE
- iclog->ic_offset = 0
- ic_header.h_lsn = 0
The following code segment ensures the order:
xlog_state_do_callback
---
if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
...
lowest_lsn = xlog_get_lowest_lsn(log);
if (lowest_lsn &&
XFS_LSN_CMP(lowest_lsn,
be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
iclog = iclog->ic_next;
continue;
}
---
iclog ring:
After the commit record IO is completed, the log items associated with the lvs
in the ctx will be added to the AIL; at the same time, the log items are unpinned.
Where does the lsn come from ?
A xfs_cil_ctx contains following information:
The log reservation steps:
Log Items
What is the deferred operation mechanism for ?
The basic principle is:
What is relogging ?
Delayed log on CIL
cil->xc_push_seq //The seq need to be pushed.
In one transaction, there can only be one modification on one log item.
The XFS filesystem tracks free space in an allocation group using two B+trees.
One B+tree tracks space by block number, the second by the size of the free
space extent. This scheme allows XFS to quickly find free space near a given
block or of a given size.
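A minimal standalone sketch of that dual-index idea: the same (startblock,
blockcount) record, mirroring struct xfs_alloc_rec, is kept once ordered by
start block and once ordered by length, so "near this block" and "at least
this size" lookups are both cheap. Sorted arrays stand in for the bnobt and
cntbt here; this is not the kernel code.
---
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct alloc_rec {			/* cf. struct xfs_alloc_rec */
	uint32_t	ar_startblock;	/* AG-relative start block */
	uint32_t	ar_blockcount;	/* length of the free extent */
};

static int by_bno(const void *a, const void *b)
{
	return (int)((const struct alloc_rec *)a)->ar_startblock -
	       (int)((const struct alloc_rec *)b)->ar_startblock;
}

static int by_cnt(const void *a, const void *b)
{
	return (int)((const struct alloc_rec *)a)->ar_blockcount -
	       (int)((const struct alloc_rec *)b)->ar_blockcount;
}

int main(void)
{
	struct alloc_rec free_space[] = {
		{ 100, 8 }, { 300, 64 }, { 50, 2 }, { 900, 16 },
	};
	struct alloc_rec bnobt[4], cntbt[4];
	int n = 4;

	/* the same records, indexed two ways (sorted arrays stand in for B+trees) */
	for (int i = 0; i < n; i++)
		bnobt[i] = cntbt[i] = free_space[i];
	qsort(bnobt, n, sizeof(*bnobt), by_bno);	/* "bnobt": by start block */
	qsort(cntbt, n, sizeof(*cntbt), by_cnt);	/* "cntbt": by extent size */

	printf("nearest to block 120: starts at %u\n", bnobt[1].ar_startblock);
	printf("smallest extent >= 16 blocks: %u blocks\n", cntbt[2].ar_blockcount);
	return 0;
}
---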
Allocate a variable extent anywhere in the allocation group agno.
Update the bnobt and cntbt, remove the allocated record from the two trees.
The AG Free List is located in the 4th sector of each AG and is known as the AGFL. It is an array of AG relative
block pointers for reserved space for growing the free space B+trees.
Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
have been freed but whose transactions aren't committed to disk yet.
Large Number of files
The policy to select AG
xfs_btree_ops
xfs_btree_lookup will iterate the btree and fill the entries following:
To support the sharing of file data blocks (reflink), there are two new components,
Write on the reflinked range page cache
Let's look at the following scenario,
When writing the data in a range of shared blocks out to disk, we need to do
the COW; but what if a power failure occurs during that?
Darrick Wong points out that
The preallocated extents are first set to the UNWRITTEN state.
The biggest total iclog size is 8 * 256K == 2M
xlog_get_iclog_buffer_size
---
if (mp->m_logbufs <= 0)
log->l_iclog_bufs = XLOG_MAX_ICLOGS;
else
log->l_iclog_bufs = mp->m_logbufs;
---
#define XLOG_MAX_ICLOGS 8
xlog_get_iclog_buffer_size
---
if (mp->m_logbsize > 0) {
size = log->l_iclog_size = mp->m_logbsize;
log->l_iclog_size_log = 0;
...
}
/* All machines use 32kB buffers by default. */
log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
---
But this is not the real physical log space.
The real physical log space is:
The max and min size of the log space is
Both of them are from:
xfs_mountfs
---
error = xfs_log_mount(mp, mp->m_logdev_targp,
XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
---
XFS_MAX_LOG_BLOCKS blocks (1024 * 1024)
or XFS_MAX_LOG_BYTES (2 * 1024 * 1024 * 1024)
XFS_MIN_LOG_BLOCKS blocks (512)
XFS_MIN_LOG_BYTES (10 * 1024 * 1024)
iclog ring:
<--.
--- \
O / \
log->l_iclog-> | |
\ N /
\ ---
rotate '-->
rotate: it is done by xlog_state_switch_iclogs
O : older
N : newer
Where to allocate the iclog space ?
xlog_write
-> xlog_state_get_iclog_space
-> get an XLOG_STATE_ACTIVE iclog and reserve space in it.
---
//under log->l_icloglock
iclog->ic_offset += len;
---
if the log->l_iclog is not ACTIVE, wait for it.
-> xlog_wait on log->l_flush_wait
Where to free the iclog space ?
xlog_iodone
-> xlog_state_done_syncing
-> xlog_state_do_callback
-> xlog_state_clean_log
-> wake_up_all(&log->l_flush_wait);
In conclusion, at any moment there can be at most 2M of log IO in flight.
.-->
/ ---
/ \
| iclog |
\ /
--- /
<--'
|------------------------------------------------|
physical log space
The iclog ring will roll forward in xlog_state_switch_iclogs.
A key aspect of the XFS transaction subsystem is that most transactions are asynchronous. That is,
they don't commit to disk until either a log buffer is filled (a log buffer can
hold multiple transactions) or a synchronous operation forces the log buffers
holding the transactions to disk. This means that XFS is doing aggregation of
transactions in memory - batching them, if you like - to minimise the impact
of the log IO on transaction throughput.
The limitation on asynchronous transaction throughput is the number and size of
log buffers made available by the log manager. By default there are 8 log
buffers available and the size of each is 32kB - the size can be increased up
to 256kB by use of a mount option.
Effectively, this gives us the maximum bound of outstanding metadata changes
that can be made to the filesystem at any point in time - if all the log
buffers are full and under IO, then no more transactions can be committed until
the current batch completes. It is now common for a single current CPU core to
be able to issue enough transactions to keep the log buffers full and under
IO permanently. Hence the XFS journalling subsystem can be considered to be IO
bound.
[Life Cycle of an iclog]
XLOG_STATE_WANT_SYNC
xlog_state_switch_iclogs
---
iclog->ic_state = XLOG_STATE_WANT_SYNC;
...
/* roll log?: ic_offset changed later */
log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
...
if (log->l_curr_block >= log->l_logBBsize) {
log->l_curr_block -= log->l_logBBsize;
smp_wmb();
log->l_curr_cycle++;
}
log->l_iclog = iclog->ic_next;
---
XLOG_STATE_SYNCING
xlog_state_release_iclog
-> if XLOG_STATE_WANT_SYNC
set SYNCING
-> xlog_sync
XLOG_STATE_DONE_SYNC
xlog_state_done_syncing
XLOG_STATE_DO_CALLBACK
xlog_state_do_callback
XLOG_STATE_CALLBACK
xlog_state_do_callback
//has been the lowest lsn and ready to do callback.
XLOG_STATE_DIRTY
xlog_state_do_callback
//after complete the callbacks
XLOG_STATE_ACTIVE
xlog_state_do_callback
-> xlog_state_clean_log
iclog format
Every iclog has its header.
[1][2] [3]
|--Header--|----Payload---|
[1] iclog->ic_data = bp->b_addr
[2] ic_data is xlog_in_core_2_t
typedef union xlog_in_core2 {
xlog_rec_header_t hic_header;
xlog_rec_ext_header_t hic_xheader;
char hic_sector[XLOG_HEADER_SIZE];
} xlog_in_core_2_t;
#define ic_header ic_data->hic_header
[3] iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
(xlog_alloc_log)
The first u32 of each log sector must contain the cycle number.
Since log item buffers are formatted without regard to this requirement,
the original contents of the first four bytes of each sector in the log
are copied into the corresponding element of this array. After that, the
first four bytes of those sectors are stamped with the cycle number. This
process is reversed at recovery time. If there are more sectors in this
log record than there are slots in this array, the cycle data continues
for as many sectors are needed; each sector is formatted as type xlog_rec_ext_header.
xlog_sync
-> xlog_pack_data
---
cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
dp = iclog->ic_datap;
for (i = 0; i < BTOBB(size); i++) {
if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
break;
iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
*(__be32 *)dp = cycle_lsn;
dp += BBSIZE;
}
---
The cycle number starts at 1 (xlog_alloc_log: log->l_curr_cycle = 1).
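A minimal standalone sketch of the cycle stamping described above and its
reversal at recovery time (cf. xlog_pack_data / xlog_unpack_data); not the
kernel code, and saved[] plays the role of h_cycle_data[] in the record header.
---
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define BBSIZE		512	/* basic block (sector) size */
#define NSECTORS	4

/* save the first u32 of every sector, then stamp the cycle number over it */
static void pack(uint8_t *payload, uint32_t saved[], uint32_t cycle)
{
	for (int i = 0; i < NSECTORS; i++) {
		uint8_t *sec = payload + i * BBSIZE;

		memcpy(&saved[i], sec, sizeof(uint32_t));
		memcpy(sec, &cycle, sizeof(uint32_t));
	}
}

/* recovery reverses it: restore the saved words */
static void unpack(uint8_t *payload, const uint32_t saved[])
{
	for (int i = 0; i < NSECTORS; i++)
		memcpy(payload + i * BBSIZE, &saved[i], sizeof(uint32_t));
}

int main(void)
{
	static uint8_t payload[NSECTORS * BBSIZE] = { 0xaa, 0xbb, 0xcc, 0xdd };
	uint32_t saved[NSECTORS];

	pack(payload, saved, 200);
	unpack(payload, saved);
	printf("first byte restored: 0x%x\n", payload[0]);	/* 0xaa */
	return 0;
}
---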
tail head
(c 200) (c 200)
| |
v v
|-----------------------------|
\____ ____/
v
non-checkpointed log
non-checkpointed: the metadata associated with the log has not been flushed to
disk
We should try to do recovery from the tail.
This tail is recorded in the iclog header, xlog_rec_header_t.h_tail_lsn.
xlog_state_release_iclog
---
if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
/* update tail before writing to iclog */
xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
sync++;
iclog->ic_state = XLOG_STATE_SYNCING;
iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
xlog_verify_tail_lsn(log, iclog, tail_lsn);
/* cycle incremented when incrementing curr_block */
}
---
This is a perfect position to update the tail lsn of the log.
It ensures the tail of the log is up to date before we overwrite checkpointed
areas of the log (the h_tail_lsn is in the iclog that is about to be flushed
to the log).
Unlike jbd2, which has a journal superblock, XFS records the tail of the log
in every LR. This avoids some extra IO outside the LR itself.
But then, how do we find the tail when doing recovery?
Even if we search the log for the xlog_rec_header_t magic 0xFEEDbabe, there
could be multiple xlog_rec_header_t structures found, and they may have
different tail lsns. What we need is the last one.
How do we find it?
Recall that a cycle number is recorded at the head of every sector.
End
|
v
|200|200|200|200|200|199|199|199|199|
End
|
v
|200|200|200|200|200|200|200|200|200|
What we do is to search backward from the End to find out the xlog_rec_header_t's
magic.
xlog_find_tail
-> xlog_find_head
-> xlog_rseek_logrec_hdr
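A minimal standalone sketch of the idea behind the head search: find the sector
where the stamped cycle number drops, which is where the log wrapped. The
kernel's xlog_find_head uses a binary search and handles more corner cases;
this linear scan is only an illustration.
---
#include <stdint.h>
#include <stdio.h>

/*
 * Return the index of the first sector whose cycle number is lower than its
 * predecessor's; that is where the head wrapped.  Returns -1 if the cycle
 * number never drops (the head must then be found from the record headers).
 */
static int find_cycle_drop(const uint32_t *cycle, int nsectors)
{
	for (int i = 1; i < nsectors; i++)
		if (cycle[i] < cycle[i - 1])
			return i;
	return -1;
}

int main(void)
{
	uint32_t cycles[] = { 200, 200, 200, 200, 200, 199, 199, 199, 199 };

	printf("head candidate at sector %d\n",
	       find_cycle_drop(cycles, (int)(sizeof(cycles) / sizeof(cycles[0]))));
	return 0;
}
---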
flush to disk
xlog_cil_committed (completion callback of commit record)
-> xfs_trans_committed_bulk //ctx->lv_chain (xlog_cil_push put the lvs on it)
-> iterate the lvs
-> lip->li_ops->iop_committed()
-> xfs_log_item_batch_insert
-> insert the log items onto xfs_ail->ail_head sorted by lsn.
-> lip->li_ops->iop_unpin()
Why do we need this pin/unpin here?
A log item cannot be flushed to disk before its log IO has completed;
the pin/unpin pair achieves this.
The log item is pinned in
xlog_cil_insert_format_items
-> lip->li_ops->iop_format()
-> xfs_cil_prepare_item
-> lv->lv_item->li_ops->iop_pin()
This is done under item lock.
For inode, it is xfs_ilock, for example:
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
Then unlocked by xfs_inode_item_unlock
In addition, a log item can be relogged while it is on the CIL or even the AIL.
It will not be pushed to disk until its pin count drops to zero.
Once a log item is on the AIL (and unpinned), it can be pushed to disk.
xfsaild
-> xfsaild_push
-> xfsaild_push_item
-> lip->li_ops->iop_push
xfs_inode_item_push
-> if xfs_ipincount(ip) > 0 //it could be reloged.
return XFS_ITEM_PINNED
-> xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)
-> xfs_iflush
-> xfs_imap_to_bp // get the buffer containing the on-disk inode.
-> xfs_iflush_int // flush the dirty part of inode into on-disk inode.
// the on-disk inode is on
// the xfs_buf of the inode
-> xfs_buf_attach_iodone //xfs_iflush_done
-> xfs_buf_delwri_queue
insert the bp into xfs_ail->ail_buf_list
-> xfs_buf_delwri_submit_nowait
-> xfs_buf_delwri_submit_buffers
---
list_sort(NULL, buffer_list, xfs_buf_cmp);
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
if (!wait_list) {
if (xfs_buf_ispinned(bp)) {
pinned++;
continue;
}
if (!xfs_buf_trylock(bp))
continue;
} else {
xfs_buf_lock(bp);
}
...
bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
bp->b_flags |= XBF_WRITE | XBF_ASYNC;
if (wait_list) {
xfs_buf_hold(bp);
list_move_tail(&bp->b_list, wait_list);
} else
list_del_init(&bp->b_list);
xfs_buf_submit(bp);
}
blk_finish_plug(&plug);
---
When the IO is completed:
xfs_buf_bio_end_io
-> xfs_buf_ioend_async
-> queue work xfs_buf_ioend_work
xfs_buf_ioend_work
-> xfs_buf_ioend
-> bp->b_iodone
xfs_iflush_done // attached by xfs_iflush_int
---
list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) {
if (lip->li_cb != xfs_iflush_done)
continue;
list_move_tail(&blip->li_bio_list, &tmp);
...
}
...
list_for_each_entry_safe(blip, n, &tmp, li_bio_list) {
list_del_init(&blip->li_bio_list);
iip = INODE_ITEM(blip);
iip->ili_logged = 0;
iip->ili_last_fields = 0;
xfs_ifunlock(iip->ili_inode);
}
---
LSN
xlog_state_get_iclog_space
---
// Under log->l_icloglock
// On the 1st write to an iclog, figure out lsn.
if (log_offset == 0) {
ticket->t_curr_res -= log->l_iclog_hsize;
xlog_tic_add_region(ticket,
log->l_iclog_hsize,
XLOG_REG_TYPE_LRHEADER);
head->h_cycle = cpu_to_be32(log->l_curr_cycle);
head->h_lsn = cpu_to_be64(
xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
}
---
log->l_curr_block and log->l_curr_cycle
xlog_state_switch_iclogs //Under log->l_icloglock
---
log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
...
if (log->l_curr_block >= log->l_logBBsize) {
log->l_curr_block -= log->l_logBBsize;
ASSERT(log->l_curr_block >= 0);
smp_wmb();
log->l_curr_cycle++;
if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
log->l_curr_cycle++;
}
---
So we know the lsn is (cycle << 32 | block )
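A minimal standalone sketch of that packing (cf. xlog_assign_lsn and the
CYCLE_LSN/BLOCK_LSN macros); not the kernel macros themselves.
---
#include <stdint.h>
#include <stdio.h>

static uint64_t assign_lsn(uint32_t cycle, uint32_t block)
{
	return ((uint64_t)cycle << 32) | block;
}

static uint32_t lsn_cycle(uint64_t lsn) { return lsn >> 32; }
static uint32_t lsn_block(uint64_t lsn) { return lsn & 0xffffffffu; }

int main(void)
{
	uint64_t lsn = assign_lsn(200, 4096);

	/* comparing LSNs compares (cycle, block) lexicographically */
	printf("cycle=%u block=%u newer=%d\n", lsn_cycle(lsn), lsn_block(lsn),
	       assign_lsn(201, 8) > lsn);
	return 0;
}
---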
Log space and cycle
current
|
v
|------------------------------| cycle 200
    current                       original
 - - ->|                             |- - - - - - - ->
       v                             v
 |------------------------------|   cycle 201
order of AIL
The ctx->start_lsn here will be used to insert the log items to AIL.
xlog_cil_push
-> xlog_write
-> xlog_state_get_iclog_space
-> if (!*start_lsn)
*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
xlog_cil_committed
-> xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
ctx->start_lsn, abort)
-> xfs_trans_ail_update_bulk
---
for (i = 0; i < nr_items; i++) {
struct xfs_log_item *lip = log_items[i];
// if the log item has been on AIL, we may need to reposition it.
// this is for the relogging case.
if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
/* check if we really need to move the item */
if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
continue;
xfs_ail_delete(ailp, lip);
if (mlip == lip)
mlip_changed = 1;
}
// a log item's lsn is set here.
lip->li_lsn = lsn;
list_add(&lip->li_ail, &tmp);
}
// Queue on the AIL list
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
// If the minimum lsn of the AIL list is changed, it indicates the log
// space is moved forward, try to wake up the waiters.
if (mlip_changed) {
...
xfs_log_space_wake(ailp->ail_mount);
}
---
Question:
The AIL is sorted by lsn. What about the order of pushing?
Look at the two helper interfaces in xfsaild_push:
- xfs_trans_ail_cursor_first
- xfs_trans_ail_cursor_next
And also the log->ail_last_pushed_lsn
Basically, the entries in the AIL are pushed in order of lsn.
The lsns should be monotonically increasing, but the pushing of log items on
the AIL may still appear disordered.
We know that the commit record callback, xlog_cil_committed, is invoked in
order of lsn. But note that this is the lsn of the commit record, not of the
log lvs. The log lvs and the commit record may have different lsns, and other
items may be inserted between them.
Why do we need the ail_last_pushed_lsn ?
AIL list is sorted by lsn which combines log space block and cycle. It is not
the block number of the real IO under the log item....So it is meaningless to
push them continously.
log space
left log space
Log space
tail head
(c 200) (c 200)
| |
v v
|-----------------------------|
\____ ____/
v
used
head tail
(c 201) (c 200)
| |
v v
|-----------------------------|
\______ ______/ \__ __/
v v
used used
Both the head and the tail are pushed forward.
- pushing the Tail means freeing log space
- pushing the Head means allocating log space
The log->l_tail_lsn contains the Tail of the log space.
It is essentially the minimum lsn on the AIL list.
Benefiting from the sorting of the AIL, we can get the minimum lsn
easily through xfs_ail_min.
Here is one of the hooks that modify log->l_tail_lsn.
xlog_state_release_iclog
-> xlog_assign_tail_lsn // iclog->ic_state == XLOG_STATE_WANT_SYNC
-> xlog_assign_tail_lsn_locked
---
lip = xfs_ail_min(mp->m_ail);
if (lip)
tail_lsn = lip->li_lsn;
else
tail_lsn = atomic64_read(&log->l_last_sync_lsn); // set by xlog_state_do_callback
atomic64_set(&log->l_tail_lsn, tail_lsn);
---
Note:
The tail is decided by log->l_last_sync_lsn or the AIL list.
What does this mean ?
log->l_last_sync_lsn is updated by xlog_state_do_callback
xlog_iodone
-> xlog_state_done_syncing
-> xlog_state_do_callback
It indicates that the IO on the iclog has completed, so it is reasonable to
push the Tail forward.
Regarding the AIL list, it means the log space can only be released once the
metadata has been flushed to disk.
log->l_reserve_head.grant is the Head
xfs_log_reserve
-> xlog_grant_head_check
-> xlog_grant_head_wait // xlog_space_left(log, &head->grant) < need_bytes)
Big defect here:
The log space may be changed between xlog_grant_head_check and
xlog_grant_add_space.
-> xlog_grant_add_space
---
int64_t head_val = atomic64_read(head);
int64_t new, old;
do {
int tmp;
int cycle, space;
xlog_crack_grant_head_val(head_val, &cycle, &space);
tmp = log->l_logsize - space;
if (tmp > bytes)
space += bytes;
else {
space = bytes - tmp;
cycle++;
}
old = head_val;
new = xlog_assign_grant_head_val(cycle, space);
head_val = atomic64_cmpxchg(head, old, new);
} while (head_val != old);
}
---
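The cycle/space packing and the lockless update above can be imitated in user space with C11 atomics. This is only a sketch of the wraparound arithmetic; LOG_SIZE and every name here are assumptions of this note, not kernel definitions.
---
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define LOG_SIZE  (32 * 1024 * 1024)	/* assumed log size for the example */

/* Pack (cycle, space-in-bytes) into one 64-bit word, like a grant head. */
static uint64_t make_head(int cycle, int space)
{
	return ((uint64_t)cycle << 32) | (uint32_t)space;
}

static void crack_head(uint64_t v, int *cycle, int *space)
{
	*cycle = (int)(v >> 32);
	*space = (int)(uint32_t)v;
}

/* Add bytes to the head; wrap into the next cycle when the end is passed. */
static void grant_add_space(_Atomic uint64_t *head, int bytes)
{
	uint64_t old = atomic_load(head);
	uint64_t new;

	do {
		int cycle, space, left;

		crack_head(old, &cycle, &space);
		left = LOG_SIZE - space;
		if (left > bytes) {
			space += bytes;
		} else {
			space = bytes - left;
			cycle++;
		}
		new = make_head(cycle, space);
		/* on failure, 'old' is reloaded and we retry */
	} while (!atomic_compare_exchange_weak(head, &old, new));
}

int main(void)
{
	_Atomic uint64_t head = make_head(200, LOG_SIZE - 100);
	int cycle, space;

	grant_add_space(&head, 4096);		/* crosses the end of the log */
	crack_head(atomic_load(&head), &cycle, &space);
	printf("cycle=%d space=%d\n", cycle, space);
	return 0;
}
---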
log space accounting
Log format
Log space:
Actual log space accounting.
diff_len and diff_iovecs from xlog_cil_insert_format_items
diff_len returns the actual log space consumed by the lvs in the transaction
A log item's lv (lip->li_lv) can be in 4 states:
[a] initial state
[b] lv has been taken away by xlog_cil_push
[c] lv is there, and its size is enough to carry the new logs
[d] lv is there, and its size is not enough to carry the new logs
For cases a and b,
diff_len = lv->lv_bytes
For cases c and d,
diff_len = lv->lv_bytes - old_lv->lv_bytes
The original log is overwritten, so we can reuse the previously reserved
log space.
diff_iovecs from xlog_cil_insert_format_items is used to account for the size of
the xlog_op_header_t's (one xlog_op_header_t per iovec)
---
xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
spin_lock(&cil->xc_cil_lock);
/* account for space used by new iovec headers */
iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
len += iovhdr_res;
---
---
/* do we need space for more log record headers? */
iclog_space = log->l_iclog_size - log->l_iclog_hsize;
if (len > 0 && (ctx->space_used / iclog_space !=
(ctx->space_used + len) / iclog_space)) {
split_res = (len + iclog_space - 1) / iclog_space;
/* need to take into account split region headers, too */
// XLOG_CONTINUE_TRANS ?
split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
ctx->ticket->t_unit_res += split_res;
ctx->ticket->t_curr_res += split_res;
tp->t_ticket->t_curr_res -= split_res;
ASSERT(tp->t_ticket->t_curr_res >= len);
}
---
A ctx contains a number of lvs; these lvs will be written to the log in one go,
and all lvs in the same ctx share the same commit record.
So we can see that ctx->space_used is used here to calculate whether additional
log record headers are needed.
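A small sketch of that calculation: given the space already used in the ctx and the bytes about to be added, it estimates the extra record-header reservation the same way the snippet above does. The iclog geometry and the op-header size are made-up example values, not read from a real log.
---
#include <stdio.h>

/* Assumed example geometry: 32 KiB iclogs with a 512-byte record header. */
#define ICLOG_SIZE   (32 * 1024)
#define ICLOG_HSIZE  512
#define OP_HDR_SIZE  12		/* stand-in for sizeof(struct xlog_op_header) */

/*
 * If adding 'len' bytes pushes the CIL context across one or more iclog
 * boundaries, reserve space for the extra record headers (plus the op
 * header needed to continue a split region).
 */
static int split_reservation(int space_used, int len)
{
	int iclog_space = ICLOG_SIZE - ICLOG_HSIZE;
	int split_res;

	if (len <= 0 ||
	    space_used / iclog_space == (space_used + len) / iclog_space)
		return 0;

	split_res = (len + iclog_space - 1) / iclog_space;
	return split_res * (ICLOG_HSIZE + OP_HDR_SIZE);
}

int main(void)
{
	/* 30000 bytes already used, 8000 more: crosses one iclog boundary. */
	printf("extra reservation: %d bytes\n", split_reservation(30000, 8000));
	return 0;
}
---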
The actual accounting is done in xlog_cil_push.
ctx->ticket will be used to account this.
It is allocated at:
xlog_cil_ticket_alloc
-> xlog_ticket_alloc // unit_bytes is zero, xfs_log_calc_unit_res will return the log space needed for common part
-> xfs_log_calc_unit_res
-> tic->t_curr_res = 0
It will first steal the reservation from the first xfs_trans inserted into this
ctx.
xlog_cil_insert_items
---
if (ctx->ticket->t_curr_res == 0) {
ctx_res = ctx->ticket->t_unit_res;
ctx->ticket->t_curr_res = ctx_res;
tp->t_ticket->t_curr_res -= ctx_res;
}
---
Then the actual log space is accounted in xlog_cil_push.
The remaining part will be released by xfs_log_done.
log space reserve
[1] reserve the maximum log space needed
xfs_trans_reserve
-> xfs_log_reserve
-> xlog_ticket_alloc
-> xfs_log_calc_unit_res // calculate the maximum log space needed
-> xlog_grant_head_check
-> xlog_ticket_reservation
-> return tic->t_unit_res * tic->t_cnt
-> xlog_grant_add_space l_reserve_head
-> xlog_grant_add_space l_write_head
[2] account the actual log space used
[3] release the redundant log space with xfs_log_done.
There are two paths to do this:
[a] When roll trans
xlog_regrant_reserve_log_space
---
if (ticket->t_cnt > 0)
ticket->t_cnt--;
xlog_grant_sub_space(log, &log->l_reserve_head.grant,
ticket->t_curr_res);
xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
---
Just release tic->t_curr_res
[b] When commit trans
xlog_ungrant_log_space
---
if (ticket->t_cnt > 0)
ticket->t_cnt--;
bytes = ticket->t_curr_res;
if (ticket->t_cnt > 0) {
ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
bytes += ticket->t_unit_res*ticket->t_cnt;
}
xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
---
Not only is tic->t_curr_res released, but also the remaining permanent
reservation tic->t_unit_res * tic->t_cnt
Let's look at some examples:
bash-1440 [005] .... 101.856925: xlog_ticket_alloc: input 254080 calc 267016 cnt 2
bash-1440 [005] .... 101.857868: xlog_ungrant_log_space: curr 256644 unit 267016 cnt 1
bash-1440 [005] .... 101.858341: xlog_ticket_alloc: input 326016 calc 340524 cnt 8
bash-1440 [005] .... 101.858373: xlog_regrant_reserve_log_space: curr 340524 unit 340524 cnt 7
bash-1440 [005] .... 101.858381: xlog_ungrant_log_space: curr 340524 unit 340524 cnt 6
kworker/4:1-186 [004] .... 105.789587: xlog_ticket_alloc: input 0 calc 9268 cnt 1
kworker/4:1-186 [004] ...1 105.789720: xlog_verify_iclog_lsn.isra.20: iclog b 120 c 1 tail b 120 c 1
kworker/4:1-186 [004] .... 105.789730: xlog_ungrant_log_space: curr 8704 unit 9268 cnt 0
cp-1546 [003] .... 110.256017: xlog_ticket_alloc: input 254080 calc 267016 cnt 2
cp-1546 [003] .... 110.256160: xlog_ungrant_log_space: curr 256624 unit 267016 cnt 1
cp-1546 [003] .... 110.256640: xlog_ticket_alloc: input 178936 calc 190824 cnt 8
cp-1546 [003] .... 110.257872: xlog_ungrant_log_space: curr 190012 unit 190824 cnt 7
cp-1546 [003] .... 110.258000: xlog_ticket_alloc: input 760 calc 10028 cnt 0
kworker/6:1-68 [006] .... 110.258781: xlog_ungrant_log_space: curr 10028 unit 10028 cnt 0
cp-1546 [003] .... 110.259266: xlog_ticket_alloc: input 5752 calc 15020 cnt 0
cp-1546 [003] .... 110.259303: xlog_ungrant_log_space: curr 15020 unit 15020 cnt 0
cp-1546 [003] .... 110.259326: xlog_ticket_alloc: input 178936 calc 190824 cnt 8
cp-1546 [003] .... 110.259358: xlog_regrant_reserve_log_space: curr 190720 unit 190824 cnt 7
cp-1546 [003] .... 110.259402: xlog_regrant_reserve_log_space: curr 190620 unit 190824 cnt 6
cp-1546 [003] .... 110.259429: xlog_regrant_reserve_log_space: curr 190768 unit 190824 cnt 5
cp-1546 [003] .... 110.259433: xlog_ungrant_log_space: curr 190824 unit 190824 cnt 4
cp-1546 [003] .... 110.259441: xlog_ticket_alloc: input 5752 calc 15020 cnt 0
cp-1546 [003] .... 110.259448: xlog_ungrant_log_space: curr 15020 unit 15020 cnt 0
cp-1546 [003] .... 110.259448: xlog_ungrant_log_space: curr 15020 unit 15020 cnt 0
kworker/4:1-186 [004] .... 136.508683: xlog_ticket_alloc: input 0 calc 9268 cnt 1
kworker/4:1-186 [004] ...1 136.508693: xlog_verify_iclog_lsn.isra.20: iclog b 128 c 1 tail b 120 c 1
kworker/4:1-186 [004] .... 136.508702: xlog_ungrant_log_space: curr 8704 unit 9268 cnt 0
kworker/4:2-370 [004] .... 197.948672: xlog_ticket_alloc: input 4224 calc 13492 cnt 1
kworker/4:2-370 [004] .... 197.948727: xlog_ungrant_log_space: curr 3792 unit 13492 cnt 0
Most of the time, the reserved log space turns out to be more than what is actually used.
intent log
Transaction Headers
Intent to Free an Extent (EFI) \
Completion of Intent to Free an Extent (EFD) |
Reverse Mapping Updates Intent (RUI) |
Completion of Reverse Mapping Updates (RUD) |
Reference Count Updates Intent (CUI) > Intent Log
Completion of Reference Count Updates (CUD) |
File Block Mapping Intent (BUI) |
Completion of File Block Mapping Updates (BUD) /
Inode Updates
Inode Data Log Item
Buffer Log Item
Buffer Data Log Item
Update Quota File
Quota Update Data Log Item
Disable Quota Log Item
Inode Creation Log Item
+--------------+
|xfs defer ops |
+--------------+--------+
| xfs log |
+-----------------------+----------------+
| xfs buf |
-----------------------------------------+
submit_bio
deferred operations
The deferred operations in XFS are a kind of "intent logging" mechanism.
Intent logging means the log records the intent of an operation.
If a failure occurs, then when the system is recovering, it can use the intent log to
detect what operations were still in progress during the failure, and use the intent log
to help recover from the failure, usually by either undoing a partially completed
operation, or by redoing one that might need to be completed.
XFS_DEFER_OPS_TYPE_REFCOUNT
| XFS_DEFER_OPS_TYPE_RMAP,
| |
xfs_trans->t_dfops - dfp0 - dfp1
|
|
dfp_work - ri0 - ri1 - ri2
type = XFS_REFCOUNT_INCREASE
startblock
blockcount
dfp xfs_defer_pending
ri xfs_refcount_intent
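The diagram can be restated as plain data structures. This is a simplified, illustrative sketch (shortened names, fixed-size arrays instead of lists), not the kernel definitions of xfs_defer_pending or xfs_refcount_intent.
---
#include <stdio.h>

/* Simplified, illustrative stand-ins; not the kernel structures. */
enum defer_type { DEFER_REFCOUNT, DEFER_RMAP };
enum refcount_op { REFCOUNT_INCREASE, REFCOUNT_DECREASE };

struct refcount_intent {	/* ri0, ri1, ... : one unit of deferred work */
	enum refcount_op type;
	unsigned long long startblock;
	unsigned int blockcount;
};

struct defer_pending {		/* dfp0, dfp1, ... : one class of deferred work */
	enum defer_type type;
	struct refcount_intent work[4];	/* dfp_work */
	int count;			/* dfp_count */
};

struct transaction {		/* carries tp->t_dfops */
	struct defer_pending dfops[2];
	int ndfops;
};

int main(void)
{
	struct transaction tp = {
		.dfops = {
			{ .type = DEFER_REFCOUNT,
			  .work = { { REFCOUNT_INCREASE, 12345, 8 },
				    { REFCOUNT_INCREASE, 20000, 4 } },
			  .count = 2 },
		},
		.ndfops = 1,
	};

	for (int i = 0; i < tp.ndfops; i++)
		for (int j = 0; j < tp.dfops[i].count; j++)
			printf("dfp%d/ri%d: start %llu len %u\n", i, j,
			       tp.dfops[i].work[j].startblock,
			       tp.dfops[i].work[j].blockcount);
	return 0;
}
---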
defer operations callbacks
All of the defer operation callbacks are stored in defer_op_types[].
Take the refcount deferred operation as an example.
[1] create_intent
xfs_refcount_update_create_intent
-> xfs_cui_log_item
-> xfs_log_item_init //XFS_LI_CUI xfs_cui_item_ops
-> xfs_trans_add_item
[2] diff_items
[3] log_item
xfs_refcount_update_log_item
---
//Fill up the xfs_phys_extent which will be written to intent log.
ext = &cuip->cui_format.cui_extents[next_extent];
ext->pe_startblock = refc->ri_startblock;
ext->pe_len = refc->ri_blockcount;
xfs_trans_set_refcount_flags(ext, refc->ri_type);
---
[4] create_done
xfs_refcount_update_create_done
-> xfs_trans_get_cud
-> xfs_cud_init
-> xfs_log_item_init//XFS_LI_CUD xfs_cud_item_ops
-> xfs_trans_add_item
[5] finish_item
xfs_refcount_update_finish_item
-> xfs_trans_log_finish_refcount_update
-> xfs_refcount_finish_one
// This is where the real work is done.
[6] cleanup_fn
Code path
xfs_refcount_increase_extent
-> __xfs_refcount_add
-> xfs_defer_add
xfs_defer_finish
-> xfs_defer_finish_noroll
-> xfs_defer_create_intents
//Create intent log for the items linked in this transaction.
---
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
dfp->dfp_count) [1];
list_sort(tp->t_mountp, &dfp->dfp_work,
dfp->dfp_type->diff_items [2]);
list_for_each(li, &dfp->dfp_work)
dfp->dfp_type->log_item(tp, dfp->dfp_intent, li) [3];
}
---
-> xfs_defer_trans_roll //commit the logs to cil
->
---
//Create an intent-done log
dfp->dfp_done = dfp->dfp_type->create_done [4](*tp, dfp->dfp_intent,
dfp->dfp_count);
cleanup_fn = dfp->dfp_type->finish_cleanup;
/* Finish the work items. */
state = NULL;
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
error = dfp->dfp_type->finish_item [5](*tp, li,
dfp->dfp_done, &state);
...
}
if (cleanup_fn)
cleanup_fn [6](*tp, state, error);
---
intent log recover
xfs_mountfs
-> xfs_log_mount
-> xlog_alloc_log
-> xlog_recover
-> xlog_do_recover
-> xlog_do_log_recovery
-> xlog_do_recovery_pass // analyze log record
-> xlog_recover_process // check log record crc
-> xlog_recover_process_data // analyze oph
-> xlog_recover_process_ophdr
-> xlog_recovery_process_trans
-> xlog_recover_commit_trans // when get a complete transaction (ctx in xlog_cil_push)
-> xlog_recover_items_pass2
-> xlog_recover_commit_pass2
xlog_recover_rui_pass2
It allocates an in-core rui, copies the extents from the format
structure into it, and adds the rui to the AIL with the given
LSN.
xlog_recover_rud_pass2
This routine is called when an RUD format structure is found in a committed
transaction in the log. Its purpose is to cancel the corresponding RUI if it
was still in the log. To do this it searches the AIL for the RUI with an id
equal to that in the RUD format structure. If we find it we drop the RUD
reference, which removes the RUI from the AIL and frees it.
Look at xfs_defer_finish_noroll
---
while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
/* log intents and pull in intake items */
xfs_defer_create_intents(*tp);
list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
/*
* Roll the transaction.
*/
error = xfs_defer_trans_roll(tp);
if (error)
goto out;
/* Log an intent-done item for the first pending item. */
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
dfp->dfp_count);
cleanup_fn = dfp->dfp_type->finish_cleanup;
/* Finish the work items. */
state = NULL;
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
error = dfp->dfp_type->finish_item(*tp, li,
dfp->dfp_done, &state);
...
}
...
}
---
The real work behind an intent log is done in dfp->dfp_type->finish_item().
The logs of the real work (inode and buf items) are committed in the same transaction
as the intent-done log. So if the intent-done log is there, the associated logs of the
real work must be there as well, and we need not replay the intent log any more.
xfs_mountfs
-> xfs_log_mount //analyze log here
-> xfs_log_mount_finish
-> xlog_recover_finish
-> xlog_recover_process_intents
---
spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
if (!xlog_item_is_intent(lip)) {
break;
}
switch (lip->li_type) {
case XFS_LI_EFI:
error = xlog_recover_process_efi(log->l_mp, ailp, lip);
break;
case XFS_LI_RUI:
error = xlog_recover_process_rui(log->l_mp, ailp, lip);
break;
case XFS_LI_CUI:
error = xlog_recover_process_cui(parent_tp, ailp, lip);
break;
case XFS_LI_BUI:
error = xlog_recover_process_bui(parent_tp, ailp, lip);
break;
}
if (error)
goto out;
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
out:
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
---
relog
XFS allows multiple separate modifications to a single object to be carried
in the log at any given time. This allows the log to avoid needing to flush
each change to disk before recording a new change to the object.
(format)              lvs on lip; lip is pinned
(xlog_write)          take down the lvs -> write to iclog -> submit to disk   \
                                                                                > log IO is ongoing
(xlog_cil_committed)  commit record is completed -> insert to AIL, unpin      /
(AIL handling)        if not pinned, flush to disk
The relogging here is used to implement long-running, multiple-committing transactions.
The multiple-committing part is obvious here.
But what does long-running mean ?
IMO, it means logging an object repeatedly over a long period of time.
We always need to roll the transaction with xfs_trans_roll to move log items forward
in the log, to avoid being blocked by ourselves when the log wraps around.
xfs_trans_roll commits the previous transaction to the log and allocates a new one
for the next step.
But if we relog repeatedly, the log item will be pinned again and cannot be flushed to
disk; then this log item cannot be removed from the AIL list and its log space cannot
be freed. Would we hang forever waiting for log space ?
Refer to order of AIL
No such issue. When a log item is relogged, it gets a new lsn and is
repositioned on the AIL list based on this new lsn. The original log space will
be released eventually.
Delayed log
We have two points about the delayed log
Delayed metadata writeout on AIL
xfs_log_commit_cil // under read cil->xc_ctx_lock, lock out background commit
-> xlog_cil_insert_items
-> xlog_cil_insert_format_items
---
if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
/* same or smaller, optimise common overwrite case */
...
/* reset the lv buffer information for new formatting */
lv->lv_buf_len = 0;
lv->lv_bytes = 0;
lv->lv_buf = (char *)lv +
xlog_cil_iovec_space(lv->lv_niovecs);
} else {
/* switch to shadow buffer! */
lv = shadow;
lv->lv_item = lip;
}
// format log on the previous lv, so the previous log will be over-written.
lip->li_ops->iop_format(lip, lv);
---
Does the new log contain the previous log ? (The answer should be YES)
Let's take buf log item as example.
The buf will be logged based on a dirty data bitmap.
xfs_buf_item_format
-> xfs_buf_item_format_segment
This dirty data bitmap is set by
xfs_trans_log_buf
-> xfs_buf_item_log
-> xfs_buf_item_log_segment
When we dirty a buf and format the log again, the new formatted log contains the
previous one, because nobody clears the dirty data bitmap.
Note:
The buf log item's blf_data_map will not be cleared during the whole life time
of the buf log item. The buf log item will be released by xfs_buf_iodone. When a
buf log item is reallocated, the blf_data_map is clean.
Delayed logging can accumulate multiple modifications to the same object and reduce the
final IO to disk. That's the key.
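A tiny sketch of the accumulation idea, assuming nothing about the real blf_data_map layout: each relog only ORs more bits into the dirty bitmap and nothing clears it, so a later format naturally covers all earlier changes.
---
#include <stdint.h>
#include <stdio.h>

#define MAP_WORDS 4

/* Mark a chunk range dirty; repeated loggings only accumulate bits. */
static void log_range(uint32_t map[MAP_WORDS], int first, int nbits)
{
	for (int b = first; b < first + nbits; b++)
		map[b / 32] |= 1u << (b % 32);
}

/* "Formatting" walks the whole map, so it picks up every earlier change. */
static int count_dirty(const uint32_t map[MAP_WORDS])
{
	int n = 0;

	for (int w = 0; w < MAP_WORDS; w++)
		n += __builtin_popcount(map[w]);
	return n;
}

int main(void)
{
	uint32_t map[MAP_WORDS] = { 0 };

	log_range(map, 0, 4);	/* first modification */
	log_range(map, 10, 2);	/* later modification, bitmap not cleared */
	printf("dirty chunks logged: %d\n", count_dirty(map));
	return 0;
}
---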
After we format the log and put the lvs on the cil->xc_cil list,
we will try to kick the background log writing.
xlog_cil_push_background
---
/*
* don't do a background push if we haven't used up all the
* space available yet.
*/
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
return;
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
spin_unlock(&cil->xc_push_lock);
---
xfs_log_reserve/regrant
-> xlog_grant_push_ail
// push on the buffer cache code if we ever use more
// than 75% of the on-disk log space
-> xfs_ail_push
-> wake_up_process(ailp->ail_task)
xfsaild
-> xfsaild_push
---
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
lock_result = xfsaild_push_item(ailp, lip);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
ailp->ail_last_pushed_lsn = lsn;
break;
case XFS_ITEM_FLUSHING:
flushing++;
ailp->ail_last_pushed_lsn = lsn;
break;
case XFS_ITEM_PINNED:
stuck++;
ailp->ail_log_flush++;
break;
case XFS_ITEM_LOCKED:
stuck++;
break;
default:
ASSERT(0);
break;
}
count++;
if (stuck > 100)
break;
lip = xfs_trans_ail_cursor_next(ailp, &cur);
if (lip == NULL)
break;
lsn = lip->li_lsn;
}
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
//accumulate the IOs and write out together
if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
ailp->ail_log_flush++;
---
push log
xlog_cil_push can be triggered in the following ways,
xlog_cil_force_lsn
-> xlog_cil_push_now
---
/* start on any pending background push to minimise wait time on it */
flush_work(&cil->xc_push_work);
/*
* If the CIL is empty or we've already pushed the sequence then
* there's no work we need to do.
*/
spin_lock(&cil->xc_push_lock);
if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
spin_unlock(&cil->xc_push_lock);
return;
}
cil->xc_push_seq = push_seq;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
spin_unlock(&cil->xc_push_lock);
---
// don't do a background push if we haven't used up all the
// space available yet.
// note !
// cil->xc_ctx is the current xfs_cil_ctx that could receive lvs.
// cil->xc_current_sequence is the seq of this ctx.
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
return;
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
spin_unlock(&cil->xc_push_lock)
[1] under down_write of cil->xc_ctx_lock
[2] under spin_lock of cil->xc_push_lock
cil->xc_push_seq < cil->xc_ctx->sequence ?
cil->xc_cil empty ?
As we know, cil->xc_ctx is the current ctx that can receive lvs, and
cil->xc_committing contains the ctxs that are being committed.
The log items are queued on cil->xc_cil by __xfs_trans_commit.
__xfs_trans_commit
-> xfs_log_commit_cil //down_read cil->xc_ctx_lock
-> xlog_cil_insert_items
---//under cil->xc_cil_lock
list_for_each_entry(lip, &tp->t_items, li_trans) {
if (!list_is_last(&lip->li_cil, &cil->xc_cil))
list_move_tail(&lip->li_cil, &cil->xc_cil);
}
---
Therefore, the log items committed by __xfs_trans_commit must be in one ctx.
In other words, the log items in one transaction must be in one ctx.
An exception is xfs_trans_roll, which amounts to a transaction commit.
Transaction rolling is mainly used by xfs_defer_finish.
And xfs_defer_finish is mainly used in the following scenarios,
- xfs_itruncate_extents_flags
- xfs_reflink_end_cow
- xfs_reflink_remap_extent
Get a sequence from the previous one (ctx->sequence + 1).
Install the new ctx on cil->xc_ctx
Log format of a transaction
< start-oph >< oph >< trans-hdr >< oph >< reg1 >< oph >...< commit-oph >
[1] [2]
[1]
xlog_cil_push
-> xlog_write
-> xlog_write_start_rec
clear XLOG_TIC_INITED of ctx->ticket
[2]
xlog_cil_push
-> xfs_log_done //ctx->ticket->t_flags & XLOG_TIC_INITED == 0
-> xlog_commit_record
All of the oph's have the same oh_tid (Transaction ID).
This transaction id comes from,
xlog_cil_push
-> xlog_cil_ticket_alloc
-> xlog_ticket_alloc
---
tic->t_tid = prandom_u32();
---
xfs_trans_roll
-> __xfs_trans_commit
-> xfs_trans_reserve
-> xfs_log_regrant
---
/*
* This is a new transaction on the ticket, so we need to change the
* transaction ID so that the next transaction has a different TID in
* the log. Just add one to the existing tid so that we can see chains
* of rolling transactions in the log easily.
*/
tic->t_tid++;
---
And only with XLOG_COMMIT_TRANS can the log be recovered by the recovery process.
xlog_recovery_process_trans
---
case XLOG_COMMIT_TRANS:
error = xlog_recover_commit_trans(log, trans, pass,
buffer_list);
/* success or fail, we are now done with this transaction. */
freeit = true;
break;
---
miscs
The "one transaction" here is one xfs_cil_ctx in xlog_cil_push.
It takes all of the lvs down from the log items queued on cil->xc_cil,
under down_write of cil->xc_ctx_lock.
xfs_log_commit_cil
-> xlog_cil_insert_items // under down_read of cil->xc_ctx_lock
-> xlog_cil_insert_format_items
If the lip->lv is not NULL, format the log there. Otherwise, use the shadow lv
and this lv will be written to log in another transaction.
ctx
xlog_cil_push writes the lvs to the log in units of a ctx.
Each ctx has a commit record.
When the iclog holding the commit record of a ctx has been written to the log,
xlog_cil_committed will be invoked.
xlog_cil_committed
-> xfs_trans_committed_bulk
-> xfs_log_item_batch_insert
---
for (i = 0; i < nr_items; i++) {
struct xfs_log_item *lip = log_items[i];
lip->li_ops->iop_unpin(lip, 0);
}
---
AG free space management
AG Free Space B+trees
All block numbers, indexes, and counts are AG relative.
agf->agf_roots[] -.
|
v
+-----------+
| Header |
+--+--+--+--+ key:[startblock,blockcount]
|K1|K2|K3|K4| 1:[12,16] 2:[184586,3] 3:[225579,1] 4:[511629,1]
+--+--+--+--+
|P1|P2|P3|P4| 1:2 2:83347 3:6 4:4
+--+--+--+--+
| |
.----' '-------.
v v
2 83347
+-----------+ +-----------+
| Header | | Header |
+--+--+--+--+ +--+--+--+--+
|R0|R1|R2|R3| |R0|R1|R2|R3|
+--+--+--+--+ +--+--+--+--+
xfs_alloc_ag_vextent_size
Extent's length (returned in len) will be between minlen and maxlen.
xfs_alloc_ag_vextent_size
---
//Allocate and initialize a cursor for the by-size btree.
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
args->agno, XFS_BTNUM_CNT);
//Look for an entry >= maxlen+alignment-1 blocks.
//The result will be the nearest one.
if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
args->maxlen + args->alignment - 1, &i)))
goto error0;
if (!i) {
...
} else {
/*
* Search for a non-busy extent that is large enough.
*/
for (;;) {
error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
//xfs_extent_busy_trim
//regarding to busy extent, please refer to Busy extent
if (rlen >= args->maxlen)
break;
// shift to right the rec
error = xfs_btree_increment(cnt_cur, 0, &i);
if (i == 0) {
/*
* Our only valid extents must have been busy.
* Make it unbusy by forcing the log out and
* retrying.
*/
xfs_btree_del_cursor(cnt_cur,
XFS_BTREE_NOERROR);
trace_xfs_alloc_size_busy(args);
xfs_extent_busy_flush(args->mp,
args->pag, busy_gen);
goto restart;
}
}
}
...
// Allocate and initialize a cursor for the by-block tree.
bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
args->agno, XFS_BTNUM_BNO);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
cnt_cur = bno_cur = NULL;
args->len = rlen;
args->agbno = rbno;
---
xfs_alloc_fixup_trees
fb
|---------| freespace record we get from freespace btree
|f0|---|f1| result record we need
The steps to fix up the cntbt and bnobt (see the sketch after this list):
a. delete fb from the cntbt
b. insert f0 and f1 into the cntbt
c. update fb to f0 in the bnobt (cut fb down to f0)
d. insert f1 into the bnobt
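A minimal sketch of the arithmetic behind f0 and f1: given the free-space record [fbno, fbno+flen) found in the btree and the allocated piece [rbno, rbno+rlen) inside it, the two leftover records are computed like this (only the arithmetic, none of the btree updates).
---
#include <stdio.h>
#include <stdint.h>

struct ext { uint32_t bno; uint32_t len; };

/*
 * Split free record fb by the allocated range [rbno, rbno+rlen).
 * f0 is the leftover before the allocation, f1 the leftover after it;
 * either may come out with len == 0.
 */
static void split_free(struct ext fb, uint32_t rbno, uint32_t rlen,
		       struct ext *f0, struct ext *f1)
{
	f0->bno = fb.bno;
	f0->len = rbno - fb.bno;
	f1->bno = rbno + rlen;
	f1->len = (fb.bno + fb.len) - (rbno + rlen);
}

int main(void)
{
	struct ext fb = { 100, 50 }, f0, f1;

	split_free(fb, 120, 10, &f0, &f1);	/* allocate [120, 130) out of [100, 150) */
	printf("f0=[%u,%u) f1=[%u,%u)\n",
	       f0.bno, f0.bno + f0.len, f1.bno, f1.bno + f1.len);
	return 0;
}
---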
AG free list
(bmap refcount rmap inode) data block
the above 4 btrees itself need
block to construct the btree
| xfs_alloc_vextent
v
bnobt cntbt
the above 2 btrees' block
are allocated from agfl
| xfs_alloc_fix_freelist
v
agfl
During operations on the free space btrees (bnobt and cntbt), new free blocks may be
needed; if no free blocks were available we could deadlock, especially when freeing
space.
Two on-disk structures are involved: the AGF, in the 2nd sector of an AG
(XFS_AGF_DADDR), and the AGFL itself, in the 4th sector (XFS_AGFL_DADDR).
The AGFL is managed as a ring through three fields of the AGF:
- agf_flfirst
- agf_fllast
- agf_flcount
Two basic interfaces of the agfl follow: get a block from the list, and put a
freed block back onto it.
---
// Get bp of agfl
error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp);
...
agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
// Get the first free block from the agfl
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
// Push the ring of agfl
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, -1);
xfs_trans_agflist_delta(tp, -1);
pag->pagf_flcount--;
xfs_perag_put(pag);
logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
...
// We only modify the agf, so just log it.
xfs_alloc_log_agf(tp, agbp, logflags);
*bnop = bno;
---
---
// Get bp of agfl
if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, 1);
xfs_trans_agflist_delta(tp, 1);
pag->pagf_flcount++;
logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
...
xfs_perag_put(pag);
xfs_alloc_log_agf(tp, agbp, logflags);
// Fill in the free block into the agfl
agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
*blockp = cpu_to_be32(bno);
startoff = (char *)blockp - (char *)agflbp->b_addr;
// we need to log both agf and agfl.
xfs_alloc_log_agf(tp, agbp, logflags);
xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
xfs_trans_log_buf(tp, agflbp, startoff,
startoff + sizeof(xfs_agblock_t) - 1);
---
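Both snippets treat the AGFL as a ring of block numbers indexed by agf_flfirst/agf_fllast. Here is a user-space sketch of that ring with a made-up size, ignoring logging, locking and the per-ag counters; it only mirrors the get/put index arithmetic shown above.
---
#include <stdio.h>
#include <stdint.h>

#define AGFL_SIZE 16			/* assumed ring size for the example */

struct agfl {
	uint32_t bno[AGFL_SIZE];
	uint32_t flfirst, fllast, flcount;
};

/* Take the oldest free block off the ring (the "get" path above). */
static int agfl_get(struct agfl *a, uint32_t *bno)
{
	if (a->flcount == 0)
		return -1;
	*bno = a->bno[a->flfirst];
	if (++a->flfirst == AGFL_SIZE)
		a->flfirst = 0;
	a->flcount--;
	return 0;
}

/* Append a freed block to the ring (the "put" path above). */
static int agfl_put(struct agfl *a, uint32_t bno)
{
	if (a->flcount == AGFL_SIZE)
		return -1;
	if (++a->fllast == AGFL_SIZE)
		a->fllast = 0;
	a->bno[a->fllast] = bno;
	a->flcount++;
	return 0;
}

int main(void)
{
	/* empty ring: fllast sits just before flfirst */
	struct agfl a = { .flfirst = 0, .fllast = AGFL_SIZE - 1, .flcount = 0 };
	uint32_t bno;

	agfl_put(&a, 42);
	agfl_put(&a, 43);
	agfl_get(&a, &bno);
	printf("got %u, %u left\n", bno, a.flcount);
	return 0;
}
---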
Is xfs_alloc_fix_freelist exclusive ?
The lock of agf buffer could ensure the exclusivity.
It will be locked by
xfs_alloc_read_agf
-> xfs_read_agf
-> xfs_trans_read_buf
-> xfs_trans_read_buf_map
-> xfs_buf_read_map
-> xfs_buf_get_map
-> xfs_buf_find
-> xfs_buf_lock
Before the transaction is committed or rolled, if the bp has been in the
transaction, it will not be locked again.
This is done by,
xfs_trans_read_buf_map
-> xfs_trans_buf_item_match
Where to unlock ?
Every time we do an allocation/free, the agf, at least agf->agf_freeblks,
is modified. So it must be logged, and the lock is released after log
formatting.
Actually, the whole free space allocation and free path runs under the lock of
the agf's bp.
defer agfl block frees
f8f2835a9cf300079835e1adb1d90f85033be04c (xfs: defer agfl block frees when dfops is available)
The AGFL fixup code executes before every block allocation/free and rectifies the
AGFL based on the current, dynamic allocation requirements of the fs. The AGFL must
hold a minimum number of blocks to satisfy a worst case split of the free space
btrees caused by the impending allocation operation.
Since the AGFL caches individual blocks, AGFL reduction typically involves multiple,
single block frees. We've had reports of transaction overrun problems during certain
workloads that boil down to AGFL reduction freeing multiple blocks and consuming more
space in the log than was reserved for the transaction (note, it is tic->t_unit_res).
One way to address this problem is to release surplus blocks from the AGFL immediately
but defer the free of those blocks (similar to how file-mapped blocks are unmapped from
the file in one transaction and freed via a deferred operation) until the transaction is
rolled.
Let's look at how to achieve this,
__xfs_trans_commit
-> xfs_defer_finish_noroll
---
while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
/* log intents and pull in intake items */
xfs_defer_create_intents(*tp); [1]
list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
// Roll the transaction.
// The transaction will re-get a log space reservation.
error = xfs_defer_trans_roll(tp); [2]
...
[3]
dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, dfp->dfp_count);
cleanup_fn = dfp->dfp_type->finish_cleanup;
/* Finish the work items. */
state = NULL;
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
// Do the real work behind the intend
[4]
error = dfp->dfp_type->finish_item(*tp, li, dfp->dfp_done, &state);
...
}
...
}
---
Suppose there are 4 intent log items in this transaction and all of them are btree
updates. If all of the operations were logged in one transaction commit,
tic->t_unit_res might be used up. But when they are handled as deferred work,
things are different.
a. create intent logs for all of them.
b. roll the transaction
c. create the intent-done log for the 1st one
d. do the real work, such as updating the btree and agf
e. roll the transaction
...
We can see that every time the transaction is rolled, it commits just one btree
update, so the log space needed is predictable.
Busy extent
xlog_cil_committed
---
xfs_extent_busy_sort(&ctx->busy_extents);
xfs_extent_busy_clear(mp, &ctx->busy_extents,
(mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
---
Think of the following sequence of operations:
1. free extent E_a from F_a in transaction T_a
2. allocate extent E_a to F_b in transaction T_b again.
3. write to E_a of F_b.
The issue here: if the data written to E_a reaches disk before the logs of T_a
and T_b, and power is lost at that moment, then recovery cannot replay those
logs, so extent E_a still belongs to F_a, but its blocks have already been
overwritten by F_b's data.
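XFS avoids this by keeping extents freed in a not-yet-committed transaction on a busy list (cleared in xlog_cil_committed above) and having the allocator trim or skip ranges that overlap it (xfs_alloc_compute_aligned / xfs_extent_busy_trim in the allocation path). A minimal sketch of the overlap test, with made-up types:
---
#include <stdio.h>
#include <stdint.h>

struct busy_extent { uint32_t bno; uint32_t len; };

/* Return 1 if [bno, bno+len) overlaps any extent still on the busy list. */
static int range_is_busy(const struct busy_extent *busy, int nbusy,
			 uint32_t bno, uint32_t len)
{
	for (int i = 0; i < nbusy; i++) {
		uint32_t bend = busy[i].bno + busy[i].len;
		uint32_t end = bno + len;

		if (bno < bend && busy[i].bno < end)
			return 1;
	}
	return 0;
}

int main(void)
{
	/* Extent [100, 110) was freed but its transaction is not yet on disk. */
	struct busy_extent busy[] = { { 100, 10 } };

	printf("reuse [105,115)? busy=%d\n", range_is_busy(busy, 1, 105, 10));
	printf("reuse [200,210)? busy=%d\n", range_is_busy(busy, 1, 200, 10));
	return 0;
}
---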
How does ext4 handle this ?
ext4_free_blocks
---
/*
* We need to make sure we don't reuse the freed block until after the
* transaction is committed. We make an exception if the inode is to be
* written in writeback mode since writeback mode has weak data
* consistency guarantees.
*/
if (ext4_handle_valid(handle) &&
((flags & EXT4_FREE_BLOCKS_METADATA) ||
!ext4_should_writeback_data(inode))) {
// if ordered or journal mode
...
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
---
spin_lock(&sbi->s_md_lock);
list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
---
}
---
ext4_journal_commit_callback
-> ext4_process_freed_data
-> ext4_mb_load_buddy
-> ext4_mb_load_buddy_gfp
-> find_or_create_page
-> ext4_mb_init_cache // if not Uptodate
// struct inode *inode = sbi->s_buddy_cache;
jbd2_journal_commit_transaction
---
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
---
Details
_
xfs_allocbt_init_cursor <- xfs_alloc_ag_vextent_exact |
xfs_alloc_ag_vextent_near > xfs_alloc_ag_vextent <- xfs_alloc_fix_freelist [1]
xfs_alloc_ag_vextent_size _| xfs_alloc_vextent <- xfs_bmap_extents_to_btree
xfs_free_ag_extent xfs_bmap_local_to_extents_empty
xfs_trim_extents xfs_bmap_btalloc
xfs_getfsmap_datadev_bnobt_query xfs_bmbt_alloc_block
xfs_ialloc_ag_alloc
__xfs_inobt_alloc_block <- xfs_inobt_alloc_block
xfs_refcountbt_alloc_block xfs_finobt_alloc_block
[1] xfs_alloc_fix_freelist is used to reserve free blocks for splitting the cntbt or bnobt,
because we may need extra free blocks to split the bnobt or cntbt even when freeing blocks.
If we don't reserve these blocks, we could fall into a deadlock.
Another path to allocate block is to get it from freelist directly
xfs_alloc_get_freelist <- xfs_alloc_ag_vextent_small
xfs_alloc_fix_freelist
xfs_allocbt_alloc_block
xfs_rmapbt_alloc_block
The blocks in freelist are also allocated by xfs_alloc_ag_vextent
Look at xfs_alloc_vextent,
THIS_AG
NEAR_BNO
THIS_BNO
we are forced to allocate block from a single ag
START_BNO
FIRST_AG
we could rotate through the allocation groups to look for a winner
Let's look at the allocation policy in the following cases,
[1] xfs_bmap_extents_to_btree
---
if (tp->t_firstblock == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
} else if (tp->t_flags & XFS_TRANS_LOWMODE) {
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = tp->t_firstblock;
} else {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.fsbno = tp->t_firstblock;
}
inherit from the inode
---
[2] xfs_bmap_local_to_extents_empty
---
if (tp->t_firstblock == NULLFSBLOCK) {
args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
args.fsbno = tp->t_firstblock;
args.type = XFS_ALLOCTYPE_NEAR_BNO;
}
...
/* Can't fail, the space was reserved. */
ASSERT(args.fsbno != NULLFSBLOCK);
inherit from the inode
---
[3] xfs_bmap_btalloc
---
it can fall back to START_BNO and FIRST_AG
---
[4] xfs_bmbt_alloc_block
---
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
args.type = XFS_ALLOCTYPE_START_BNO;
args.minleft = args.tp->t_blk_res;
} else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) {
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
}
The root node's block is
#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
It is not related to the inode, but the ag
---
[5] xfs_ialloc_ag_alloc
----
Allocate new inodes in the allocation group specified by agbp.
----
[6] __xfs_inobt_alloc_block
---
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.oinfo = XFS_RMAP_OINFO_INOBT;
args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.resv = resv;
error = xfs_alloc_vextent(&args);
---
[7] xfs_refcountbt_alloc_block
---
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
xfs_refc_block(args.mp));
args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
args.resv = XFS_AG_RESV_METADATA;
error = xfs_alloc_vextent(&args);
---
How does xfs_alloc_vextent know the ag from which the caller wants to allocate blocks ?
xfs_alloc_vextent
---
case XFS_ALLOCTYPE_THIS_AG:
case XFS_ALLOCTYPE_NEAR_BNO:
case XFS_ALLOCTYPE_THIS_BNO:
/*
* These three force us into a single a.g.
*/
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
args->pag = xfs_perag_get(mp, args->agno);
error = xfs_alloc_fix_freelist(args, 0);
...
case XFS_ALLOCTYPE_START_BNO:
/*
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
...
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
/* FALLTHROUGH */
case XFS_ALLOCTYPE_FIRST_AG:
/*
* Rotate through the allocation groups looking for a winner.
*/
if (type == XFS_ALLOCTYPE_FIRST_AG) {
/*
* Start with allocation group given by bno.
*/
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_THIS_AG;
sagno = 0;
flags = 0;
} else {
/*
* Start with the given allocation group.
*/
args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
flags = XFS_ALLOC_FLAG_TRYLOCK;
}
/*
* Loop over allocation groups twice; first time with
* trylock set, second time without.
*/
for (;;) {
args->pag = xfs_perag_get(mp, args->agno);
error = xfs_alloc_fix_freelist(args, flags);
if (error) {
trace_xfs_alloc_vextent_nofix(args);
goto error0;
}
/*
---
So the key here is args->fsbno.
But where does the initial value come from ?
How to select the ag for an inode ?
xfs_dialloc
-> xfs_ialloc_ag_select
---
// For a directory, spread them over all of the ags
if (S_ISDIR(mode))
pagno = xfs_ialloc_next_ag(mp);
else {
// Otherwise, inherit the ag of its parent
pagno = XFS_INO_TO_AGNO(mp, parent);
if (pagno >= agcount)
pagno = 0;
}
---
How to select ag for a file ?
xfs_bmap_btalloc
---
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
ap->tp->t_firstblock);
if (nullfb) {
if (xfs_alloc_is_userdata(ap->datatype) &&
xfs_inode_is_filestream(ap->ip)) {
ag = xfs_filestream_lookup_ag(ap->ip);
ag = (ag != NULLAGNUMBER) ? ag : 0;
ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
} else {
//initial ag is near its inode
ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
}
} else
ap->blkno = ap->tp->t_firstblock;
//then it will follow the existing extent of the file
xfs_bmap_adjacent(ap);
...
args.fsbno = ap->blkno;
...
---
How to know whether a block allocation is for userdata or metadata ?
xfs_bmap_btalloc
<- xfs_bmap_alloc
<- xfs_bmapi_allocate
<- xfs_bmapi_convert_delalloc
xfs_bmapi_allocate <- xfs_attr_rmtval_set
xfs_da_grow_inode_int [M] (directory)
xfs_alloc_file_space [?]
xfs_dquot_disk_alloc [M]
xfs_iomap_write_direct
xfs_iomap_write_unwritten
xfs_reflink_allocate_cow
xfs_growfs_rt_alloc [M]
xfs_symlink [M]
xfs_bmapi_allocate
---
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->datatype = XFS_ALLOC_NOBUSY;
if (whichfork == XFS_DATA_FORK) {
if (bma->offset == 0)
bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
else
bma->datatype |= XFS_ALLOC_USERDATA;
}
if (bma->flags & XFS_BMAPI_ZERO)
bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
}
---
Inode space allocation
xfs_ialloc will return a pointer to an incore inode if the Space Manager has an
available inode on the free list. Otherwise, it will do an allocation and replenish
the freelist.
Since we can only do one allocation per transaction without deadlocks,
we will need to commit the current transaction and start a new one. We will then
need to call xfs_ialloc again to get the inode.
xfs_dialloc
---
if (*IO_agbp) {
/*
* If the caller passes in a pointer to the AGI buffer,
* continue where we left off before. In this case, we
* know that the allocation group has free inodes.
*/
agbp = *IO_agbp;
goto out_alloc;
}
/*
* We do not have an agbp, so select an initial allocation
* group for inode allocation.
*/
start_agno = xfs_ialloc_ag_select(tp, parent, mode);
if (start_agno == NULLAGNUMBER) {
*inop = NULLFSINO;
return 0;
}
...
/*
* Loop until we find an allocation group that either has free inodes
* or in which we can allocate some inodes. Iterate through the
* allocation groups upward, wrapping at the end.
*/
agno = start_agno;
for (;;) {
pag = xfs_perag_get(mp, agno);
if (!pag->pagi_inodeok) {
xfs_ialloc_next_ag(mp);
goto nextag;
}
if (!pag->pagi_init) {
error = xfs_ialloc_pagi_init(mp, tp, agno);
if (error)
goto out_error;
}
/*
* Do a first racy fast path check if this AG is usable.
*/
if (!pag->pagi_freecount && !okalloc)
goto nextag;
/*
* Then read in the AGI buffer and recheck with the AGI buffer
* lock held.
*/
error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
if (error)
goto out_error;
if (pag->pagi_freecount) {
xfs_perag_put(pag);
goto out_alloc;
}
if (!okalloc)
goto nextag_relse_buffer;
error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
...
if (ialloced) {
ASSERT(pag->pagi_freecount > 0);
xfs_perag_put(pag);
*IO_agbp = agbp;
*inop = NULLFSINO;
return 0;
}
nextag_relse_buffer:
xfs_trans_brelse(tp, agbp);
nextag:
xfs_perag_put(pag);
if (++agno == mp->m_sb.sb_agcount)
agno = 0;
if (agno == start_agno) {
*inop = NULLFSINO;
return noroom ? -ENOSPC : 0;
}
}
out_alloc:
*IO_agbp = NULL;
return xfs_dialloc_ag(tp, agbp, parent, inop);
---
AG free inode management
AG relative inode numbers always fit within 32 bits.
Relative inode number
   sb_agblklog     sb_inopblog
|---------------|------------|
Absolute inode number
   AG number       sb_agblklog     sb_inopblog
|----------------|---------------|-------------|
sb_agblklog log2 value of sb_agblocks
sb_agblocks is size of each AG in blocks
sb_inopblog log 2 value of sb_inopblock.
sb_inopblock is number of inodes per block.
This is equivalent to sb_blocksize /sb_inodesize.
sb_blocksize The size of a basic unit of space allocation in bytes.
Typically, this is 4096 (4KB) but can range from 512 to
65536 bytes.
sb_inodesize Size of the inode in bytes. The default is 256 (2 inodes
per standard sector) but can be made as large as 2048 bytes
when creating the filesystem. On a v5 filesystem, the default
and minimum inode size are both 512 bytes
Actually, the inode number encodes the block that contains the on-disk inode structure:
inode number = (AG relative block number << sb_inopblog) | index within the block
So the maximum inode number in an AG is determined by the block size and the inode size.
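Under this layout the inode number can be cracked with shifts and masks. A user-space sketch follows; the field widths are example values, not read from a real superblock, and the helpers only mirror what the XFS_INO_TO_* macros do conceptually.
---
#include <stdio.h>
#include <stdint.h>

/* Example geometry: 4 KiB blocks, 512-byte inodes -> 8 inodes per block. */
#define INOPBLOG   3		/* log2(inodes per block)  (sb_inopblog) */
#define AGBLKLOG   16		/* log2(blocks per AG)     (sb_agblklog) */

/* inode number = (agno << (AGBLKLOG + INOPBLOG)) | (agbno << INOPBLOG) | index */
static uint64_t make_ino(uint32_t agno, uint32_t agbno, uint32_t index)
{
	return ((uint64_t)agno << (AGBLKLOG + INOPBLOG)) |
	       ((uint64_t)agbno << INOPBLOG) | index;
}

static void crack_ino(uint64_t ino, uint32_t *agno, uint32_t *agbno, uint32_t *index)
{
	*index = ino & ((1u << INOPBLOG) - 1);
	*agbno = (ino >> INOPBLOG) & ((1u << AGBLKLOG) - 1);
	*agno  = ino >> (AGBLKLOG + INOPBLOG);
}

int main(void)
{
	uint32_t agno, agbno, index;

	crack_ino(make_ino(3, 1234, 5), &agno, &agbno, &index);
	printf("agno=%u agbno=%u index=%u\n", agno, agbno, index);
	return 0;
}
---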
Rather than statically pre-allocating the inodes for all of these files (as e.g. EXT4
does), XFS dynamically allocates inodes as needed.
Inodes are traditionally allocated in chunks of 64. The inode allocation B+tree in each
AG keeps track of these chunks and of whether each inode within a chunk is in use.
The inode location is encoded in the inode number itself, namely, we can compute the
position of an inode from its inode number.
Inode allocation
xfs_dialloc
---
xfs_ialloc_ag_select //select an AG
for (;;) {
xfs_ialloc_read_agi(mp, tp, agno, &agbp);
if (pag->pagi_freecount) {
xfs_perag_put(pag);
goto out_alloc;
}
xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
if (ialloced) {
/*
* We successfully allocated some inodes, return
* the current context to the caller so that it
* can commit the current transaction and call
* us again where we left off.
*/
ASSERT(pag->pagi_freecount > 0);
xfs_perag_put(pag);
*IO_agbp = agbp;
*inop = NULLFSINO;
return 0;
}
}
out_alloc:
*IO_agbp = NULL;
return xfs_dialloc_ag(tp, agbp, parent, inop);
---
First, we need to allocate a chunk of inodes (64) and insert them into the inode
allocation btree. Then xfs_dialloc_ag will be invoked to allocate an inode from
the btree.
As allocations and frees go on repeatedly, holes appear in the chunks.
Policy to select AG
xfs_dialloc
-> xfs_ialloc_ag_select
---
if (S_ISDIR(mode))
pagno = xfs_ialloc_next_ag(mp);
else {
pagno = XFS_INO_TO_AGNO(mp, parent);
if (pagno >= agcount)
pagno = 0;
}
---
The directory policy is good for parallelism in the management of free
space and inode allocation.
The 2nd policy works well for keeping directories of small files clustered
together on disk.
For large files, we try to
allocate extents initially near the inode and afterwards
near the existing block in the file which is closest to
the offset in the file for which we are allocating space
That implies blocks will be allocated near the last
block in the file for sequential writers and near blocks
in the middle of the file for processes writing into
holes
Files and directories are not limited to allocating
space within a single allocation group, however
B Tree
xfs_btree_ops
xfs_btree_lookup
---
cur->bc_ops->init_ptr_from_cur(cur, &ptr);
pp = &ptr;
---
xfs_allocbt_init_ptr_from_cur
---
ptr->s = agf->agf_roots[cur->bc_btnum];
//agf here is the on-disk one.
---
We need to iterate the btree and compare keys.
If level > 0, the block has dedicated key fields.
But at level 0 there are only records, so we need this callback to derive the key value
from a record.
xfs_btree_lookup
-> xfs_lookup_get_search_key
---
if (level == 0) {
cur->bc_ops->init_key_from_rec(kp,
xfs_btree_rec_addr(cur, keyno, block));
return kp;
}
return xfs_btree_key_addr(cur, keyno, block);
---
bnobt and cntbt, or more strictly, only cntbt use this callback to update
the agf_longest, which indicates the longest free block extent in this ag.
(bnobt and cntbt have the same callabck, but only cntbt has
XFS_BTREE_LASTREC_UPDATE based on xfs_allocbt_init_cursor, then xfs_btree_is_lastrec
will return false for bnobt)
It is easily understood that the last record of cntbt is the longest free block
extent. xfs_allocbt_update_lastrec will update the agf->agf_longest.
xfs_btree_lookup
/* initialise start pointer from cursor */
cur->bc_ops->init_ptr_from_cur(cur, &ptr);
pp = &ptr;
for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
error = xfs_btree_lookup_get_block(cur, level, pp, &block);
if (diff == 0) {
/*
* If we already had a key match at a higher level, we
* know we need to use the first entry in this block.
*/
keyno = 1;
} else {
int high; /* high entry number */
int low; /* low entry number */
/* Set low and high entry numbers, 1-based. */
low = 1;
high = xfs_btree_get_numrecs(block);
...
while (low <= high) {
union xfs_btree_key key;
union xfs_btree_key *kp;
XFS_BTREE_STATS_INC(cur, compare);
/* keyno is average of low and high. */
keyno = (low + high) >> 1;
/* Get current search key */
kp = xfs_lookup_get_search_key(cur, level,
keyno, block, &key);
/*
* Compute difference to get next direction:
* - less than, move right
* - greater than, move left
* - equal, we're done
*/
diff = cur->bc_ops->key_diff(cur, kp);
if (diff < 0)
low = keyno + 1;
else if (diff > 0)
high = keyno - 1;
else
break;
}
}
Except for the diff==0, the search loop exit is due to low == high.
Before that, keyno = (high + low)/2
if diff < 0, low = keyno + 1
if diff > 0, high = keyno -1
We could get: low + 2 = high
Take the bnobt as example, the final result of loop above is:
KEY low keyno high
PTR ptr_l ptr_k ptr_h
BNO bno_l bno_k bno_h (start block number of the range)
^ ^
[1] [2]
diff = bno_k - bno_want
if diff > 0, bno_want is at [1], it is included in ptr_l, res = keyno - 1
if diff < 0, bno_want is at [2], it is included in ptr_k, res = keyno
if (level > 0) {
if (diff > 0 && --keyno < 1)
keyno = 1;
...
cur->bc_ptrs[level] = keyno;
}
}
For an intermediate node, it must include the key we want.
But for a leaf node, things are different.
low keyno high
^ ^
' '
diff > 0 diff < 0
LE diff > 0, res = keyno - 1
diff < 0, res = keyno
EQ diff > 0, res = keyno
diff < 0, res = keyno + 1
(EQ and GE return same records, but the stat is different)
GE diff > 0, res = keyno
diff < 0, res = keyno + 1
In case keyno + 1 exceeds the current node,
[K1][K2][K3][K4]
[P1][P2][P3][P4]
| |
.---' '----------.
v v
[R1][R2][R3][R4] [R1][R2][R3][R4]
^
keyno
we need to invoke xfs_btree_increment to shift the path to the right sibling
if (dir != XFS_LOOKUP_LE && diff < 0) {
keyno++;
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
if (dir == XFS_LOOKUP_GE &&
keyno > xfs_btree_get_numrecs(block) &&
!xfs_btree_ptr_is_null(cur, &ptr)) {
int i;
cur->bc_ptrs[0] = keyno;
error = xfs_btree_increment(cur, 0, &i);
...
*stat = 1;
return 0;
}
} else if (dir == XFS_LOOKUP_LE && diff > 0)
keyno--;
cur->bc_ptrs[0] = keyno;
/* Return if we succeeded or not. */
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
*stat = 0;
else if (dir != XFS_LOOKUP_EQ || diff == 0)
*stat = 1;
else
*stat = 0;
return 0;
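The LE/EQ/GE adjustment can be reproduced on a plain sorted array. The sketch below uses the same 1-based keyno and diff = key - wanted convention as the leaf-level search above, but it is a single-level illustration only: no multi-level walk and no sibling step (xfs_btree_increment), so an out-of-range keyno simply returns stat = 0.
---
#include <stdio.h>
#include <stdint.h>

enum lookup_dir { LOOKUP_LE, LOOKUP_EQ, LOOKUP_GE };

/*
 * 1-based binary search over sorted keys.  Returns the resulting position
 * in *pos and the "stat" (1 = a record satisfying the request exists).
 */
static int lookup(const uint32_t *keys, int nrecs, uint32_t want,
		  enum lookup_dir dir, int *pos)
{
	int low = 1, high = nrecs, keyno = 0;
	int64_t diff = 1;

	while (low <= high) {
		keyno = (low + high) >> 1;
		diff = (int64_t)keys[keyno - 1] - want;
		if (diff < 0)
			low = keyno + 1;
		else if (diff > 0)
			high = keyno - 1;
		else
			break;
	}

	if (dir != LOOKUP_LE && diff < 0)
		keyno++;		/* step right for EQ/GE */
	else if (dir == LOOKUP_LE && diff > 0)
		keyno--;		/* step left for LE */

	*pos = keyno;
	if (keyno == 0 || keyno > nrecs)
		return 0;
	return (dir != LOOKUP_EQ || diff == 0) ? 1 : 0;
}

int main(void)
{
	uint32_t keys[] = { 12, 30, 57, 90 };
	int pos, stat;

	stat = lookup(keys, 4, 40, LOOKUP_LE, &pos);
	printf("LE 40 -> stat=%d pos=%d (key %u)\n", stat, pos, keys[pos - 1]);
	stat = lookup(keys, 4, 40, LOOKUP_GE, &pos);
	printf("GE 40 -> stat=%d pos=%d (key %u)\n", stat, pos, keys[pos - 1]);
	return 0;
}
---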
reflink
Each record in the reference count B+tree has the following structure:
struct xfs_refcount_rec {
__be32 rc_startblock;
__be32 rc_blockcount;
__be32 rc_refcount;
};
This data structure tracks reference counts for all shared physical blocks.
- If a block is free, it will be tracked in the free space B+trees.
- If a block is owned by a single file, it appears in neither the free space nor the reference count B+trees.
- If a block is shared, it will appear in the reference count B+tree with a reference count >= 2.
The COW in-core extent fork is for storing delalloc reservations and allocated
extents that are in process of being written out.
(It is not on-disk, but in-core)
work process
xfs_file_iomap_begin_delay
-> xfs_iext_lookup_extent // XFS_DATA_FORK
---
// We only support COW on real extent.
// Before this we have checked the got.br_startoff <= offset_fsb
// The 'got' is from the in-core data extent list fork.
if (xfs_is_reflink_inode(ip) && ((flags & IOMAP_WRITE) ||
got.br_state != XFS_EXT_UNWRITTEN)) {
// We may get an extent bigger than what we want.
// We must trim it because we may do COW on that range.
xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
error = xfs_reflink_reserve_cow(ip, &got);
-> xfs_iext_lookup_extent //XFS_COW_FORK
//Has been on the COW in-core extent list ?
if (!eof && got.br_startoff <= imap->br_startoff) {
xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
return 0;
}
//Trim the mapping to the nearest shared extent boundary.
-> xfs_reflink_trim_around_shared
-> xfs_reflink_find_shared
-> xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
imap->br_blockcount, 0, &got, &icur, eof);
//The process looks similar with normal delayed-allocation.
// - Insert the written shared extent into COW in-core extent fork.
---
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
---
// - And also reserve space for it with both xfs_mod_fdblocks and quota.
}
---
Writeback to disk
xfs_map_blocks
---
if (xfs_inode_has_cow_data(ip) &&
xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
cow_fsb = imap.br_startoff;
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
...
whichfork = XFS_COW_FORK;
wpc->io_type = XFS_IO_COW;
goto allocate_blocks;
...
allocate_blocks:
error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
&wpc->cow_seq);
xfs_iomap_write_allocate
-> xfs_bmapi_write
-> xfs_bmapi_allocate
---
error = xfs_bmap_alloc(bma); //Allocate blocks
...
// For the COW in-core extents list, there is no XFS_IFBROOT on it.
// So finally, bma->cur will be NULL for XFS_BMAPI_COWFORK and
// xfs_bmap_add_extent_hole_real will not update on-disk bmbtree
// due to a NULL bma->cur.
if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur)
bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
..
bma->got.br_startoff = bma->offset;
bma->got.br_startblock = bma->blkno;
bma->got.br_blockcount = bma->length;
bma->got.br_state = XFS_EXT_NORM;
...
if (bma->wasdel)
error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
whichfork, &bma->icur, &bma->cur, &bma->got,
&bma->logflags, bma->flags);
---
Write complete
xfs_reflink_end_cow
---
if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
goto out_cancel;
/* Walk backwards until we're out of the I/O range... */
while (got.br_startoff + got.br_blockcount > offset_fsb) {
del = got;
xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
/* Extent delete may have bumped ext forward */
if (!del.br_blockcount)
goto prev_extent;
if (!xfs_bmap_is_real_extent(&got))
goto prev_extent;
// Step 1 Unmap the old blocks in the data fork.
rlen = del.br_blockcount;
error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
...
/* Trim the extent to whatever got unmapped. */
if (rlen) {
xfs_trim_extent(&del, del.br_startoff + rlen,
del.br_blockcount - rlen);
}
...
// Step 2. Map the new blocks into the data fork.
error = xfs_bmap_map_extent(tp, ip, &del);
...
// Step 3. Remove the mapping from the CoW fork.
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
error = xfs_defer_finish(&tp);
...
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
continue;
prev_extent:
if (!xfs_iext_prev_extent(ifp, &icur, &got))
break;
}
---
orphan record
For example,
writeback
allocate blocks (update agf, agfl, bnobt, cntbt ...)
update in-core COW extent fork
write data out <-. /\ /\ .-- POWEROFF!!!
\/ \/ \/
complete,
unmap old blocks from data fork
map new blocks into data fork
remove the extent from in-core COW fork.
Then the allocated extents will be leaked.
How to handle this ?
xfs_bmapi_write
---
error = xfs_bmapi_allocate(&bma);
...
/*
* If this is a CoW allocation, record the data in
* the refcount btree for orphan recovery.
*/
if (whichfork == XFS_COW_FORK) {
error = xfs_refcount_alloc_cow_extent(tp,
bma.blkno, bma.length);
}
---
Extents that are being used to stage a copy on write are stored
in the refcount btree with a refcount of 1 and the upper bit set
on the startblock. This speeds up mount time deletion of stale
staging extents because they're all at the right side of the tree.
xfs_refcount_finish_one
-> __xfs_refcount_cow_alloc
-> xfs_refcount_adjust_cow
---
agbno += XFS_REFC_COW_START;
---
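Assuming the "upper bit set on the startblock" means the most significant bit of the 32-bit AG block number, the marking and the test can be sketched like this; the constant name echoes XFS_REFC_COW_START, but the value here is an assumption of this note.
---
#include <stdio.h>
#include <stdint.h>

/* Assumed to be the high bit of the 32-bit AG block number (see text above). */
#define REFC_COW_START	((uint32_t)1u << 31)

static uint32_t cow_stage(uint32_t agbno)   { return agbno | REFC_COW_START; }
static int      is_cow_staging(uint32_t rc) { return (rc & REFC_COW_START) != 0; }
static uint32_t real_agbno(uint32_t rc)     { return rc & ~REFC_COW_START; }

int main(void)
{
	uint32_t rec = cow_stage(12345);

	/*
	 * All staging records sort to the right side of the refcount btree,
	 * which is what makes the mount-time cleanup scan cheap.
	 */
	printf("stored=0x%08x staging=%d agbno=%u\n",
	       rec, is_cow_staging(rec), real_agbno(rec));
	return 0;
}
---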
Find and remove leftover CoW reservations.
xfs_mountfs
-> xfs_reflink_recover_cow
-> xfs_refcount_recover_cow_leftovers
Then we could get,
writeback
allocate blocks (update agf, agfl, bnobt, cntbt ...)
update in-core COW extent fork
record COW allocation in refcount tree (deferred operation)
write data out <-. /\ /\ .-- POWEROFF!!!
\/ \/ \/
complete,
unmap old blocks from data fork
map new blocks into data fork
remove the extent from in-core COW fork.
Even though it is a deferred operation, the CUI log is in the same
transaction as the block allocation logs. If a poweroff comes before the CUD and the
real refcount operations, the CUI can be replayed.
speculative preallocation on COW
There are two forks in play here
- the data fork that's written to disk
- the CoW fork that only exists in memory
We don't coordinate extent overlap between the two forks; indeed, we use
this lack of overlap-coordination as part of speculative preallocation
to defragment shared files.
Let's first look at where the speculative preallocation happens.
When we really want to allocate,
xfs_bmapi_reserve_delalloc
---
if (whichfork == XFS_COW_FORK) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
prev.br_startoff = NULLFILEOFF;
error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
1, 0, &aoff, &alen);
}
---
xfs_bmap_btalloc
---
if (ap->flags & XFS_BMAPI_COWFORK)
align = xfs_get_cowextsz_hint(ap->ip);
else if (xfs_alloc_is_userdata(ap->datatype))
align = xfs_get_extsz_hint(ap->ip);
if (align) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 0, ap->eof, 0, ap->conv,
&ap->offset, &ap->length);
}
---
cowextsz = 10
012345678901234567890123456789
D: SSSSSSSSSSSSRRRRRRRRRRRRRRRRRR
C: -----AAAAAAA------------------
^ ^ ^
we actually allocate.
cowextsz = 10
012345678901234567890123456789
D: SSSSSSSSSSSSRRRRRRRRRRRRRRRRRR
C: AAAAAAAAAAAAAAAAAAAA----------
^ ^ ^
When the IO on blocks 5-13 completes, xfs_reflink_end_cow remaps the blocks as:
012345678901234567890123456789
D: SSSSSAAAAAAAAARRRRRRRRRRRRRRRR
C: UUUUU---------UUUUUU----------
^ ^ ^
If we issue a write IO on block 16, the COW fork will be used.
xfs_file_iomap_begin_delay
---
eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
if (eof)
got.br_startoff = end_fsb; /* fake hole until the end */
if (got.br_startoff <= offset_fsb) {
if (xfs_is_reflink_inode(ip) &&
((flags & IOMAP_WRITE) ||
got.br_state != XFS_EXT_UNWRITTEN)) {
xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
error = xfs_reflink_reserve_cow(ip, &got);
// Try to find the got->br_startoff in COW fork, if there,
// return and use it directly
}
goto done;
}
---
It'll convert block 16 in the COW fork before issuing the write IO:
012345678901234567890123456789
D: SSSSSAAAAAAAAARRRRRRRRRRARRRRR
C: UUUUU---------UUAUUU----------
^ ^ ^
and then remap it when it's done:
012345678901234567890123456789
D: SSSSSAAAAAAAAARRARRRRRRRARRRRR
C: UUUUU---------UU-UUU----------
^ ^ ^
Therefore, in the normal case, the whole range will be remapped into
contiguous blocks. This is the defragmentation.
COW Unwritten
xfs_bmapi_allocate
---
error = xfs_bmap_alloc(bma);
...
bma->got.br_startoff = bma->offset;
bma->got.br_startblock = bma->blkno;
bma->got.br_blockcount = bma->length;
bma->got.br_state = XFS_EXT_NORM;
if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
(bma->flags & XFS_BMAPI_PREALLOC))
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
whichfork, &bma->icur, &bma->cur, &bma->got,
&bma->logflags, bma->flags);
---
Why do we need this?
Consider a write IO on a range like the following:
cowextsz 10
012345678901234567890123456789
D: RRRRRRRRRRRRRRRSSSSSSSSSSSSSSS
C: ------------------------------
^ ^ ^
A direct write on blocks 10 ~ 19 covers both non-shared blocks (10 ~ 14) and shared
blocks (15 ~ 19), so the original big IO is split into two IOs at the boundary: a
regular IO and a COW IO. The COW part causes dio->flags to gain IOMAP_DIO_COW.
xfs_file_iomap_begin
-> xfs_reflink_allocate_cow
---
error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
if (error || !*shared)
return error;
---
We will allocate new blocks for it.
But due to the cowextsz hint, the blocks that finally get allocated are:
cowextsz 10
012345678901234567890123456789
D: RRRRRRRRRRRRRRRSSSSSSSSSSSSSSS
C: ----------AAAAAAAAAA----------
^ ^ ^
We see that the newly allocated blocks in the COW fork overlap with the non-shared
part of the IO, i.e. blocks 10 ~ 14.
iomap_dio_bio_actor
---
if (iomap->flags & IOMAP_F_SHARED)
dio->flags |= IOMAP_DIO_COW;
---
When IO completes,
xfs_dio_write_end_io
-> xfs_reflink_end_cow //IOMAP_DIO_COW
At that point, we try to remap all of the blocks covered by this IO from the
COW fork into the data fork.
However, the COW blocks for 10 ~ 14 are newly allocated and have never been written.
If power is lost after that remap hits the log, the file exposes stale, uninitialized blocks.
To fix this, the preallocated COW blocks are put into the unwritten state first.
cowextsz 10
012345678901234567890123456789
D: RRRRRRRRRRRRRRRSSSSSSSSSSSSSSS
C: ----------UUUUUUUUUU----------
^ ^ ^
And then only the blocks actually being written through the COW path are converted to the written (real) state:
xfs_file_iomap_begin
-> xfs_reflink_allocate_cow
-> xfs_reflink_convert_cow_extent
cowextsz 10
012345678901234567890123456789
D: RRRRRRRRRRRRRRRSSSSSSSSSSSSSSS
C: ----------UUUUUAAAAA----------
^ ^ ^
Then only the blocks that are not unwritten are remapped from the COW fork into the data fork.
xfs_reflink_end_cow
---
/*
* Only remap real extent that contain data. With AIO
* speculatively preallocations can leak into the range we
* are called upon, and we need to skip them.
*/
if (!xfs_bmap_is_real_extent(&got))
goto prev_extent;
---
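The written/unwritten/shared states described above can be observed from userspace with FIEMAP on the data fork (the COW fork itself is in-memory only and not visible this way). A minimal sketch; the cap of 32 extents is arbitrary:
---
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FS_IOC_FIEMAP */
#include <linux/fiemap.h>	/* struct fiemap, FIEMAP_EXTENT_* */

#define NEXT 32			/* arbitrary cap on returned extents */

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	fm = calloc(1, sizeof(*fm) + NEXT * sizeof(struct fiemap_extent));
	fm->fm_length = ~0ULL;			/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush delalloc first */
	fm->fm_extent_count = NEXT;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu phys %llu len %llu%s%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ? " UNWRITTEN" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_SHARED) ? " SHARED" : "");
	}
	free(fm);
	close(fd);
	return 0;
}
---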
Talking
concurrent read write
https://www.spinics.net/lists/linux-xfs/msg26459.html
xfs_file_buffered_aio_read() is the only call site I know of
to call generic_file_read_iter() with i_rwsem read side held.
xfs_file_buffered_aio_read
---
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
return -EAGAIN;
} else {
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
ret = generic_file_read_iter(iocb, to);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
---
xfs_file_buffered_aio_write
---
write_retry:
iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
...
current->backing_dev_info = NULL;
out:
if (iolock)
xfs_iunlock(ip, iolock);
if (ret > 0) {
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
/* Handle various SYNC-type writes */
ret = generic_write_sync(iocb, ret);
}
return ret;
---
This lock is killing performance of multi-threaded buffered
read/write mixed workload on the same file [1].
My question is, is the purpose of this lock syncing
dio/buffered io?
Answer:
That's one part of it. The other is POSIX atomic write semantics.
https://pubs.opengroup.org/onlinepubs/009695399/functions/read.html
"I/O is intended to be atomic to ordinary files and pipes and FIFOs.
Atomic means that all the bytes from a single operation that started
out together end up together, without interleaving from other I/O
operations."
i.e. that independent read()s should see a write() as a single
atomic change. hence if you do a read() concurrently with a write(),
the read should either run to completion before the write, or the
write run to completion before the read().
XFS is the only linux filesystem that provides this behaviour.
(Databases like this behaviour very much....)
ext4 and other filesystems that have no buffered read side locking
provide serialisation at page level, which means an 8kB read racing
with a 8kB write can return 4k from the new write and 4k from the
yet-to-be-written part of the file. IOWs, the buffered read gets
torn.
Using AIO+DIO on XFS will allow
both reads and writes to be issued concurrently and not block the
main thread of execution.
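To see what "torn" means in practice, here is a small test sketch (not from the thread above): one thread keeps overwriting the same 8 KiB region with all-'A' or all-'B' buffers via buffered pwrite(), another keeps pread()ing it and checks whether a single read ever returns a mix. With XFS's i_rwsem serialisation a mix should never show up; with page-level locking it can. File name and loop counts are arbitrary; compile with -pthread.
---
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define SZ	8192
#define LOOPS	100000

static const char *path = "atomic-test.dat";

static void *writer(void *arg)
{
	char buf[SZ];
	int fd = open(path, O_WRONLY);
	int i;

	for (i = 0; i < LOOPS; i++) {
		memset(buf, (i & 1) ? 'B' : 'A', SZ);
		pwrite(fd, buf, SZ, 0);		/* one buffered write */
	}
	close(fd);
	return NULL;
}

static void *reader(void *arg)
{
	char buf[SZ];
	int fd = open(path, O_RDONLY);
	int i;

	for (i = 0; i < LOOPS; i++) {
		if (pread(fd, buf, SZ, 0) != SZ)
			continue;
		/* A torn read sees both 'A' and 'B' in one read result. */
		if (memchr(buf, 'A', SZ) && memchr(buf, 'B', SZ)) {
			printf("torn read detected at iteration %d\n", i);
			break;
		}
	}
	close(fd);
	return NULL;
}

int main(void)
{
	pthread_t w, r;
	char buf[SZ];
	int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);

	memset(buf, 'A', SZ);
	pwrite(fd, buf, SZ, 0);			/* initial contents */
	close(fd);

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}
---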
stripe alignment
Two important variables:
xfs_mount_t.m_swidth (stripe width)
xfs_mount_t.m_dalign (stripe unit)
How do these two work?
xfs_bmap_btalloc
---
/* stripe alignment for allocation is determined by mount parameters */
stripe_align = 0;
if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
stripe_align = mp->m_swidth;
else if (mp->m_dalign)
stripe_align = mp->m_dalign;
...
/*
* If we are not low on available data blocks, and the
* underlying logical volume manager is a stripe, and
* the file offset is zero then try to allocate data
* blocks on stripe unit boundary.
* NOTE: ap->aeof is only set if the allocation length
* is >= the stripe unit and the allocation offset is
* at the end of file.
*/
if (!ap->dfops->dop_low && ap->aeof) {
if (!ap->offset) {
args.alignment = stripe_align;
atype = args.type;
isaligned = 1;
/*
* Adjust for alignment
*/
if (blen > args.alignment && blen <= args.maxlen)
args.minlen = blen - args.alignment;
args.minalignslop = 0;
} else {
/*
* First try an exact bno allocation.
* If it fails then do a near or start bno
* allocation with alignment turned on.
*/
atype = args.type;
tryagain = 1;
args.type = XFS_ALLOCTYPE_THIS_BNO;
args.alignment = 1;
...
}
} else {
args.alignment = 1;
args.minalignslop = 0;
}
args.minleft = ap->minleft;
args.wasdel = ap->wasdel;
args.resv = XFS_AG_RESV_NONE;
args.datatype = ap->datatype;
if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
args.ip = ap->ip;
error = xfs_alloc_vextent(&args);
if (error)
return error;
if (tryagain && args.fsbno == NULLFSBLOCK) {
/*
* Exact allocation failed. Now try with alignment
* turned on.
*/
args.type = atype;
args.fsbno = ap->blkno;
args.alignment = stripe_align;
args.minlen = nextminlen;
args.minalignslop = 0;
isaligned = 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
if (isaligned && args.fsbno == NULLFSBLOCK) {
/*
* allocation failed, so turn off alignment and
* try again.
*/
args.type = atype;
args.fsbno = ap->blkno;
args.alignment = 0;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
---
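The effect of args.alignment is essentially "round the chosen start block up to a multiple of the stripe unit/width". A pure arithmetic model of that rounding (not the kernel code, which additionally has to juggle minlen and minalignslop):
---
#include <stdint.h>
#include <stdio.h>

/* Round a candidate start block up to the alignment boundary. */
static uint64_t align_start(uint64_t bno, uint64_t alignment)
{
	if (alignment <= 1)
		return bno;
	return ((bno + alignment - 1) / alignment) * alignment;
}

int main(void)
{
	uint64_t stripe_align = 16;	/* e.g. a 16-block stripe unit */
	uint64_t candidates[] = { 0, 5, 16, 100, 250 };
	unsigned int i;

	for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++)
		printf("candidate %3llu -> aligned %3llu\n",
		       (unsigned long long)candidates[i],
		       (unsigned long long)align_start(candidates[i],
							stripe_align));
	/*
	 * The fallback sequence above relaxes this step by step: exact bno
	 * first, then an aligned near/start bno allocation, and finally
	 * alignment turned off entirely.
	 */
	return 0;
}
---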
xfs_ialloc_ag_alloc
---
if (unlikely(args.fsbno == NULLFSBLOCK)) {
/*
* Set the alignment for the allocation.
* If stripe alignment is turned on then align at stripe unit
* boundary.
* If the cluster size is smaller than a filesystem block
* then we're doing I/O for inodes in filesystem block size
* pieces, so don't need alignment anyway.
*/
isaligned = 0;
if (args.mp->m_sinoalign) {
ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
args.alignment = args.mp->m_dalign;
isaligned = 1;
} else
args.alignment = xfs_ialloc_cluster_alignment(args.mp);
/*
* Need to figure out where to allocate the inode blocks.
* Ideally they should be spaced out through the a.g.
* For now, just allocate blocks up front.
*/
args.agbno = be32_to_cpu(agi->agi_root);
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
/*
* Allocate a fixed-size extent of inodes.
*/
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.prod = 1;
/*
* Allow space for the inode btree to split.
*/
args.minleft = args.mp->m_in_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
---
(delay allocation / extent reserve)
xfs_eof_alignment <- xfs_iomap_eof_align_last_fsb <- xfs_iomap_write_direct
<- xfs_file_iomap_begin_delay
xfs_preferred_iosize(xfs_mount_t *mp)
{
if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
return PAGE_SIZE;
return (mp->m_swidth ?
(mp->m_swidth << mp->m_sb.sb_blocklog) :
((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
(1 << (int)max(mp->m_readio_log, mp->m_writeio_log)) :
PAGE_SIZE));
}
xfs_vn_getattr
---
switch (inode->i_mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
stat->blksize = BLKDEV_IOSIZE;
stat->rdev = inode->i_rdev;
break;
default:
if (XFS_IS_REALTIME_INODE(ip)) {
/*
* If the file blocks are being allocated from a
* realtime volume, then return the inode's realtime
* extent size or the realtime volume's extent size.
*/
stat->blksize =
xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
} else
stat->blksize = xfs_preferred_iosize(mp);
stat->rdev = 0;
break;
}
---
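The blksize computed above is what applications see as st_blksize, i.e. the hint stdio and friends use to size their buffered I/O. A quick check from userspace:
---
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct stat st;

	if (argc != 2 || stat(argv[1], &st) < 0) {
		perror("stat");
		return 1;
	}
	/*
	 * On XFS this reflects xfs_preferred_iosize() (or the realtime
	 * extent size for realtime inodes).
	 */
	printf("st_blksize = %ld bytes\n", (long)st.st_blksize);
	return 0;
}
---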
space reservation
xfs_mount->m_fdblocks is the in-core counter of free blocks;
xfs_mod_fdblocks is the helper that modifies it.
---
/*
* Taking blocks away, need to be more accurate the closer we
* are to zero.
*
* If the counter has a value of less than 2 * max batch size,
* then make everything serialise as we are real close to
* ENOSPC.
*/
if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
batch = XFS_FDBLOCKS_BATCH;
percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
}
/*
* lock up the sb for dipping into reserves before releasing the space
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
percpu_counter_add(&mp->m_fdblocks, -delta);
if (!rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail + delta;
if (lcounter >= 0) {
mp->m_resblks_avail = lcounter;
spin_unlock(&mp->m_sb_lock);
return 0;
}
printk_once(KERN_WARNING
"Filesystem \"%s\": reserve blocks depleted! "
"Consider increasing reserve pool size.",
mp->m_fsname);
fdblocks_enospc:
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
---
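The batch logic above is the generic percpu counter at work: every CPU accumulates a local delta and only folds it into the shared count once it exceeds the batch, which is why the comparison has to become exact (batch = 1) as we approach zero. A rough userspace model of the idea using per-thread deltas; all names and numbers here are made up for illustration, compile with -pthread:
---
#include <pthread.h>
#include <stdio.h>

#define NTHREADS	4
#define BATCH		32

static long global_count;			/* the central counter */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static __thread long local_delta;		/* the "per-cpu" part */

/*
 * Fold the local delta into the global counter only once it grows past
 * the batch size: cheap in the common case, fuzzy when read.
 */
static void counter_add_batch(long delta, long batch)
{
	local_delta += delta;
	if (local_delta >= batch || local_delta <= -batch) {
		pthread_mutex_lock(&lock);
		global_count += local_delta;
		pthread_mutex_unlock(&lock);
		local_delta = 0;
	}
}

static void *worker(void *arg)
{
	int i;

	for (i = 0; i < 1000; i++)
		counter_add_batch(1, BATCH);

	/* flush the remainder so the final sum is exact */
	counter_add_batch(0, 1);
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);

	printf("global_count = %ld (expected %d)\n",
	       global_count, NTHREADS * 1000);
	return 0;
}
---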
statfs reads these counters, and the result is what df -h shows:
---
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
fdblocks = percpu_counter_sum(&mp->m_fdblocks);
spin_lock(&mp->m_sb_lock);
statp->f_bsize = sbp->sb_blocksize;
lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
statp->f_blocks = sbp->sb_dblocks - lsize;
spin_unlock(&mp->m_sb_lock);
statp->f_bfree = fdblocks - mp->m_alloc_set_aside;
---
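From userspace these counters surface through statfs(2); the f_bfree below is fdblocks minus the set-aside, which is exactly what df reports:
---
#include <stdio.h>
#include <sys/statfs.h>

int main(int argc, char **argv)
{
	struct statfs sf;

	if (argc != 2 || statfs(argv[1], &sf) < 0) {
		perror("statfs");
		return 1;
	}
	printf("block size: %ld\n", (long)sf.f_bsize);
	printf("blocks    : %llu\n", (unsigned long long)sf.f_blocks);
	printf("free      : %llu\n", (unsigned long long)sf.f_bfree);
	printf("available : %llu\n", (unsigned long long)sf.f_bavail);
	return 0;
}
---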
grow fs
xfs_growfs mountpoint -D size
NOTE: While an XFS filesystem can be grown while mounted, its size cannot be reduced.
xfs grow fs
------------------------
xfs_file_ioctl
---
case XFS_IOC_FSGROWFSDATA: {
xfs_growfs_data_t in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_growfs_data(mp, &in);
mnt_drop_write_file(filp);
return error;
}
---
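From userspace, growing is just this ioctl on any file descriptor inside the mount. The following is a rough sketch of what xfs_growfs(8) does, assuming the xfsprogs development headers provide <xfs/xfs.h> (which pulls in xfs_growfs_data_t, XFS_IOC_FSGROWFSDATA and the geometry ioctl); error handling is minimal:
---
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>	/* xfs_growfs_data_t, XFS_IOC_FSGROWFSDATA, ... */

int main(int argc, char **argv)
{
	struct xfs_fsop_geom geo;
	xfs_growfs_data_t in;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <mountpoint> <newblocks>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* keep the current max-inode-percentage instead of changing it */
	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
		perror("XFS_IOC_FSGEOMETRY");
		return 1;
	}

	in.newblocks = strtoull(argv[2], NULL, 0);	/* new size in fs blocks */
	in.imaxpct = geo.imaxpct;

	if (ioctl(fd, XFS_IOC_FSGROWFSDATA, &in) < 0) {
		perror("XFS_IOC_FSGROWFSDATA");
		return 1;
	}
	close(fd);
	return 0;
}
---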
Let's look at the core part of growing the fs.
xfs_growfs_data_private
---
// new size of the fs
nb = in->newblocks;
if (nb < mp->m_sb.sb_dblocks)
return -EINVAL;
if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
return error;
// read the last sector within the new size to verify the new range is accessible
error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
if (error)
return error;
xfs_buf_relse(bp);
new = nb; /* use new as a temporary here */
nb_mod = do_div(new, mp->m_sb.sb_agblocks);
nagcount = new + (nb_mod != 0);
if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
nagcount--;
nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
if (nb < mp->m_sb.sb_dblocks)
return -EINVAL;
}
new = nb - mp->m_sb.sb_dblocks;
oagcount = mp->m_sb.sb_agcount;
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
error = xfs_initialize_perag(mp, nagcount, &nagimax);
if (error)
return error;
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
/*
* Write new AG headers to disk. Non-transactional, but need to be
^^^^^^^^^^^^^^^^^
* written and completed prior to the growfs transaction being logged.
* To do this, we use a delayed write buffer list and wait for
* submission and IO completion of the list as a whole. This allows the
* IO subsystem to merge all the AG headers in a single AG into a single
* IO and hide most of the latency of the IO from us.
*
* This also means that if we get an error whilst building the buffer
* list to write, we can cancel the entire list without having written
* anything.
*/
INIT_LIST_HEAD(&id.buffer_list);
for (id.agno = nagcount - 1;
id.agno >= oagcount;
id.agno--, new -= id.agsize) {
if (id.agno == nagcount - 1)
id.agsize = nb -
(id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
else
id.agsize = mp->m_sb.sb_agblocks;
error = xfs_ag_init_headers(mp, &id);
if (error) {
xfs_buf_delwri_cancel(&id.buffer_list);
goto out_trans_cancel;
}
}
error = xfs_buf_delwri_submit(&id.buffer_list);
if (error)
goto out_trans_cancel;
xfs_trans_agblocks_delta(tp, id.nfree);
/* If there are new blocks in the old last AG, extend it. */
// at this point id.agno is the id of the old last AG, and new holds the leftover blocks
if (new) {
error = xfs_ag_extend_space(mp, tp, &id, new);
---
/*
* Change the agi length.
*/
error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
agi = XFS_BUF_TO_AGI(bp);
be32_add_cpu(&agi->agi_length, len);
xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
/*
* Change agf length.
*/
error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);
agf = XFS_BUF_TO_AGF(bp);
be32_add_cpu(&agf->agf_length, len);
xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
/*
* Free the new space.
*/
return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
be32_to_cpu(agf->agf_length) - len),
len, &XFS_RMAP_OINFO_SKIP_UPDATE,
XFS_AG_RESV_NONE);
---
if (error)
goto out_trans_cancel;
}
/*
* Update changed superblock fields transactionally. These are not
* seen by the rest of the world until the transaction commit applies
* them atomically to the superblock.
*/
if (nagcount > oagcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
if (nb > mp->m_sb.sb_dblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
nb - mp->m_sb.sb_dblocks);
if (id.nfree)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
// The whole operation of grow fs is a complete transaction.
---
Does the new size of the fs take effect before the growfs transaction completes, or even
before it commits?
There are two places in AG selection that depend on the new geometry:
---
if (++(args->agno) == mp->m_sb.sb_agcount) {
if (args->tp->t_firstblock != NULLFSBLOCK)
args->agno = sagno;
else
args->agno = 0;
}
---
xfs_log_commit_cil
-> xlog_cil_insert_items                // runs first
-> xfs_trans_unreserve_and_mod_sb       // runs after the items are in the CIL
---
/* calculate deltas */
if (tp->t_blk_res > 0)
blkdelta = tp->t_blk_res;
if ((tp->t_fdblocks_delta != 0) &&
(xfs_sb_version_haslazysbcount(&mp->m_sb) ||
(tp->t_flags & XFS_TRANS_SB_DIRTY)))
blkdelta += tp->t_fdblocks_delta;
...
/* apply the per-cpu counters */
if (blkdelta) {
error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
if (error)
goto out;
}
...
/* apply remaining deltas */
spin_lock(&mp->m_sb_lock);
...
if (tp->t_dblocks_delta != 0) {
error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
if (error)
goto out_undo_frextents;
}
if (tp->t_agcount_delta != 0) {
error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
if (error)
goto out_undo_dblocks;
}
...
---
After xfs_trans_unreserve_and_mod_sb, the new size of the fs is visible to data block allocation,
so xfs_alloc_vextent can allocate blocks out of the newly added space.
Invoking xfs_trans_unreserve_and_mod_sb only after xlog_cil_insert_items guarantees that any
following transaction which allocates data blocks lands in the same log ctx as, or after, the
growfs transaction. So if the system crashes in this window, log recovery will not replay a data
block allocation that refers to blocks beyond the recovered fs size.
---
spin_lock(&mp->m_agirotor_lock);
agno = mp->m_agirotor;
if (++mp->m_agirotor >= mp->m_maxagi)
mp->m_agirotor = 0;
spin_unlock(&mp->m_agirotor_lock);
---
m_maxagi, on the other hand, is only updated after the transaction commit,
xfs_growfs_data_private
---
...
error = xfs_trans_commit(tp);
...
if (nagimax)
mp->m_maxagi = nagimax;
---
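The rotor itself is just a wrapping counter clamped by m_maxagi, which is why bumping m_maxagi only after the commit is what makes the new AGs visible to inode allocation. A toy model of that pairing; names and numbers are made up:
---
#include <pthread.h>
#include <stdio.h>

/*
 * Toy model of the m_agirotor / m_maxagi pairing: the rotor hands out AG
 * numbers round-robin but never beyond maxagi, so new AGs only become
 * candidates once maxagi is raised (after the growfs commit).
 */
static unsigned int agirotor;
static unsigned int maxagi = 4;
static pthread_mutex_t rotor_lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned int pick_ag_for_inode(void)
{
	unsigned int agno;

	pthread_mutex_lock(&rotor_lock);
	agno = agirotor;
	if (++agirotor >= maxagi)
		agirotor = 0;
	pthread_mutex_unlock(&rotor_lock);
	return agno;
}

int main(void)
{
	int i;

	for (i = 0; i < 6; i++)
		printf("inode %2d -> AG %u\n", i, pick_ag_for_inode());

	maxagi = 6;		/* as if growfs just added two AGs */
	for (i = 6; i < 12; i++)
		printf("inode %2d -> AG %u\n", i, pick_ag_for_inode());
	return 0;
}
---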