Disk Format
O2CB
Journal Replay
Cache Consistency
The cluster is the smallest allocation unit (a group of contiguous blocks) for regular file data.
There is a default policy for choosing the cluster size:
mkfs.ocfs2/mkfs.c fill_defaults()
---
if (!s->cluster_size) {
switch (s->fs_type) {
case OCFS2_MKFSTYPE_DATAFILES:
case OCFS2_MKFSTYPE_VMSTORE:
s->cluster_size = cluster_size_datafiles(s);
---
if (volume_gigs < 64)
cluster_size = 128;
else if (volume_gigs < 96)
cluster_size = 256;
else if (volume_gigs < 128)
cluster_size = 512;
else
cluster_size = 1024;
return cluster_size * 1024;
---
break;
default:
s->cluster_size = cluster_size_default(s);
---
for (cluster_size = 4096; cluster_size < 1024 * 1024; cluster_size <<= 1) {
cluster_size_bits = get_bits(s, cluster_size);
volume_size =
s->volume_size_in_bytes >> cluster_size_bits;
if (volume_size <= UINT32_MAX) /* i.e. 2^32 - 1 clusters */
break;
}
---
break;
}
}
---
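As a rough illustration of the default policy above, here is a minimal userspace sketch (not the mkfs code itself; the function name is made up) that picks the smallest cluster size keeping the cluster count within 32 bits:
---
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the loop in cluster_size_default(): pick the
 * smallest cluster size (4KB..1MB) so that the cluster count fits in 32 bits. */
static uint32_t pick_default_cluster_size(uint64_t volume_bytes)
{
	uint32_t cluster_size;

	for (cluster_size = 4096; cluster_size < 1024 * 1024; cluster_size <<= 1) {
		uint64_t clusters = volume_bytes / cluster_size;
		if (clusters <= UINT32_MAX)
			break;
	}
	return cluster_size;
}

int main(void)
{
	/* A 16TB volume needs at least 8KB clusters to stay under 2^32 clusters. */
	printf("%u\n", pick_default_cluster_size(16ULL << 40));
	return 0;
}
---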
include/ocfs2/ocfs2.h ocfs2_calc_cluster_groups()
---
uint16_t max_bits = 8 * ocfs2_group_bitmap_size(blocksize, 0, 0);
---
int size = blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap);
---
cgs->cgs_cpg = max_bits;
cgs->cgs_cluster_groups = (clusters + cgs->cgs_cpg - 1) /
cgs->cgs_cpg;
cgs->cgs_tail_group_bits = clusters % cgs->cgs_cpg;
if (cgs->cgs_tail_group_bits == 0)
cgs->cgs_tail_group_bits = cgs->cgs_cpg;
ocfs2_group_desc bitmap
+----------+-------------------+
| | |
+----------+-------------------+
bg_next_group
|
v
ocfs2_group_desc bitmap
+----------+-------------------+
| | |
+----------+-------------------+
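As a worked example (assuming a 4KB blocksize and the 64-byte ocfs2_group_desc header, so 4032 bitmap bytes per group), the numbers come out like this:
---
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed example values: 4KB blocksize, 64-byte group descriptor header. */
	uint32_t blocksize = 4096;
	uint32_t gd_header = 64;                    /* offsetof(ocfs2_group_desc, bg_bitmap) */
	uint32_t cpg = 8 * (blocksize - gd_header); /* 32256 clusters per group */
	uint32_t clusters = 1000000;                /* example volume: 1,000,000 clusters */
	uint32_t groups = (clusters + cpg - 1) / cpg;   /* 32 cluster groups */
	uint32_t tail = clusters % cpg;                 /* 64 clusters in the tail group */

	if (tail == 0)
		tail = cpg;
	printf("cpg=%u groups=%u tail=%u\n", cpg, groups, tail);
	return 0;
}
---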
---
switch (s->fs_type) {
case OCFS2_MKFSTYPE_DATAFILES:
j_blocks = journal_size_datafiles();//8K * 4K
break;
case OCFS2_MKFSTYPE_MAIL:
j_blocks = journal_size_mail(s);//64K * 4K
break;
case OCFS2_MKFSTYPE_VMSTORE:
j_blocks = journal_size_vmstore(s);//32K * 4K
break;
default:
j_blocks = journal_size_default(s);//min(64K * 4K, volume_size/160)
break;
}
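For example, per the comments above, a 20GB volume of the default type gets min(256MB, 20GB/160) = 128MB of journal, and any volume larger than 40GB hits the 64K-block cap (256MB with 4KB blocks).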
ocfs2_dinode.id2.i_chain
struct ocfs2_chain_list {
/*00*/ __le16 cl_cpg; /* Clusters per Block Group */
__le16 cl_bpc; /* Blocks per cluster */
__le16 cl_count; /* Total chains in this list */
__le16 cl_next_free_rec; /* Next unused chain slot */
__le64 cl_reserved1;
/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
};
struct ocfs2_chain_rec {
__le32 c_free; /* Number of free bits in this chain. */
__le32 c_total; /* Number of total bits in this chain */
__le64 c_blkno; /* Physical disk offset (blocks) of 1st group */
};
Each ocfs2_group_desc (with the bitmap that follows it) is linked onto one of the chains:
ocfs2_chain_rec.c_blkno points to the first group of a chain, and bg_next_group links the rest.
global bitmap system inode c_blkno bg_next_group
ocfs2_dinode.id2.i_chain.cl_recs[0] -> ocfs2_group_desc -> ocfs2_group_desc
[1] -> ocfs2_group_desc -> ocfs2_group_desc
[2] -> ocfs2_group_desc -> ocfs2_group_desc
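To make the chaining concrete, here is a minimal read-only walker sketch. The struct layouts are simplified stand-ins modeled on the definitions above, read_block() is a stub, and endianness is ignored; this is not the libocfs2 API:
---
#include <stdint.h>

/* Simplified stand-ins for the on-disk structures shown above. */
struct chain_rec  { uint32_t c_free, c_total; uint64_t c_blkno; };
struct group_desc { uint64_t bg_next_group; /* header + bitmap follow on disk */ };

/* Stub for illustration: plug real block-device I/O in here. */
static int read_block(uint64_t blkno, void *buf)
{
	(void)blkno; (void)buf;
	return -1;
}

/* Walk every group descriptor hanging off one chain record:
 * c_blkno points at the first group, bg_next_group links the rest,
 * and a zero bg_next_group terminates the chain. */
static void walk_chain(const struct chain_rec *rec)
{
	unsigned char buf[4096];	/* assumes a 4KB blocksize */
	uint64_t blkno = rec->c_blkno;

	while (blkno) {
		struct group_desc *gd = (struct group_desc *)buf;

		if (read_block(blkno, buf))
			break;
		/* ... inspect the bitmap that follows the descriptor ... */
		blkno = gd->bg_next_group;
	}
}
---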
mkfs.ocfs2/mkfs.c main()
---
tmprec = &(record[GLOBAL_BITMAP_SYSTEM_INODE][0]);
...
s->global_bm = initialize_bitmap (s, s->volume_size_in_clusters,
s->cluster_size_bits,
"global bitmap", tmprec);
---
initialize_bitmap()
---
for(i = 1; i < s->nr_cluster_groups; i++) {
if (i == (s->nr_cluster_groups - 1))
cpg = s->tail_group_bits;
bitmap->groups[i] = initialize_alloc_group(s, "stupid", bm_record, blkno, chain, cpg, 1);
if (wrapped) {
j = i - recs_per_inode;
bitmap->groups[j]->gd->bg_next_group = blkno;
bitmap->groups[j]->next = bitmap->groups[i];
}
bitmap->groups[chain]->chain_total += bitmap->groups[i]->gd->bg_bits;
bitmap->groups[chain]->chain_free += bitmap->groups[i]->gd->bg_free_bits_count;
blkno += (uint64_t) s->global_cpg << (s->cluster_size_bits - s->blocksize_bits);
chain++;
if (chain >= recs_per_inode) {
chain = 0;
wrapped = 1;
}
}
---
In short, it is an inode that carries the blocks of all of the system inodes.
mkfs.ocfs2/mkfs.c main()
---
tmprec = &(record[GLOBAL_INODE_ALLOC_SYSTEM_INODE][0]);
//blocks_needed() calculates how many blocks all of the system inodes need.
//Then an allocation group is set up to carry them; after this, all of the
//system inodes will be allocated from here.
need = blocks_needed(s);
alloc_bytes_from_bitmap(s, need << s->blocksize_bits,
s->global_bm,
&(crap_rec.extent_off),
&(crap_rec.extent_len));
s->system_group =
initialize_alloc_group(s, "system inode group", tmprec,
crap_rec.extent_off >> s->blocksize_bits,
0, crap_rec.extent_len >> s->cluster_size_bits,
s->cluster_size / s->blocksize);
---
An inode that carries the blocks for the inodes of one slot.
When we allocate an inode, we actually allocate a block, which will carry the
inode (and possibly inline data) later.
ocfs2_mknod()
-> ocfs2_reserve_new_inode()
-> ocfs2_reserve_suballoc_bits() //INODE_ALLOC_SYSTEM_INODE
---
status = ocfs2_inode_lock(alloc_inode, &bh, 1);
//The lock is held during the whole inode allocation process,
//until ocfs2_free_ac_resource(). This is the reservation: nobody
//else can touch the inode bitmap in the meantime.
...
ac->ac_inode = alloc_inode;
ac->ac_alloc_slot = slot;
fe = (struct ocfs2_dinode *) bh->b_data;
free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
le32_to_cpu(fe->id1.bitmap1.i_used);
if (bits_wanted > free_bits) {
// If there is not enough free space, allocate a new block group from the cluster allocator.
}
---
ocfs2_block_group_alloc() is used to grow the INODE_ALLOC_SYSTEM_INODE with a new block group:
ocfs2_block_group_alloc()
---
//Similar to ocfs2_reserve_suballoc_bits(), reserve enough space from the cluster allocator.
//The allocation unit is one group (cl->cl_cpg, i.e. clusters per group).
status = ocfs2_reserve_clusters_with_limit(osb, le16_to_cpu(cl->cl_cpg), max_block, flags, &ac);
//Calculate the needed journal credits
credits = ocfs2_calc_group_alloc_credits(osb->sb,
le16_to_cpu(cl->cl_cpg));
handle = ocfs2_start_trans(osb, credits);
//Do the real allocation and return the buffer_head of the block group header
bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
ac, cl);
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
bh, OCFS2_JOURNAL_ACCESS_WRITE);
alloc_rec = le16_to_cpu(bg->bg_chain);
le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, le16_to_cpu(bg->bg_free_bits_count));
le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
//Link the block group into the chain list.
//In ocfs2_block_group_alloc_contig()->ocfs2_block_group_fill():
//  bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
le16_add_cpu(&cl->cl_next_free_rec, 1);
...
le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, le32_to_cpu(fe->i_clusters)));
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
---
After enough space has been secured, the real allocation is done along the following path:
ocfs2_mknod_locked()
-> ocfs2_claim_new_inode()
-> ocfs2_claim_suballoc_bits()
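Roughly speaking (simplified), the bit claimed there maps straight to a disk block: in the inode suballocator each bit covers one block, so the new inode's block number is the group's bg_blkno plus the claimed bit offset.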
OCFS2 is a shared-disk cluster filesystem.
It needs to track the state of the cluster and serialize concurrent access to the shared disk.
o2cb is the component that does this; it is built into the ocfs2 kernel modules.
+-----------------------------------+ +-----------------------------------+
| ocfs2 | | ocfs2 |
+-------------------------+---------+ +---------+-------------------------+
| dlm glue | | | | dlm glue |
+------------------+------+ | OR | +-------------------------+
| o2dlm | o2hb | | | bdev |
+------+-----------+------+ | | | dlm | /dev/ocfs2_control
| o2nm | o2net | bdev | | |
+------+-----------+----------------+ +---------+
When setting up an ocfs2 cluster, ocfs2 needs to know the cluster configuration:
/etc/ocfs2/cluster.conf
node:
name = node0
cluster = mycluster
number = 0
ip_address = 10.1.0.100
ip_port = 7777
node:
name = node1
cluster = mycluster
number = 1
ip_address = 10.1.0.101
ip_port = 7777
node:
name = node2
cluster = mycluster
number = 2
ip_address = 10.1.0.102
ip_port = 7777
cluster:
name = mycluster
heartbeat_mode = local
node_count = 3
This information is installed into the kernel through configfs:
+---- cluster
+---- test
+---- fence_method
+---- heartbeat
+---- idle_timeout_ms
+---- keepalive_delay_ms
+---- node
+ +---- node0
+ + +---- ipv4_address
+ + +---- ipv4_port
+ + +---- local
+ + +---- num
+ +---- node1
+ + +---- ipv4_address
+ + +---- ipv4_port
+ + +---- local
+ + +---- num
+---- reconnect_delay_ms
The information is maintained in struct o2nm_node.
Regarding the setup process, refer to o2nm_node_ipv4_address/ipv4_port/num/local_store.
When the 'local' attribute of a node is set, a listening socket is set up; it is used
later to establish the connections between the cluster nodes.
o2nm_node_local_store()
-> o2net_start_listening()
Note: there can be only one o2cb cluster per node.
o2nm_cluster_group_make_group()
---
if (o2nm_single_cluster)
	return ERR_PTR(-ENOSPC);
---
The ocfs2 heartbeat is the way the nodes in a cluster know that each other is alive.
It works by reading and writing a shared disk.
We have seen that the ocfs2 heartbeat works on the shared disk, but where exactly does it
read and write?
Look into the code of ocfs2-tools:
ocfs2_fill_heartbeat_desc()
---
//sectsize is determined by BLKSSZGET, i.e. the block device's logical block size
ret = ocfs2_get_device_sectsize(fs->fs_devname, &sectsize);
sectsize_bits = ffs(sectsize) - 1;
//It is a system file of ocfs2
filename = ocfs2_system_inodes[HEARTBEAT_SYSTEM_INODE].si_name;
//We need to load the inode and bmap information
ret = ocfs2_lookup(fs, fs->fs_sysdir_blkno, filename,
strlen(filename), NULL, &blkno);
ret = ocfs2_read_inode(fs, blkno, buf);
di = (struct ocfs2_dinode *)buf;
if (di->id2.i_list.l_tree_depth ||
di->id2.i_list.l_next_free_rec != 1) {
ret = OCFS2_ET_BAD_HEARTBEAT_FILE;
goto leave;
}
rec = &(di->id2.i_list.l_recs[0]);
block_bits = OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
cluster_bits = OCFS2_RAW_SB(fs->fs_super)->s_clustersize_bits;
blocks = ocfs2_rec_clusters(0, rec) << cluster_bits;
blocks >>= block_bits;
start_block = rec->e_blkno << block_bits;
start_block >>= sectsize_bits;
//The region name is the filesystem's uuid
desc->r_name = fs->uuid_str;
desc->r_device_name = fs->fs_devname;
desc->r_block_bytes = sectsize;
desc->r_start_block = start_block;
desc->r_blocks = blocks;
---
This also tells us that the heartbeat device needs to be formatted as ocfs2.
There are two heartbeat modes, local and global.
In local mode, every mount starts a heartbeat region of its own. This is inefficient
when there are many ocfs2 volumes in the cluster; in that case, global
heartbeat is recommended.
In local mode,
mount.ocfs2 main()
-> ocfs2_fill_heartbeat_desc()
-> o2cb_begin_group_join()
---
ret = o2cb_validate_cluster_name(cluster);
ret = o2cb_validate_cluster_flags(cluster, &globalhb);
if (!globalhb)
ret = o2cb_start_heartbeat(cluster, region);
---
-> ret = mount(spec, mo.dir, OCFS2_FS_NAME, mo.flags & ~MS_NOSYS, mo.xtra_opts);
In global mode, 'o2cb start-heartbeat <cluster name>' starts the global heartbeat.
There are two native heartbeat mechanisms in ocfs2:
- read/write of the shared heartbeat file
- network negotiation when the shared disk is unreachable
Every node occupies one block in the heartbeat region.
+----+
| | block -> o2hb_disk_heartbeat_block
+----+
| | block
+----+
| | block
+----+
| | block
+----+
The block size is the logical block size of the block device, and
reads/writes of this size are atomic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
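Each node writes only its own slot. Assuming, as a simplification, that node N owns the N-th block of the region, the byte offset of a node's heartbeat block can be computed like this (illustrative sketch, using the r_start_block/r_block_bytes values filled in by ocfs2_fill_heartbeat_desc above):
---
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: byte offset of node 'node_num's heartbeat block,
 * assuming node N owns block N of the region. r_start_block and
 * r_block_bytes are the values computed by ocfs2_fill_heartbeat_desc(). */
static uint64_t hb_slot_offset(uint64_t r_start_block, uint32_t r_block_bytes,
			       uint32_t node_num)
{
	return (r_start_block + node_num) * (uint64_t)r_block_bytes;
}

int main(void)
{
	/* e.g. region starts at sector 8392, 512-byte sectors, node 2 */
	printf("%llu\n", (unsigned long long)hb_slot_offset(8392, 512, 2));
	return 0;
}
---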
o2hb_region_dev_store()
-> kthread_run o2hb_thread
-> o2hb_do_disk_heartbeat()
// Read in the blocks of all of the nodes
-> o2hb_read_slots(reg, lowest_node, highest_node + 1);
// Write my own node's block out
-> o2hb_issue_node_write(reg, &write_wc);
-> o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, REQ_SYNC);
To determine whether a node is alive, o2hb_check_slot() checks
o2hb_disk_heartbeat_block.hb_seq.
hb_seq is the wall-clock seconds since 1970, obtained through ktime_get_real_seconds():
o2hb_prepare_block()
---
cputime = ktime_get_real_seconds();
if (!cputime)
cputime = 1;
hb_block->hb_seq = cpu_to_le64(cputime);
---
o2hb_check_slot() checks whether hb_seq has changed since the previous sample.
o2hb_check_slot()
---
cputime = le64_to_cpu(hb_block->hb_seq);
if (slot->ds_last_time != cputime)
slot->ds_changed_samples++;
else
slot->ds_equal_samples++;
slot->ds_last_time = cputime;
spin_lock(&o2hb_live_lock);
fire_callbacks:
/* dead nodes only come to life after some number of
* changes at any time during their dead time
*/
if (list_empty(&slot->ds_live_item) &&
slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD (2)) {
set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
/* first on the list generates a callback */
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
^^^^^^^^^^^^^^
slot->ds_node_num);
}
}
/* live nodes only go dead after enough consequtive missed
* samples.. reset the missed counter whenever we see activity
*/
if (slot->ds_equal_samples >= o2hb_dead_threshold(31) || gen_changed) {
clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
^^^^^^^^^^^^^
node, slot->ds_node_num);
}
}
---
Note: O2HB_NODE_UP_CB/O2HB_NODE_DOWN_CB will be discussed in the next section.
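The detection logic above boils down to two counters per slot. Below is a condensed sketch of the idea, not the kernel code, with the thresholds hard-coded to the defaults quoted above:
---
#include <stdbool.h>
#include <stdint.h>

#define LIVE_THRESHOLD 2   /* O2HB_LIVE_THRESHOLD */
#define DEAD_THRESHOLD 31  /* default o2hb dead threshold */

struct slot_state {
	uint64_t last_seq;
	unsigned changed_samples;
	unsigned equal_samples;
	bool live;
};

/* Feed one freshly read hb_seq sample; returns true when liveness flips. */
static bool sample(struct slot_state *s, uint64_t seq)
{
	bool was_live = s->live;

	if (seq != s->last_seq) {
		s->changed_samples++;
		s->equal_samples = 0;
	} else {
		s->equal_samples++;
	}
	s->last_seq = seq;

	if (!s->live && s->changed_samples >= LIVE_THRESHOLD)
		s->live = true;              /* would fire O2HB_NODE_UP_CB */
	if (s->live && s->equal_samples >= DEAD_THRESHOLD)
		s->live = false;             /* would fire O2HB_NODE_DOWN_CB */

	return s->live != was_live;
}
---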
A node depends on the heartbeat IO to announce its liveness, but that IO can time out
due to bugs in software or hardware.
The write timer handles this case:
o2hb_write_timeout()
-> o2quo_disk_timeout()
-> o2quo_fence_self() panic or emergency_restart
There is a special case where the backend of the shared disk runs into trouble.
In that case, all of the nodes would fence themselves due to heartbeat write timeout.
To fix this, a negotiation mechanism was introduced:
Subject: ocfs2: o2hb: add negotiate timer
This series of patches is to fix the issue that when storage down, all
nodes will fence self due to write timeout.
With this patch set, all nodes will keep going until storage back online,
except if the following issue happens, then all nodes will do as before to
fence self.
1. io error got
2. network between nodes down
3. nodes panic
This patch (of 6):
When storage down, all nodes will fence self due to write timeout. The
negotiate timer is designed to avoid this, with it node will wait until
storage up again.
Negotiate timer working in the following way:
1. The timer expires before write timeout timer, its timeout is half
of write timeout now. It is re-queued along with write timeout timer.
If expires, it will send NEGO_TIMEOUT message to master node(node with
lowest node number). This message does nothing but marks a bit in a
bitmap recording which nodes are negotiating timeout on master node.
2. If storage down, nodes will send this message to master node, then
when master node finds its bitmap including all online nodes, it sends
NEGO_APPROVL message to all nodes one by one, this message will
re-queue write timeout timer and negotiate timer. For any node doesn't
receive this message or meets some issue when handling this message, it
will be fenced. If storage up at any time, o2hb_thread will run and
re-queue all the timer, nothing will be affected by these two steps.
To set up the connections between nodes in the cluster, the nodes' ipv4 addresses and ports must be configured first.
This is done through configfs when the cluster is set up; refer to the Nodes section above.
The connections are built up when the heartbeat starts and the node learns that its siblings are alive.
When we set the 'local' attribute of a node, the listening socket is set up:
o2nm_node_local_store()
-> o2net_start_listening()
o2hb_do_disk_heartbeat()
-> o2hb_check_slot()
-> o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, slot->ds_node_num);
o2net_hb_node_up_cb()
-> o2net_set_nn_state(nn, NULL, 0, 0);
-> queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
o2net_start_connect()
---
/* if we're greater we initiate tx, otherwise we accept */
if (o2nm_this_node() <= o2net_num_from_nn(nn))
goto out;
sc = sc_alloc(node);
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
sc->sc_sock = sock; /* freed by sc_kref_release */
sock->sk->sk_allocation = GFP_ATOMIC;
myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
myaddr.sin_port = htons(0); /* any port */
ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
sizeof(myaddr));
...
ret = sc->sc_sock->ops->connect(sc->sc_sock,
(struct sockaddr *)&remoteaddr,
sizeof(remoteaddr),
O_NONBLOCK);
---
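The first check above encodes a simple tie-break so that each node pair ends up with exactly one connection; restated as a sketch:
---
#include <stdbool.h>

/* The tie-break in one line: for any node pair, only the node with the
 * higher node number initiates the TCP connection; the other side accepts. */
static bool i_should_initiate(unsigned my_num, unsigned peer_num)
{
	return my_num > peer_num;
}
---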
When the connection is established, the node pair needs to exchange a handshake.
o2net_state_change()
-> o2net_sc_queue_work(sc, &sc->sc_connect_work);
---
o2net_initialize_handshake();
o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
---
The handshake includes
- o2hb_heartbeat_timeout_ms
- o2net_idle_timeout_ms
- o2net_keepalive_delay_ms
- o2net_reconnect_delay_ms
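Both ends must agree on these values or the connection is torn down. A minimal sketch of that idea follows; the field names are modeled on the list above and are not the actual struct o2net_handshake layout:
---
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical handshake view, modeled on the parameters listed above. */
struct hs {
	uint32_t hb_timeout_ms;
	uint32_t idle_timeout_ms;
	uint32_t keepalive_delay_ms;
	uint32_t reconnect_delay_ms;
};

/* Accept the peer only if every negotiated timeout matches our own. */
static bool handshake_ok(const struct hs *mine, const struct hs *theirs)
{
	return mine->hb_timeout_ms      == theirs->hb_timeout_ms &&
	       mine->idle_timeout_ms    == theirs->idle_timeout_ms &&
	       mine->keepalive_delay_ms == theirs->keepalive_delay_ms &&
	       mine->reconnect_delay_ms == theirs->reconnect_delay_ms;
}
---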
o2net_data_ready()
-> o2net_sc_queue_work(sc, &sc->sc_rx_work)
-> o2net_advance_rx()
-> o2net_check_handshake()
-> o2net_set_nn_state(nn, sc, 1, 0); //valid is true
---
if (!was_valid && valid) {
o2quo_conn_up(o2net_num_from_nn(nn));
cancel_delayed_work(&nn->nn_connect_expired);
printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
"Connected to" : "Accepted connection from",
SC_NODEF_ARGS(sc));
}
---
To understand the purpose of o2net, we need to know which messages it handles.
We can find this out by tracking the message handler registrations, i.e. the callers of o2net_register_handler():
Heartbeat
- O2HB_NEGO_TIMEOUT_MSG
- O2HB_NEGO_APPROVE_MSG
O2DLM
- DLM_MASTER_REQUEST_MSG
- DLM_UNUSED_MSG1
- DLM_ASSERT_MASTER_MSG
- DLM_CREATE_LOCK_MSG
- DLM_CONVERT_LOCK_MSG
- DLM_PROXY_AST_MSG
- DLM_UNLOCK_LOCK_MSG
- DLM_DEREF_LOCKRES_MSG
- DLM_MIGRATE_REQUEST_MSG
- DLM_MIG_LOCKRES_MSG
- DLM_QUERY_JOIN_MSG
- DLM_ASSERT_JOINED_MSG
- DLM_CANCEL_JOIN_MSG
- DLM_EXIT_DOMAIN_MSG
- DLM_MASTER_REQUERY_MSG
- DLM_LOCK_REQUEST_MSG
- DLM_RECO_DATA_DONE_MSG
- DLM_BEGIN_RECO_MSG
- DLM_FINALIZE_RECO_MSG
- DLM_QUERY_REGION
- DLM_QUERY_NODEINFO
- DLM_BEGIN_EXIT_DOMAIN_MSG
- DLM_DEREF_LOCKRES_DONE
A node which has given up on connecting to a majority
of nodes who are still heartbeating will fence itself.
Each established connection also has an idle timer and a keepalive timer; they are
re-armed whenever a message is processed or the handshake is verified:
o2net_process_message()
-> o2net_sc_postpone_idle()
o2net_check_handshake()
-> o2net_sc_reset_idle_timer()
---
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
msecs_to_jiffies(o2net_keepalive_delay()));
o2net_set_sock_timer(sc);
mod_timer(&sc->sc_idle_timeout,
jiffies + msecs_to_jiffies(o2net_idle_timeout()));
---
When the delayed work sc_keepalive_work fires, it sends out a keepalive message;
when sc_idle_timeout expires, it means the peer node has lost its connection with us.
The system service o2cb sets up the ocfs2 cluster configuration at system boot.
o2cb.init.sh
online()
-> online_o2cb()
-> register_cluster_o2cb()
-> o2cb register-cluster cluster
-> start_global_heartbeat_o2cb()
-> o2cb start-heartbeat cluster
When /etc/ocfs2/cluster.conf is set up properly, o2cb installs the
configuration based on it.
Besides providing communication capability (o2net), the main job of o2cb is
to track the state of the nodes in the cluster. This is provided by o2hb.
Let's look at the callbacks for O2HB_NODE_DOWN_CB:
o2net_hb_node_down_cb()
---
o2quo_hb_down(node_num);
if (!node)
return;
if (node_num != o2nm_this_node())
o2net_disconnect_node(node);
---
The dlm registers its own heartbeat-down callback as well (dlm_hb_node_down_cb()):
---
/*
 * This will notify any dlm users that a node in our domain
 * went away without notifying us first.
 */
//Right now, the only such user is ocfs2_do_node_down()
if (test_bit(idx, dlm->domain_map))
	dlm_fire_domain_eviction_callbacks(dlm, idx);
spin_lock(&dlm->spinlock);
__dlm_hb_node_down(dlm, idx);
spin_unlock(&dlm->spinlock);
---
When a node goes down, there may be journaled updates in its per-node journal file
that have not yet been checkpointed to the shared disk.
So,
__ocfs2_recovery_thread()
---
// The node that first succeeds in taking the super lock is responsible for recovery.
// After recovery, the dead node's slot is cleared; the next node that gets the super
// lock sees this and will not redo the recovery.
status = ocfs2_super_lock(osb, 1);
---
status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
status = ocfs2_should_refresh_lock_res(lockres);
if (status) {
	status = ocfs2_refresh_slot_info(osb);
---
...
spin_lock(&osb->osb_lock);
while (rm->rm_used) {
	/* It's always safe to remove entry zero, as we won't
	 * clear it until ocfs2_recover_node() has succeeded. */
	node_num = rm->rm_entries[0];
	spin_unlock(&osb->osb_lock);
	slot_num = ocfs2_node_num_to_slot(osb, node_num);
	if (slot_num == -ENOENT) {
		status = 0;
		goto skip_recovery;
	}
	status = ocfs2_recover_node(osb, node_num, slot_num);
	...
}
---
The jbd2 journal recovery just copies the logged blocks to their target positions.
During this, there is no concurrent access control over the metadata, so we have to
stop all access to the affected inodes' metadata. This is also why all of the nodes
need to know that recovery is ongoing.
ocfs2_inode_lock_full_nested()
---
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
	ocfs2_wait_for_recovery(osb);
---
+----+     +----+     +----+
| N0 |     | N1 |     | N2 |
+----+     +----+     +----+
   |          |          |
   |          |          |
   -----------+-----------
              |
            +---+
            | D |
            +---+
There are two key points to keeping the caches consistent:
- A node holding a PR lock on a file needs to invalidate its cached pages when another
  node wants to upconvert to EX mode.
- A node holding an EX lock on a file needs to flush its data to the shared disk when
  it has to downconvert to PR because another node wants the lock.
ocfs2_data_convert_worker()
---
unmap_mapping_range(mapping, 0, 0, 0);
if (filemap_fdatawrite(mapping)) {
	mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
}
sync_mapping_buffers(mapping);
if (blocking == DLM_LOCK_EX) {
	//If we need to downconvert to NL, drop the cache
	truncate_inode_pages(mapping, 0);
} else {
	/* We only need to wait on the I/O if we're not also
	 * truncating pages because truncate_inode_pages waits
	 * for us above. We don't truncate pages if we're
	 * blocking anything < EXMODE because we want to keep
	 * them around in that case. */
	filemap_fdatawait(mapping);
}
---
A node holding a meta EX lock on a file needs to wait until
the journal is checkpointed
^^^^^^^^^^^^^^^^^^^^^^^^^^^
before downconverting the lock when another node wants it.
Only after the checkpoint can we be sure the metadata has been flushed to disk.
ocfs2_unblock_lock()
---
if (lockres->l_ops->check_downconvert
    && !lockres->l_ops->check_downconvert(lockres, new_level)) {
	goto leave_requeue;
}
---
ocfs2_check_meta_downconvert()
-> ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
-> ocfs2_ci_fully_checkpointed()
---
spin_lock(&trans_inc_lock);
ret = time_after(journal->j_trans_id, ci->ci_last_trans);
spin_unlock(&trans_inc_lock);
---
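The check is just a transaction-id comparison: the inode remembers the id of the last transaction that dirtied its metadata (ci_last_trans), and once the journal's transaction id has moved past it, that metadata is on disk. A hedged sketch of the comparison:
---
#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: an inode's metadata is fully checkpointed once the
 * journal's transaction id has advanced past the last transaction that
 * dirtied the inode (wrap-around handled the same way as time_after()). */
static bool fully_checkpointed(uint32_t journal_trans_id, uint32_t ci_last_trans)
{
	return (int32_t)(journal_trans_id - ci_last_trans) > 0;
}
---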