Disk Format
O2CB
Journal Replay
Cache Consistency
The cluster is the smallest allocation unit (a group of contiguous blocks) for regular file data.
There is a default policy for choosing the cluster size:
mkfs.ocfs2/mkfs.c fill_defaults()
---
if (!s->cluster_size) {
switch (s->fs_type) {
case OCFS2_MKFSTYPE_DATAFILES:
case OCFS2_MKFSTYPE_VMSTORE:
s->cluster_size = cluster_size_datafiles(s);
---
if (volume_gigs < 64)
cluster_size = 128;
else if (volume_gigs < 96)
cluster_size = 256;
else if (volume_gigs < 128)
cluster_size = 512;
else
cluster_size = 1024;
return cluster_size * 1024;
---
break;
default:
s->cluster_size = cluster_size_default(s);
---
for (cluster_size = 4096; cluster_size < 1024 * 1024; cluster_size <<= 1) {
cluster_size_bits = get_bits(s, cluster_size);
volume_size =
s->volume_size_in_bytes >> cluster_size_bits;
if (volume_size <= UINT32_MAX) /* i.e. 2^32 - 1 clusters */
break;
}
---
break;
}
}
---
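As a rough illustration of the default policy above, here is a minimal userspace sketch (not the mkfs code itself; the function name is made up) that picks the smallest cluster size keeping the cluster count within 32 bits:
---
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the loop in cluster_size_default(): pick the
 * smallest cluster size (4KB..1MB) so that the cluster count fits in 32 bits. */
static uint32_t pick_default_cluster_size(uint64_t volume_bytes)
{
	uint32_t cluster_size;

	for (cluster_size = 4096; cluster_size < 1024 * 1024; cluster_size <<= 1) {
		uint64_t clusters = volume_bytes / cluster_size;
		if (clusters <= UINT32_MAX)
			break;
	}
	return cluster_size;
}

int main(void)
{
	/* A 16TB volume needs at least 8KB clusters to stay under 2^32 clusters. */
	printf("%u\n", pick_default_cluster_size(16ULL << 40));
	return 0;
}
---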
include/ocfs2/ocfs2.h ocfs2_calc_cluster_groups()
---
uint16_t max_bits = 8 * ocfs2_group_bitmap_size(blocksize, 0, 0);
---
int size = blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap);
---
cgs->cgs_cpg = max_bits;
cgs->cgs_cluster_groups = (clusters + cgs->cgs_cpg - 1) /
cgs->cgs_cpg;
cgs->cgs_tail_group_bits = clusters % cgs->cgs_cpg;
if (cgs->cgs_tail_group_bits == 0)
cgs->cgs_tail_group_bits = cgs->cgs_cpg;
ocfs2_group_desc bitmap
+----------+-------------------+
| | |
+----------+-------------------+
bg_next_group
|
v
ocfs2_group_desc bitmap
+----------+-------------------+
| | |
+----------+-------------------+
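As a worked example (assuming a 4KB blocksize and the 64-byte ocfs2_group_desc header, so 4032 bitmap bytes per group), the numbers come out like this:
---
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Assumed example values: 4KB blocksize, 64-byte group descriptor header. */
	uint32_t blocksize = 4096;
	uint32_t gd_header = 64;                    /* offsetof(ocfs2_group_desc, bg_bitmap) */
	uint32_t cpg = 8 * (blocksize - gd_header); /* 32256 clusters per group */
	uint32_t clusters = 1000000;                /* example volume: 1,000,000 clusters */
	uint32_t groups = (clusters + cpg - 1) / cpg;   /* 32 cluster groups */
	uint32_t tail = clusters % cpg;                 /* 64 clusters in the tail group */

	if (tail == 0)
		tail = cpg;
	printf("cpg=%u groups=%u tail=%u\n", cpg, groups, tail);
	return 0;
}
---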
---
switch (s->fs_type) {
case OCFS2_MKFSTYPE_DATAFILES:
j_blocks = journal_size_datafiles();//8K * 4K
break;
case OCFS2_MKFSTYPE_MAIL:
j_blocks = journal_size_mail(s);//64K * 4K
break;
case OCFS2_MKFSTYPE_VMSTORE:
j_blocks = journal_size_vmstore(s);//32K * 4K
break;
default:
j_blocks = journal_size_default(s);//min(64K * 4K, volume_size/160)
break;
}
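For example, per the comments above, a 20GB volume of the default type gets min(256MB, 20GB/160) = 128MB of journal, and any volume larger than 40GB hits the 64K-block cap (256MB with 4KB blocks).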
ocfs2_dinode.id2.i_chain
struct ocfs2_chain_list {
/*00*/ __le16 cl_cpg; /* Clusters per Block Group */
__le16 cl_bpc; /* Blocks per cluster */
__le16 cl_count; /* Total chains in this list */
__le16 cl_next_free_rec; /* Next unused chain slot */
__le64 cl_reserved1;
/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
};
struct ocfs2_chain_rec {
__le32 c_free; /* Number of free bits in this chain. */
__le32 c_total; /* Number of total bits in this chain */
__le64 c_blkno; /* Physical disk offset (blocks) of 1st group */
};
Each ocfs2_group_desc (with the bitmap that follows it) is linked onto one of the chains:
ocfs2_chain_rec.c_blkno points to the first group of a chain, and bg_next_group links the rest.
global bitmap system inode c_blkno bg_next_group
ocfs2_dinode.id2.i_chain.cl_recs[0] -> ocfs2_group_desc -> ocfs2_group_desc
[1] -> ocfs2_group_desc -> ocfs2_group_desc
[2] -> ocfs2_group_desc -> ocfs2_group_desc
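To make the chaining concrete, here is a minimal read-only walker sketch. The struct layouts are simplified stand-ins modeled on the definitions above, read_block() is a stub, and endianness is ignored; this is not the libocfs2 API:
---
#include <stdint.h>

/* Simplified stand-ins for the on-disk structures shown above. */
struct chain_rec  { uint32_t c_free, c_total; uint64_t c_blkno; };
struct group_desc { uint64_t bg_next_group; /* header + bitmap follow on disk */ };

/* Stub for illustration: plug real block-device I/O in here. */
static int read_block(uint64_t blkno, void *buf)
{
	(void)blkno; (void)buf;
	return -1;
}

/* Walk every group descriptor hanging off one chain record:
 * c_blkno points at the first group, bg_next_group links the rest,
 * and a zero bg_next_group terminates the chain. */
static void walk_chain(const struct chain_rec *rec)
{
	unsigned char buf[4096];	/* assumes a 4KB blocksize */
	uint64_t blkno = rec->c_blkno;

	while (blkno) {
		struct group_desc *gd = (struct group_desc *)buf;

		if (read_block(blkno, buf))
			break;
		/* ... inspect the bitmap that follows the descriptor ... */
		blkno = gd->bg_next_group;
	}
}
---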
mkfs.ocfs2/mkfs.c main()
---
tmprec = &(record[GLOBAL_BITMAP_SYSTEM_INODE][0]);
...
s->global_bm = initialize_bitmap (s, s->volume_size_in_clusters,
s->cluster_size_bits,
"global bitmap", tmprec);
---
initialize_bitmap()
---
for(i = 1; i < s->nr_cluster_groups; i++) {
if (i == (s->nr_cluster_groups - 1))
cpg = s->tail_group_bits;
bitmap->groups[i] = initialize_alloc_group(s, "stupid", bm_record, blkno, chain, cpg, 1);
if (wrapped) {
j = i - recs_per_inode;
bitmap->groups[j]->gd->bg_next_group = blkno;
bitmap->groups[j]->next = bitmap->groups[i];
}
bitmap->groups[chain]->chain_total += bitmap->groups[i]->gd->bg_bits;
bitmap->groups[chain]->chain_free += bitmap->groups[i]->gd->bg_free_bits_count;
blkno += (uint64_t) s->global_cpg << (s->cluster_size_bits - s->blocksize_bits);
chain++;
if (chain >= recs_per_inode) {
chain = 0;
wrapped = 1;
}
}
---
In short, it is an inode that carries the blocks of all of the system inodes.
mkfs.ocfs2/mkfs.c main()
---
tmprec = &(record[GLOBAL_INODE_ALLOC_SYSTEM_INODE][0]);
//blocks_needed() calculates how many blocks all of the system inodes need.
//Then an allocation group is set up to carry them; after this, all of the
//system inodes will be allocated from here.
need = blocks_needed(s);
alloc_bytes_from_bitmap(s, need << s->blocksize_bits,
s->global_bm,
&(crap_rec.extent_off),
&(crap_rec.extent_len));
s->system_group =
initialize_alloc_group(s, "system inode group", tmprec,
crap_rec.extent_off >> s->blocksize_bits,
0, crap_rec.extent_len >> s->cluster_size_bits,
s->cluster_size / s->blocksize);
---
An inode that carries the blocks for the inodes of one slot.
When we allocate an inode, we actually allocate a block, which will carry the
inode (and possibly inline data) later.
ocfs2_mknod()
-> ocfs2_reserve_new_inode()
-> ocfs2_reserve_suballoc_bits() //INODE_ALLOC_SYSTEM_INODE
---
status = ocfs2_inode_lock(alloc_inode, &bh, 1);
//The lock is held during the whole inode allocation process,
//until ocfs2_free_ac_resource(). This is the reservation: nobody
//else can touch the inode bitmap in the meantime.
...
ac->ac_inode = alloc_inode;
ac->ac_alloc_slot = slot;
fe = (struct ocfs2_dinode *) bh->b_data;
free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
le32_to_cpu(fe->id1.bitmap1.i_used);
if (bits_wanted > free_bits) {
// If there is not enough free space, allocate a new block group from the cluster allocator.
}
---
ocfs2_block_group_alloc() is used to grow the INODE_ALLOC_SYSTEM_INODE with a new block group:
ocfs2_block_group_alloc()
---
//Similar to ocfs2_reserve_suballoc_bits(), reserve enough space from the cluster allocator.
//The allocation unit is one group (cl->cl_cpg, i.e. clusters per group).
status = ocfs2_reserve_clusters_with_limit(osb, le16_to_cpu(cl->cl_cpg), max_block, flags, &ac);
//Calculate the needed journal credits
credits = ocfs2_calc_group_alloc_credits(osb->sb,
le16_to_cpu(cl->cl_cpg));
handle = ocfs2_start_trans(osb, credits);
//Do the real allocation and return the buffer_head of the block group header
bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
ac, cl);
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
bh, OCFS2_JOURNAL_ACCESS_WRITE);
alloc_rec = le16_to_cpu(bg->bg_chain);
le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, le16_to_cpu(bg->bg_free_bits_count));
le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
//Link the block group into the chain list.
//In ocfs2_block_group_alloc_contig()->ocfs2_block_group_fill():
//  bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
le16_add_cpu(&cl->cl_next_free_rec, 1);
...
le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, le32_to_cpu(fe->i_clusters)));
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
---
After enough space has been secured, the real allocation is done along the following path:
ocfs2_mknod_locked()
-> ocfs2_claim_new_inode()
-> ocfs2_claim_suballoc_bits()
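Roughly speaking (simplified), the bit claimed there maps straight to a disk block: in the inode suballocator each bit covers one block, so the new inode's block number is the group's bg_blkno plus the claimed bit offset.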
OCFS2 is a shared-disk cluster filesystem.
It needs to track the state of the cluster and serialize concurrent access to the shared disk.
o2cb is the component that does this; it is built into the ocfs2 kernel modules.
+-----------------------------------+ +-----------------------------------+
| ocfs2 | | ocfs2 |
+-------------------------+---------+ +---------+-------------------------+
| dlm glue | | | | dlm glue |
+------------------+------+ | OR | +-------------------------+
| o2dlm | o2hb | | | bdev |
+------+-----------+------+ | | | dlm | /dev/ocfs2_control
| o2nm | o2net | bdev | | |
+------+-----------+----------------+ +---------+
When setting up an ocfs2 cluster, ocfs2 needs to know the cluster configuration:
/etc/ocfs2/cluster.conf
node:
name = node0
cluster = mycluster
number = 0
ip_address = 10.1.0.100
ip_port = 7777
node:
name = node1
cluster = mycluster
number = 1
ip_address = 10.1.0.101
ip_port = 7777
node:
name = node2
cluster = mycluster
number = 2
ip_address = 10.1.0.102
ip_port = 7777
cluster:
name = mycluster
heartbeat_mode = local
node_count = 3
This information is installed into the kernel through configfs:
+---- cluster
+---- test
+---- fence_method
+---- heartbeat
+---- idle_timeout_ms
+---- keepalive_delay_ms
+---- node
+ +---- node0
+ + +---- ipv4_address
+ + +---- ipv4_port
+ + +---- local
+ + +---- num
+ +---- node1
+ + +---- ipv4_address
+ + +---- ipv4_port
+ + +---- local
+ + +---- num
+---- reconnect_delay_ms
The information is maintained in struct o2nm_node.
Regarding the setup process, refer to o2nm_node_ipv4_address/ipv4_port/num/local_store.
When the 'local' attribute of a node is set, a listening socket is set up; it is used
later to establish the connections between the cluster nodes.
o2nm_node_local_store()
-> o2net_start_listening()
Note: there can be only one o2cb cluster per node.
o2nm_cluster_group_make_group()
---
if (o2nm_single_cluster)
	return ERR_PTR(-ENOSPC);
---
The ocfs2 heartbeat is the way the nodes in a cluster know that each other is alive.
It works by reading and writing a shared disk.
We have seen that the ocfs2 heartbeat works on the shared disk, but where exactly does it
read and write?
Look into the code of ocfs2-tools:
ocfs2_fill_heartbeat_desc()
---
//sectsize is determined by BLKSSZGET, i.e. the block device's logical block size
ret = ocfs2_get_device_sectsize(fs->fs_devname, &sectsize);
sectsize_bits = ffs(sectsize) - 1;
//It is a system file of ocfs2
filename = ocfs2_system_inodes[HEARTBEAT_SYSTEM_INODE].si_name;
//We need to load the inode and bmap information
ret = ocfs2_lookup(fs, fs->fs_sysdir_blkno, filename,
strlen(filename), NULL, &blkno);
ret = ocfs2_read_inode(fs, blkno, buf);
di = (struct ocfs2_dinode *)buf;
if (di->id2.i_list.l_tree_depth ||
di->id2.i_list.l_next_free_rec != 1) {
ret = OCFS2_ET_BAD_HEARTBEAT_FILE;
goto leave;
}
rec = &(di->id2.i_list.l_recs[0]);
block_bits = OCFS2_RAW_SB(fs->fs_super)->s_blocksize_bits;
cluster_bits = OCFS2_RAW_SB(fs->fs_super)->s_clustersize_bits;
blocks = ocfs2_rec_clusters(0, rec) << cluster_bits;
blocks >>= block_bits;
start_block = rec->e_blkno << block_bits;
start_block >>= sectsize_bits;
//The region name is the filesystem's uuid
desc->r_name = fs->uuid_str;
desc->r_device_name = fs->fs_devname;
desc->r_block_bytes = sectsize;
desc->r_start_block = start_block;
desc->r_blocks = blocks;
---
This also tells us that the heartbeat device needs to be formatted as ocfs2.
There are two heartbeat modes, local and global.
In local mode, every mount starts a heartbeat region of its own. This is inefficient
when there are many ocfs2 volumes in the cluster; in that case, global
heartbeat is recommended.
In local mode,
mount.ocfs2 main()
-> ocfs2_fill_heartbeat_desc()
-> o2cb_begin_group_join()
---
ret = o2cb_validate_cluster_name(cluster);
ret = o2cb_validate_cluster_flags(cluster, &globalhb);
if (!globalhb)
ret = o2cb_start_heartbeat(cluster, region);
---
-> ret = mount(spec, mo.dir, OCFS2_FS_NAME, mo.flags & ~MS_NOSYS, mo.xtra_opts);
In global mode, 'o2cb start-heartbeat <cluster name>' starts the global heartbeat.
There are two native heartbeat mechanisms in ocfs2:
- read/write of the shared heartbeat file
- network negotiation when the shared disk is unreachable
Every node occupies one block in the heartbeat region.
+----+
| | block -> o2hb_disk_heartbeat_block
+----+
| | block
+----+
| | block
+----+
| | block
+----+
The block size is the logical block size of the block device, and
reads/writes of this size are atomic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
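Each node writes only its own slot. Assuming, as a simplification, that node N owns the N-th block of the region, the byte offset of a node's heartbeat block can be computed like this (illustrative sketch, using the r_start_block/r_block_bytes values filled in by ocfs2_fill_heartbeat_desc above):
---
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: byte offset of node 'node_num's heartbeat block,
 * assuming node N owns block N of the region. r_start_block and
 * r_block_bytes are the values computed by ocfs2_fill_heartbeat_desc(). */
static uint64_t hb_slot_offset(uint64_t r_start_block, uint32_t r_block_bytes,
			       uint32_t node_num)
{
	return (r_start_block + node_num) * (uint64_t)r_block_bytes;
}

int main(void)
{
	/* e.g. region starts at sector 8392, 512-byte sectors, node 2 */
	printf("%llu\n", (unsigned long long)hb_slot_offset(8392, 512, 2));
	return 0;
}
---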
o2hb_region_dev_store()
-> kthread_run o2hb_thread
-> o2hb_do_disk_heartbeat()
// Read in the blocks of all of the nodes
-> o2hb_read_slots(reg, lowest_node, highest_node + 1);
// Write my own node's block out
-> o2hb_issue_node_write(reg, &write_wc);
-> o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, REQ_SYNC);
To determine whether a node is alive, o2hb_check_slot() checks
o2hb_disk_heartbeat_block.hb_seq.
hb_seq is the wall-clock seconds since 1970, obtained through ktime_get_real_seconds():
o2hb_prepare_block()
---
cputime = ktime_get_real_seconds();
if (!cputime)
cputime = 1;
hb_block->hb_seq = cpu_to_le64(cputime);
---
o2hb_check_slot() checks whether hb_seq has changed since the previous sample.
o2hb_check_slot()
---
cputime = le64_to_cpu(hb_block->hb_seq);
if (slot->ds_last_time != cputime)
slot->ds_changed_samples++;
else
slot->ds_equal_samples++;
slot->ds_last_time = cputime;
spin_lock(&o2hb_live_lock);
fire_callbacks:
/* dead nodes only come to life after some number of
* changes at any time during their dead time
*/
if (list_empty(&slot->ds_live_item) &&
slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD (2)) {
set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
/* first on the list generates a callback */
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
^^^^^^^^^^^^^^
slot->ds_node_num);
}
}
/* live nodes only go dead after enough consequtive missed
* samples.. reset the missed counter whenever we see activity
*/
if (slot->ds_equal_samples >= o2hb_dead_threshold(31) || gen_changed) {
clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
^^^^^^^^^^^^^
node, slot->ds_node_num);
}
}
---
Note: O2HB_NODE_UP_CB/O2HB_NODE_DOWN_CB will be discussed in the next section.
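The detection logic above boils down to two counters per slot. Below is a condensed sketch of the idea, not the kernel code, with the thresholds hard-coded to the defaults quoted above:
---
#include <stdbool.h>
#include <stdint.h>

#define LIVE_THRESHOLD 2   /* O2HB_LIVE_THRESHOLD */
#define DEAD_THRESHOLD 31  /* default o2hb dead threshold */

struct slot_state {
	uint64_t last_seq;
	unsigned changed_samples;
	unsigned equal_samples;
	bool live;
};

/* Feed one freshly read hb_seq sample; returns true when liveness flips. */
static bool sample(struct slot_state *s, uint64_t seq)
{
	bool was_live = s->live;

	if (seq != s->last_seq) {
		s->changed_samples++;
		s->equal_samples = 0;
	} else {
		s->equal_samples++;
	}
	s->last_seq = seq;

	if (!s->live && s->changed_samples >= LIVE_THRESHOLD)
		s->live = true;              /* would fire O2HB_NODE_UP_CB */
	if (s->live && s->equal_samples >= DEAD_THRESHOLD)
		s->live = false;             /* would fire O2HB_NODE_DOWN_CB */

	return s->live != was_live;
}
---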
A node depends on the heartbeat IO to announce its liveness, but that IO can time out
due to bugs in software or hardware.
The write timer handles this case:
o2hb_write_timeout()
-> o2quo_disk_timeout()
-> o2quo_fence_self() panic or emergency_restart
There is a special case where the backend of the shared disk runs into trouble.
In that case, all of the nodes would fence themselves due to heartbeat write timeout.
To fix this, a negotiation mechanism was introduced:
Subject: ocfs2: o2hb: add negotiate timer
This series of patches is to fix the issue that when storage down, all
nodes will fence self due to write timeout.
With this patch set, all nodes will keep going until storage back online,
except if the following issue happens, then all nodes will do as before to
fence self.
1. io error got
2. network between nodes down
3. nodes panic
This patch (of 6):
When storage down, all nodes will fence self due to write timeout. The
negotiate timer is designed to avoid this, with it node will wait until
storage up again.
Negotiate timer working in the following way:
1. The timer expires before write timeout timer, its timeout is half
of write timeout now. It is re-queued along with write timeout timer.
If expires, it will send NEGO_TIMEOUT message to master node(node with
lowest node number). This message does nothing but marks a bit in a
bitmap recording which nodes are negotiating timeout on master node.
2. If storage down, nodes will send this message to master node, then
when master node finds its bitmap including all online nodes, it sends
NEGO_APPROVL message to all nodes one by one, this message will
re-queue write timeout timer and negotiate timer. For any node doesn't
receive this message or meets some issue when handling this message, it
will be fenced. If storage up at any time, o2hb_thread will run and
re-queue all the timer, nothing will be affected by these two steps.
To set up the connections between nodes in the cluster, the nodes' ipv4 addresses and ports must be configured first.
This is done through configfs when the cluster is set up; refer to the Nodes section above.
The connections are built up when the heartbeat starts and the node learns that its siblings are alive.
When we set the 'local' attribute of a node, the listening socket is set up:
o2nm_node_local_store()
-> o2net_start_listening()
o2hb_do_disk_heartbeat()
-> o2hb_check_slot()
-> o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, slot->ds_node_num);
o2net_hb_node_up_cb()
-> o2net_set_nn_state(nn, NULL, 0, 0);
-> queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
o2net_start_connect()
---
/* if we're greater we initiate tx, otherwise we accept */
if (o2nm_this_node() <= o2net_num_from_nn(nn))
goto out;
sc = sc_alloc(node);
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
sc->sc_sock = sock; /* freed by sc_kref_release */
sock->sk->sk_allocation = GFP_ATOMIC;
myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
myaddr.sin_port = htons(0); /* any port */
ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
sizeof(myaddr));
...
ret = sc->sc_sock->ops->connect(sc->sc_sock,
(struct sockaddr *)&remoteaddr,
sizeof(remoteaddr),
O_NONBLOCK);
---
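The first check above encodes a simple tie-break so that each node pair ends up with exactly one connection; restated as a sketch:
---
#include <stdbool.h>

/* The tie-break in one line: for any node pair, only the node with the
 * higher node number initiates the TCP connection; the other side accepts. */
static bool i_should_initiate(unsigned my_num, unsigned peer_num)
{
	return my_num > peer_num;
}
---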
When the connection is established, the node pair needs to exchange a handshake.
o2net_state_change()
-> o2net_sc_queue_work(sc, &sc->sc_connect_work);
---
o2net_initialize_handshake();
o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
---
The handshake includes
- o2hb_heartbeat_timeout_ms
- o2net_idle_timeout_ms
- o2net_keepalive_delay_ms
- o2net_reconnect_delay_ms
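Both ends must agree on these values or the connection is torn down. A minimal sketch of that idea follows; the field names are modeled on the list above and are not the actual struct o2net_handshake layout:
---
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical handshake view, modeled on the parameters listed above. */
struct hs {
	uint32_t hb_timeout_ms;
	uint32_t idle_timeout_ms;
	uint32_t keepalive_delay_ms;
	uint32_t reconnect_delay_ms;
};

/* Accept the peer only if every negotiated timeout matches our own. */
static bool handshake_ok(const struct hs *mine, const struct hs *theirs)
{
	return mine->hb_timeout_ms      == theirs->hb_timeout_ms &&
	       mine->idle_timeout_ms    == theirs->idle_timeout_ms &&
	       mine->keepalive_delay_ms == theirs->keepalive_delay_ms &&
	       mine->reconnect_delay_ms == theirs->reconnect_delay_ms;
}
---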
o2net_data_ready()
-> o2net_sc_queue_work(sc, &sc->sc_rx_work)
-> o2net_advance_rx()
-> o2net_check_handshake()
-> o2net_set_nn_state(nn, sc, 1, 0); //valid is true
---
if (!was_valid && valid) {
o2quo_conn_up(o2net_num_from_nn(nn));
cancel_delayed_work(&nn->nn_connect_expired);
printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
o2nm_this_node() > sc->sc_node->nd_num ?
"Connected to" : "Accepted connection from",
SC_NODEF_ARGS(sc));
}
---
To understand the purpose of o2net, we need to know which messages it handles.
We can find this out by tracking the message handler registrations, i.e. the callers of o2net_register_handler():
Heartbeat
- O2HB_NEGO_TIMEOUT_MSG
- O2HB_NEGO_APPROVE_MSG
O2DLM
- DLM_MASTER_REQUEST_MSG
- DLM_UNUSED_MSG1
- DLM_ASSERT_MASTER_MSG
- DLM_CREATE_LOCK_MSG
- DLM_CONVERT_LOCK_MSG
- DLM_PROXY_AST_MSG
- DLM_UNLOCK_LOCK_MSG
- DLM_DEREF_LOCKRES_MSG
- DLM_MIGRATE_REQUEST_MSG
- DLM_MIG_LOCKRES_MSG
- DLM_QUERY_JOIN_MSG
- DLM_ASSERT_JOINED_MSG
- DLM_CANCEL_JOIN_MSG
- DLM_EXIT_DOMAIN_MSG
- DLM_MASTER_REQUERY_MSG
- DLM_LOCK_REQUEST_MSG
- DLM_RECO_DATA_DONE_MSG
- DLM_BEGIN_RECO_MSG
- DLM_FINALIZE_RECO_MSG
- DLM_QUERY_REGION
- DLM_QUERY_NODEINFO
- DLM_BEGIN_EXIT_DOMAIN_MSG
- DLM_DEREF_LOCKRES_DONE
A node which has given up on connecting to a majority
of nodes who are still heartbeating will fence itself.
Each established connection also has an idle timer and a keepalive timer; they are
re-armed whenever a message is processed or the handshake is verified:
o2net_process_message()
-> o2net_sc_postpone_idle()
o2net_check_handshake()
-> o2net_sc_reset_idle_timer()
---
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
msecs_to_jiffies(o2net_keepalive_delay()));
o2net_set_sock_timer(sc);
mod_timer(&sc->sc_idle_timeout,
jiffies + msecs_to_jiffies(o2net_idle_timeout()));
---
When the delayed work sc_keepalive_work fires, it sends out a keepalive message;
when sc_idle_timeout expires, it means the peer node has lost its connection with us.
The system service o2cb sets up the ocfs2 cluster configuration at system boot.
o2cb.init.sh
online()
-> online_o2cb()
-> register_cluster_o2cb()
-> o2cb register-cluster cluster
-> start_global_heartbeat_o2cb()
-> o2cb start-heartbeat cluster
When /etc/ocfs2/cluster.conf is set up properly, o2cb installs the
configuration based on it.
Besides providing communication capability (o2net), the main job of o2cb is
to track the state of the nodes in the cluster. This is provided by o2hb.
Let's look at the callbacks for O2HB_NODE_DOWN_CB:
o2net_hb_node_down_cb()
---
o2quo_hb_down(node_num);
if (!node)
return;
if (node_num != o2nm_this_node())
o2net_disconnect_node(node);
---
The dlm registers its own heartbeat-down callback as well (dlm_hb_node_down_cb()):
---
/*
 * This will notify any dlm users that a node in our domain
 * went away without notifying us first.
 */
//Right now, the only such user is ocfs2_do_node_down()
if (test_bit(idx, dlm->domain_map))
	dlm_fire_domain_eviction_callbacks(dlm, idx);
spin_lock(&dlm->spinlock);
__dlm_hb_node_down(dlm, idx);
spin_unlock(&dlm->spinlock);
---
When a node goes down, there may be journaled updates in its per-node journal file
that have not yet been checkpointed to the shared disk.
So,
__ocfs2_recovery_thread()
---
// The node that first succeeds in taking the super lock is responsible for recovery.
// After recovery, the dead node's slot is cleared; the next node that gets the super
// lock sees this and will not redo the recovery.
status = ocfs2_super_lock(osb, 1);
---
status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
status = ocfs2_should_refresh_lock_res(lockres);
if (status) {
	status = ocfs2_refresh_slot_info(osb);
---
...
spin_lock(&osb->osb_lock);
while (rm->rm_used) {
	/* It's always safe to remove entry zero, as we won't
	 * clear it until ocfs2_recover_node() has succeeded. */
	node_num = rm->rm_entries[0];
	spin_unlock(&osb->osb_lock);
	slot_num = ocfs2_node_num_to_slot(osb, node_num);
	if (slot_num == -ENOENT) {
		status = 0;
		goto skip_recovery;
	}
	status = ocfs2_recover_node(osb, node_num, slot_num);
	...
}
---
The jbd2 journal recovery just copies the logged blocks to their target positions.
During this, there is no concurrent access control over the metadata, so we have to
stop all access to the affected inodes' metadata. This is also why all of the nodes
need to know that recovery is ongoing.
ocfs2_inode_lock_full_nested()
---
if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
	ocfs2_wait_for_recovery(osb);
---
+----+     +----+     +----+
| N0 |     | N1 |     | N2 |
+----+     +----+     +----+
   |          |          |
   |          |          |
   -----------+-----------
              |
            +---+
            | D |
            +---+
There are two key points to keeping the caches consistent:
- A node holding a PR lock on a file needs to invalidate its cached pages when another
  node wants to upconvert to EX mode.
- A node holding an EX lock on a file needs to flush its data to the shared disk when
  it has to downconvert to PR because another node wants the lock.
ocfs2_data_convert_worker()
---
unmap_mapping_range(mapping, 0, 0, 0);
if (filemap_fdatawrite(mapping)) {
	mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
}
sync_mapping_buffers(mapping);
if (blocking == DLM_LOCK_EX) {
	//If we need to downconvert to NL, drop the cache
	truncate_inode_pages(mapping, 0);
} else {
	/* We only need to wait on the I/O if we're not also
	 * truncating pages because truncate_inode_pages waits
	 * for us above. We don't truncate pages if we're
	 * blocking anything < EXMODE because we want to keep
	 * them around in that case. */
	filemap_fdatawait(mapping);
}
---
A node holding a meta EX lock on a file needs to wait until
the journal is checkpointed
^^^^^^^^^^^^^^^^^^^^^^^^^^^
before downconverting the lock when another node wants it.
Only after the checkpoint can we be sure the metadata has been flushed to disk.
ocfs2_unblock_lock()
---
if (lockres->l_ops->check_downconvert
    && !lockres->l_ops->check_downconvert(lockres, new_level)) {
	goto leave_requeue;
}
---
ocfs2_check_meta_downconvert()
-> ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
-> ocfs2_ci_fully_checkpointed()
---
spin_lock(&trans_inc_lock);
ret = time_after(journal->j_trans_id, ci->ci_last_trans);
spin_unlock(&trans_inc_lock);
---
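The check is just a transaction-id comparison: the inode remembers the id of the last transaction that dirtied its metadata (ci_last_trans), and once the journal's transaction id has moved past it, that metadata is on disk. A hedged sketch of the comparison:
---
#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: an inode's metadata is fully checkpointed once the
 * journal's transaction id has advanced past the last transaction that
 * dirtied the inode (wrap-around handled the same way as time_after()). */
static bool fully_checkpointed(uint32_t journal_trans_id, uint32_t ci_last_trans)
{
	return (int32_t)(journal_trans_id - ci_last_trans) > 0;
}
---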