BCACHE

Infrastructure

metadata
btree
bucket management
cached data

Infrastructure


closure dance

closure_call // initializes cl->remaining to CLOSURE_RUNNING + 1 and executes .fn

closure_bio_submit // closure_get() then generic_make_request();
                   // the matching closure_put() is invoked in the endio callback.

continue_at // subtracts CLOSURE_RUNNING + 1 from cl->remaining and queues a new .fn
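
A typical accounting walk-through (assumed numbers): closure_call() leaves
remaining = CLOSURE_RUNNING + 1; submitting, say, three bios via
closure_bio_submit() adds 3; continue_at() then subtracts CLOSURE_RUNNING + 1,
leaving remaining = 3, so the queued .fn does not run yet; each endio's
closure_put() subtracts 1, and the third one drops remaining to 0 and queues
the new .fn.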

closure_put_after_sub
---
if (!r) {
        if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
            atomic_set(&cl->remaining,
                   CLOSURE_REMAINING_INITIALIZER);

            // #define CLOSURE_REMAINING_INITIALIZER    (1|CLOSURE_RUNNING)

            closure_queue(cl);
        }
        ...
}
---

CLOSURE_RUNNING means the .fn is running.


closure_wait // adds CLOSURE_WAITING + 1 to cl->remaining and llist_add()s the closure (lockless)

__closure_wake_up // subtracts CLOSURE_WAITING + 1 for every closure on the wait list
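
A minimal sketch of the dance above, with hypothetical step_one/step_two
functions (not actual bcache code); it only uses the standard closure.h API
and system_wq:

---
/* runs only after every closure_put() from step_one's I/O has dropped
 * cl->remaining to 0; closure_return() then ends the chain */
static void step_two(struct closure *cl)
{
    closure_return(cl);
}

static void step_one(struct closure *cl)
{
    /*
     * submit bios here with closure_bio_submit(); each submission does a
     * closure_get(), and its endio does the matching closure_put()
     */
    continue_at(cl, step_two, system_wq); /* drops CLOSURE_RUNNING + 1 */
}

static void kick_off(struct closure *cl, struct closure *parent)
{
    /* sets remaining = CLOSURE_RUNNING + 1 and runs step_one inline */
    closure_call(cl, step_one, NULL, parent);
}
---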

binary heap

          +--+--+--+--+--+
  Array   |11| 7|25|14| 1|
          +--+--+--+--+--+

             11 (0)
           /       \
        7 (1)     25 (2)
       /     \
   14 (3)   1 (4)

  value (index)

As shown in the diagram above, every flat array can be seen as a binary tree.
                   
                       n
                    /     \
                2n + 1  2n + 2

If we want to select the least/greatest value in an array, we only need to iterate
over every non-leaf node in reverse order and pull up the lesser/greater of the three
nodes shown above.

For example,

             11 (0)
           /       \
        7 (1)     25 (2)
       /      \
   14 (3)   1 (4)
                
              |
              v

             11 (0)
           /       \
        1 (1)     25 (2)
       /      \
   14 (3)   7 (4)
              
              |
              v

             1 (0)
           /       \
       11 (1)     25 (2)
       /      \
   14 (3)   7 (4)

-------------------------------------------
A test demo


#include <stdio.h>


struct heap {
    int used;
    int data[10];
};

#define swap(a, b) \
    do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
#define heap_swap(h, i, j)    swap((h)->data[i], (h)->data[j])
#define heap_sift(h, i, cmp)                        \
do {                                    \
    size_t _r, _j = i;                        \
                                    \
    for (; _j * 2 + 1 < (h)->used; _j = _r) {            \
        _r = _j * 2 + 1;                    \
        if (_r + 1 < (h)->used &&                \
            cmp((h)->data[_r], (h)->data[_r + 1]))        \
            _r++;                        \
                                    \
        if (cmp((h)->data[_r], (h)->data[_j]))            \
            break;                        \
        heap_swap(h, _r, _j);                    \
    }                                \
} while (0)


#define heap_pop(h, d, cmp)                        \
({                                    \
    int _r = (h)->used;                        \
    if (_r) {                            \
        (d) = (h)->data[0];                    \
        (h)->used--;                        \
        heap_swap(h, 0, (h)->used);                \
        heap_sift(h, 0, cmp);                    \
    }                                \
    _r;                                \
})

int compare (int a, int b)
{
    return a >= b;
}

int main()
{
    int i, p;

    struct heap h = {10, {12, 9, 21, 15, 5, 30, 100, 1, 56, 78}};

    for (i = h.used / 2 - 1; i >= 0; --i)
        heap_sift(&h, i, compare);

    for (i = 0; i < 10; i++) {
        heap_pop(&h, p, compare);
        printf("%u ", p);
    }
    printf("\n");
}
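
With the missing header added, the demo compiles as plain C and prints the
input in ascending order: 1 5 9 12 15 21 30 56 78 100.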

metadata


bkey


                   +-------+  +-------+  +-------+  +-------+
   cached dev      |bcache0|  |bcache1|  |bcache2|  |bcache3|
                   +---^---+  +---^---+  +---^---+  +---^---+
                       |          |          |          |
                   +----------------------------------------+
   cache set       |                CACHE SET               |
                   +----------------------------------------+
                       |          |          |          |
                    +-----+    +-----+    +-----+    +-----+
   backing dev      | sdb |    | sdc |    | sdd |    | sde |
                    +-----+    +-----+    +-----+    +-----+


Field layout (bit positions), per the KEY_FIELD()/PTR_FIELD() definitions:

bkey->high

    62..60  KEY_PTRS
    59..58  HEADER_SIZE
    57..56  KEY_CSUM
    55      KEY_PINNED
    36      KEY_DIRTY
    35..20  KEY_SIZE
    19..0   KEY_INODE

bkey->low

    63..0   KEY_OFFSET

bkey->ptr[i]

    62..51  PTR_DEV
    50..8   PTR_OFFSET
    7..0    PTR_GEN
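
These accessors are just shift-and-mask helpers generated by BITMASK(). As an
illustration only (a user-space sketch, not the kernel macros), the fields
above can be pulled out of a raw 64-bit value like this:

---
#include <stdint.h>
#include <stdio.h>

/* extract 'size' bits starting at bit 'offset' */
static uint64_t get_bits(uint64_t v, unsigned offset, unsigned size)
{
    return (v >> offset) & ~(~0ULL << size);
}

int main(void)
{
    uint64_t high = 0;

    /* build a bkey->high: inode 5, size 8 sectors, dirty, 1 pointer */
    high |= 5ULL << 0;      /* KEY_INODE, bits 0..19  */
    high |= 8ULL << 20;     /* KEY_SIZE,  bits 20..35 */
    high |= 1ULL << 36;     /* KEY_DIRTY, bit 36      */
    high |= 1ULL << 60;     /* KEY_PTRS,  bits 60..62 */

    printf("inode=%llu size=%llu dirty=%llu ptrs=%llu\n",
           (unsigned long long)get_bits(high, 0, 20),
           (unsigned long long)get_bits(high, 20, 16),
           (unsigned long long)get_bits(high, 36, 1),
           (unsigned long long)get_bits(high, 60, 3));
    return 0;
}
---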

btree


btree node

btree  ->  keys   ->   set[]   ->  data         -> start
           btree_keys  bset_tree   bset            bkey
                                   on-disk format  bkey array
btree->key

COWed btree


COW btree
              +-----------+                      insert
              |k0|k1...|kn|    <------------+--------------+
              +-----------+                 |              |
               |                            |              |
               v                            |              |
        +-----------+                 +-----------+  +-----------+    
        |k0|k1...|kn|                 |k0|k1...|kn|  |k0         |
        +-----------+                 +-----------+  +-----------+
         |                             |   NEW        |   NEW
         v                             v              v
    +-----------+               +-----------+  +-----------+
    |k0|k1...|kn|               |k0|k1...|kn|  |k0         |
    +-----------+               +-----------+  +-----------+
                                     NEW           NEW

|
v  : an arrow denotes the child node referenced by PTR_OFFSET(k, 0)


Or


             +-----------+
             |k0|k1...|kn|
             +-----------+
              |
              v
       +-----------+             Insert
       |k0|k1...|kn|<------------+--------------+
       +-----------+             |              |
            |                    |              |
            v                    v              v
   +-----------+           +-----------+  +-----------+
   |k0|k1...|kn|           |k0|k1...|kn|  |k0         |
   +-----------+           +-----------+  +-----------+
                                NEW            NEW
( There is only one real insert into the original btree, so it seems we could
submit all of the newly allocated btree node writes together under a single
closure, sync on that closure, and only then do the final insert. ) The actual
case is that every insert operation is synchronous.
See,
bch_btree_insert_node
---
    if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
        if (!b->level)
            bch_btree_leaf_dirty(b, journal_ref);
        else
            bch_btree_node_write(b, &cl);
    }
    ...
    /* wait for btree node write if necessary, after unlock */
    closure_sync(&cl);
---
And
bch_btree_insert_node
  -> btree_split
---
    n1 = btree_node_alloc_replacement(b, op);
    ...
    split = set_blocks(btree_bset_first(n1),
               block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;

    if (split) {
        ...
        n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
        ...
        bkey_copy_key(&n2->key, &b->key);

        bch_keylist_add(&parent_keys, &n2->key);

        bch_btree_node_write(n2, &cl);

        mutex_unlock(&n2->write_lock);
        rw_unlock(true, n2);
    } else {
        mutex_lock(&n1->write_lock);
        bch_btree_insert_keys(n1, op, insert_keys, replace_key);
    }

    bch_keylist_add(&parent_keys, &n1->key);

    bch_btree_node_write(n1, &cl);

    mutex_unlock(&n1->write_lock);

    if (n3) {
        /* Depth increases, make a new root */
        mutex_lock(&n3->write_lock);
        bkey_copy_key(&n3->key, &MAX_KEY);
        bch_btree_insert_keys(n3, op, &parent_keys, NULL);
        bch_btree_node_write(n3, &cl);
        mutex_unlock(&n3->write_lock);


        closure_sync(&cl);

        bch_btree_set_root(n3);
        rw_unlock(true, n3);
    } else if (!b->parent) {
        /* Root filled up but didn't need to be split */

        closure_sync(&cl);

        bch_btree_set_root(n1);
    } else {
        /* Split a non root node */

        closure_sync(&cl);

        make_btree_freeing_key(b, parent_keys.top);
        bch_keylist_push(&parent_keys);

        bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
        BUG_ON(!bch_keylist_empty(&parent_keys));
    }
---
An important scenario needs to be handled:
when the new btree node is inserted into the parent, the old one is still there.
             +-----------+
             |k0|k1...|kn|
             +-----------+
              |
              v
       +-----------+             Insert
       |k0|k1...|kn|<------------+--------------+
       +-----------+             |              |
            | It's me !!!        |              |
            v                    v              v
   +-----------+           +-----------+  +-----------+
   |k0|k1...|kn|           |k0|k1...|kn|  |k0         |
   +-----------+           +-----------+  +-----------+
                                NEW            NEW
How does subsequent code identify the old one ?

make_btree_freeing_key

sorting of bkeys

bch_btree_insert_key
---
    while (m != bset_bkey_last(i) &&
           bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
        prev = m, m = bkey_next(m);

    // This says that the bkeys within one bset are kept in ascending order.

    ...
    bch_bset_insert(b, m, k);
copy:    bkey_copy(m, k); // redundant for bch_bset_insert case ?
merged:
    return status;

---
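
A tiny user-space analogue of that insert path (plain ints and a hypothetical
sorted_insert() helper, not bcache code): linear-scan the sorted array for the
insert position, memmove() the tail to open a gap, then copy the new value in,
the same shape as the while loop plus bch_bset_insert() above.

---
#include <stdio.h>
#include <string.h>

static void sorted_insert(int *a, int *used, int v)
{
    int i = 0;

    while (i < *used && a[i] <= v)      /* keys in a set stay ascending */
        i++;

    memmove(&a[i + 1], &a[i], (*used - i) * sizeof(a[0]));
    a[i] = v;
    (*used)++;
}

int main(void)
{
    int a[8] = { 1, 3, 7, 9 }, used = 4, i;

    sorted_insert(a, &used, 5);
    for (i = 0; i < used; i++)
        printf("%d ", a[i]);            /* prints: 1 3 5 7 9 */
    printf("\n");
    return 0;
}
---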

btree_keys  ->   set[]  -> data         -> start
            bset_tree   bset            bkey
                        on-disk format  bkey array

bch_btree_sort_partial
  -> __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
    // search is NULL, so all of the bsets in the btree node will be added to the btree_iter
  -> __btree_sort
    -> btree_mergesort

    btree_iter
                      bkeys
               |-----------------|
               start             end
               small  ---->  big
               ...

    start = bset_tree->data->start
    end   = bset_bkey_last(bset_tree->data)
---
    for (i = iter->used / 2 - 1; i >= 0; --i)
        heap_sift(iter, i, b->ops->sort_cmp);

    // heapify: select the bset whose first bkey is smallest (a bset's "value"
    // here is its first bkey). Since every bset is sorted, iter->data->k is
    // then the smallest bkey among all of the bsets.

    while (!btree_iter_end(iter)) {
        k = NULL;

        if (!k)
            k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
            ---
            if (!btree_iter_end(iter)) {
                bch_btree_iter_next_check(iter);

                ret = iter->data->k;

                // advance this bset's cursor to the next key

                iter->data->k = bkey_next(iter->data->k);
                ...
                if (iter->data->k == iter->data->end)

                // this bset is exhausted; pop it off the heap and re-sift the rest.

                    heap_pop(iter, b, cmp);
                else

                // iter->data[0]'s first key has changed; re-sift the heap.

                    heap_sift(iter, 0, cmp);
            }
            ---

        // __bch_btree_iter_next returns the smallest bkey among all of the
        // bsets in the iter.

        if (bad(b, k))
            continue;


        // merge the bkeys from all bsets, in sorted order, into out.

        if (!last) {
            last = out->start;
            bkey_copy(last, k);
        } else if (!bch_bkey_try_merge(b, last, k)) {
            last = bkey_next(last);
            bkey_copy(last, k);
        }
    }
---
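
To make the loop above concrete, here is a small user-space sketch (plain ints
instead of bkeys, hypothetical names, not bcache code) that merges several
sorted "bsets" the same way: one cursor per bset, kept in a min-heap ordered by
the cursor's current element.

---
#include <stdio.h>

struct cursor {
    const int *k, *end;     /* next element / end of one sorted "bset" */
};

/* sift-down on an array of cursors, ordering by *k (a min-heap) */
static void sift(struct cursor *h, int used, int i)
{
    for (;;) {
        int c = 2 * i + 1;

        if (c >= used)
            break;
        if (c + 1 < used && *h[c + 1].k < *h[c].k)
            c++;
        if (*h[i].k <= *h[c].k)
            break;
        struct cursor tmp = h[i]; h[i] = h[c]; h[c] = tmp;
        i = c;
    }
}

int main(void)
{
    /* three already-sorted "bsets" */
    static const int a[] = { 1, 4, 9 }, b[] = { 2, 3, 10 }, c[] = { 5, 6 };
    struct cursor heap[3] = { { a, a + 3 }, { b, b + 3 }, { c, c + 2 } };
    int used = 3, i;

    /* heapify: the same reverse loop as __bch_btree_iter_init + heap_sift */
    for (i = used / 2 - 1; i >= 0; i--)
        sift(heap, used, i);

    /* like __bch_btree_iter_next: emit the root's key, advance its cursor,
     * pop the cursor when its bset is exhausted, otherwise re-sift */
    while (used) {
        printf("%d ", *heap[0].k++);
        if (heap[0].k == heap[0].end)
            heap[0] = heap[--used];
        sift(heap, used, 0);
    }
    printf("\n");   /* prints: 1 2 3 4 5 6 9 10 */
    return 0;
}
---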
 

bucket management


The caching device is divided into buckets, which are intended to match the physical device's erase blocks.
Every bucket's priority and generation are persisted on disk as a struct bucket_disk, packed into struct prio_set.

struct prio_set {
    __u64            csum;
    __u64            magic;
    __u64            seq;
    __u32            version;
    __u32            pad;

    __u64            next_bucket;

    struct bucket_disk {
        __u16        prio;
        __u8        gen;
    } __attribute((packed)) data[];
};

prios_per_bucket(c) =
    ((bucket_bytes(c) - sizeof(struct prio_set)) / sizeof(struct bucket_disk))

          bucket
       +----------+
       | prio_set |
       +----------+
       |   b_d    |
       +----------+
       |          |
       .          .
       .          .
       |          |
       +----------+

Note:
  b_d - bucket_disk
bch_prio_write() writes all of this out.
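
A rough worked example (user space, with assumed bucket size and bucket count;
it mirrors the prios_per_bucket()/prio_buckets() arithmetic):

---
#include <stdio.h>
#include <stdint.h>

struct prio_set_hdr {           /* the fixed header of struct prio_set */
    uint64_t csum, magic, seq;
    uint32_t version, pad;
    uint64_t next_bucket;
};

struct bucket_disk {
    uint16_t prio;
    uint8_t  gen;
} __attribute__((packed));

int main(void)
{
    size_t bucket_bytes = 512 * 1024;   /* assumed: 512 KiB buckets */
    size_t nbuckets     = 1 << 20;      /* assumed: 1M buckets */

    size_t prios_per_bucket =
        (bucket_bytes - sizeof(struct prio_set_hdr)) /
        sizeof(struct bucket_disk);
    size_t prio_buckets =
        (nbuckets + prios_per_bucket - 1) / prios_per_bucket;

    printf("prios_per_bucket = %zu\n", prios_per_bucket);  /* 174749 */
    printf("prio_buckets     = %zu\n", prio_buckets);      /* 7 */
    return 0;
}
---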

bucket allocation

The following fifos hold free buckets.

They are each sized differently:
cache_alloc
---
    btree_buckets = ca->sb.njournal_buckets ?: 8;
    free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
    if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
                        GFP_KERNEL)) {
        ...
    }

    if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
                            GFP_KERNEL)) {
        ...
    }

    if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
        ...
    }

    if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
        ...
    }

    if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
        ...
    }
---
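
A rough worked example with assumed numbers: with ca->sb.nbuckets = 1,000,000
and 128 journal buckets, roundup_pow_of_two(1,000,000) = 1,048,576, so
free = 1024. RESERVE_BTREE then holds 128 entries, RESERVE_PRIO holds
prio_buckets(ca) entries, RESERVE_MOVINGGC and RESERVE_NONE hold 1024 each,
and free_inc holds 4096.
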
They all start out empty, though, and must be filled by the allocator thread.
bch_allocator_thread first tries to invalidate a bucket and then pushes it onto free_inc.
bch_allocator_thread
  -> invalidate_buckets
    -> invalidate_buckets_lru / _fifo / _random (per the cache replacement policy)
      -> if bch_can_invalidate_bucket
         ---
         return (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
                !atomic_read(&b->pin) &&
                can_inc_bucket_gen(b);
         ---
         -> bch_invalidate_one_bucket
The question is: how does the allocator thread keep away from buckets that are
still in use?

The answer is the gc_mark.
There are two scenarios that need to be considered: buckets holding btree
nodes are marked GC_MARK_METADATA (by __bch_btree_mark_key during gc), and
buckets holding dirty writeback data are marked GC_MARK_DIRTY (see
bch_data_insert_start below). bch_can_invalidate_bucket only accepts buckets
whose mark is unset or GC_MARK_RECLAIMABLE, so the allocator skips both.

buckets generation

Every bucket has a generation. When a bucket is allocated, PTR_GEN() of the bkey records it. Through this dance of bucket generations, we can avoid explicit btree delete operations. See

static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
    uint8_t r = a - b;

    // 'r > 128' defends against the case b > a (wraparound); treat it as not stale

    return r > 128U ? 0 : r;
}

static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
                unsigned int i)
{
    return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
}
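
A quick worked example: if the bucket's in-memory gen is 7 and the key's
PTR_GEN is 5, ptr_stale() returns 2 (the pointer is stale); if both are 7 it
returns 0 (still valid); and if PTR_GEN is 8 (i.e. b > a), r = 255 > 128, so
0 is returned and the pointer is treated as not stale.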

But there is one scenario that needs to be noted.


         +-------------+
         |A-5| ... |B-5|
         +-------------+
          |          |
          v          v
   +------------+  +------------+
   |     old    |  |     new    |
   +------------+  +------------+

   increment the gen of A's bucket to 6 (in memory)
                                                    <---- POWER OFF !!!
   write the prio_set out to disk

So when power is lost and bcache is set up again, we get,
         +-------------+
         |A-5| ... |B-5|          Bucket's gen of A is still 5.
         +-------------+                
          |          |                  
          v          v                  
   +------------+  +------------+       
   |     old    |  |     new    |       
   +------------+  +------------+


And we get both old and new for the same KEY_OFFSET & KEY_INODE.

How is this fixed ? With make_btree_freeing_key.

It inserts a key whose key part is ZERO_KEY but which keeps PTR_OFFSET(A) and carries PTR_GEN(A) + 1.

The bkey B and this freeing bkey are inserted together, and the COW btree
mechanism ensures that if bkey B is present, the freeing bkey must be present too.

Then when run_cache_set, bch_btree_check_recurse will identify the freed bkey.
bch_initial_mark_key
---
    for (i = 0; i < KEY_PTRS(k); i++)
        if (ptr_available(c, k, i) &&
            !ptr_stale(c, k, i)) {
            struct bucket *b = PTR_BUCKET(c, k, i);

            b->gen = PTR_GEN(k, i);

            // pick up the incremented generation recorded in the freeing bkey;
            // this ensures that bkey A becomes stale.

            ...
        }

    __bch_btree_mark_key(c, level, k);
    ---

    /*
     * ptr_invalid() can't return true for the keys that mark btree nodes as
     * freed, but since ptr_bad() returns true we'll never actually use them
     * for anything and thus we don't want mark their pointers here
     */
    Question:
        Yes, we never actually use the freed btree nodes for anything, but they
        also will not be reclaimed, right ?

    if (!bkey_cmp(k, &ZERO_KEY))
        return stale;
    ---
---

cached data


COWed cache data

Yes, the bcache cache data is COWed.

cached_dev_write
---
    if (s->iop.bypass) {
        ...
    } else if (s->iop.writeback) {
        bch_writeback_add(dc);
        s->iop.bio = bio;
        ...
    } else {
        ...
    }

insert_data:
    closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
    continue_at(cl, cached_dev_write_complete, NULL);
---
bch_data_insert
  -> bch_data_insert_start
  ---
    do {
        ...
        k = op->insert_keys.top;
        bkey_init(k);
        SET_KEY_INODE(k, op->inode);
        SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);

        // allocate fresh sectors (possibly a new bucket); never overwrite in place

        if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
                       op->write_point, op->write_prio,
                       op->writeback))
            goto err;

        n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);

        n->bi_end_io    = bch_data_insert_endio;
        n->bi_private    = cl;

        if (op->writeback) {
            SET_KEY_DIRTY(k, true);

            for (i = 0; i < KEY_PTRS(k); i++)
                SET_GC_MARK(PTR_BUCKET(op->c, k, i),
                        GC_MARK_DIRTY);
        }

        SET_KEY_CSUM(k, op->csum);
        if (KEY_CSUM(k))
            bio_csum(n, k);

        bch_keylist_push(&op->insert_keys);

        bio_set_op_attrs(n, REQ_OP_WRITE, 0);
        bch_submit_bbio(n, op->c, k, 0);
    } while (n != bio);

    op->insert_data_done = true;
    continue_at(cl, bch_data_insert_keys, op->wq);
    return;

  ---
The benefit of COWed cache data is:

    allocate a new bucket
    write the data into it
                                    <---- POWER OFF !!!
    write completes

    insert the new bkey into the btree
    insert the freed bkey into the btree
    (make_btree_freeing_key will increment
     the gen of the original bucket)
If power is lost as above, the original data buckets will not be corrupted and
they will still be in the btree, because make_btree_freeing_key has not been
invoked yet.
(If we wrote the data to the original bucket directly and power was lost before
the write completed, the data in that bucket would be corrupted. Even if the
bucket size matched the device block size, it could still be written partially.)