Bio Stacked Layer
dm suspend
table load
cache
snapshot
In bio-based dm and md, generic_make_request may be invoked recursively. When
generic_make_request is invoked recursively, current->bio_list is not NULL, so
the bio is added to the tail of the list. However, this can cause a dependency
issue.
current->bio_list Head <- bio0 <- bio1 <- bio2
|
v
q->make_request_fn
|
v
current->bio_list Head <- bio1 <- bio2 <- bio0a <- bio0b
What if the target driver must wait for bio0 to be completed before it handles
bio1 ?
(I cannot give an actual example here, but such cases should exist in the md or
dm target drivers.)
The correct scenario should be as follows:
current->bio_list Head <- bio0 <- bio1 <- bio2
|
v
q->make_request_fn
|
v
current->bio_list Head <- bio0a <- bio0b <- bio1 <- bio2
Then bio0a and bio0b, which come from bio0, can be handled prior to bio1 and bio2.
This is what we expect.
How is this achieved ?
Look at generic_make_request.
bio_list_on_stack[0] contains bios submitted by the current make_request_fn.
bio_list_on_stack[1] contains bios that were submitted before the current
make_request_fn, but that haven't been processed yet.
if (current->bio_list) {
bio_list_add(¤t->bio_list[0], bio);
goto out;
}
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
BLK_MQ_REQ_NOWAIT : 0;
if (likely(blk_queue_enter(q, flags) == 0)) {
struct bio_list lower, same;
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
/* Hand over the bios on list[0] to list[1]; from here on, list[0]
 * collects the new bios generated during ->make_request_fn. */
ret = q->make_request_fn(q, bio);
blk_queue_exit(q);
/* sort new bios into those for a lower level
* and those for the same level
*/
bio_list_init(&lower);
bio_list_init(&same);
while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
if (q == bio->bi_disk->queue)
bio_list_add(&same, bio);
else
bio_list_add(&lower, bio);
/* now assemble so we handle the lowest level first */
bio_list_merge(&bio_list_on_stack[0], &lower);
bio_list_merge(&bio_list_on_stack[0], &same);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
} else {
if (unlikely(!blk_queue_dying(q) &&
(bio->bi_opf & REQ_NOWAIT)))
bio_wouldblock_error(bio);
else
bio_io_error(bio);
}
bio = bio_list_pop(&bio_list_on_stack[0]); // get the next bio from the head of list[0]
} while (bio);
current->bio_list = NULL; /* deactivate */
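For reference, the bio_list used above is a simple singly linked FIFO chained
through bio->bi_next. A minimal sketch of the helpers (roughly what
include/linux/bio.h provides; details may differ by kernel version):
---
struct bio_list {
	struct bio *head;
	struct bio *tail;
};

static inline void bio_list_init(struct bio_list *bl)
{
	bl->head = bl->tail = NULL;
}

/* append to the tail: recursive submissions end up behind their siblings */
static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
{
	bio->bi_next = NULL;

	if (bl->tail)
		bl->tail->bi_next = bio;
	else
		bl->head = bio;

	bl->tail = bio;
}

/* pop from the head: this is what the do {} while loop above consumes */
static inline struct bio *bio_list_pop(struct bio_list *bl)
{
	struct bio *bio = bl->head;

	if (bio) {
		bl->head = bl->head->bi_next;
		if (!bl->head)
			bl->tail = NULL;
		bio->bi_next = NULL;
	}

	return bio;
}

/* append the whole of bl2 to bl; used to splice lower/same/list[1] back */
static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
{
	if (!bl2->head)
		return;

	if (bl->tail)
		bl->tail->bi_next = bl2->head;
	else
		bl->head = bl2->head;

	bl->tail = bl2->tail;
}
---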
dm_target or ti
/* target limits */
sector_t begin;
sector_t len;
/* If non-zero, maximum size of I/O submitted to a target. */
uint32_t max_io_len;
Why do we need this ?
Consider dm-stripe: if the stripe chunk size is 128K, the max_io_len here
is 128K. In other words, the dm_target is striped at max_io_len granularity.
ti.begin ti.begin + ti.len
| _ _ _ | _ _ _ | _ _ _ | _ _ _ |
\_ _ _ _/
v
ti.max_io_len
max_io_len() returns the maximum IO length that can be issued starting from a
specific sector. It guarantees that the IO crosses neither the target end nor a
max_io_len boundary; __split_and_process_bio splits the bio based on max_io_len.
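A rough sketch of the helper, modeled on drivers/md/dm.c of this era (the exact
code varies by kernel version): the returned length never extends past the
target's end, and if ti->max_io_len is set it never crosses a max_io_len
boundary either.
---
#define dm_target_offset(ti, sector) ((sector) - (ti)->begin)

static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
	sector_t target_offset = dm_target_offset(ti, sector);

	/* distance from this sector to the end of the target */
	return ti->len - target_offset;
}

static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
	sector_t len = max_io_len_target_boundary(sector, ti);
	sector_t offset, max_len;

	/* Does the target need to split even further? */
	if (ti->max_io_len) {
		offset = dm_target_offset(ti, sector);
		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
			max_len = sector_div(offset, ti->max_io_len);
		else
			max_len = offset & (ti->max_io_len - 1);
		/* distance to the next max_io_len boundary */
		max_len = ti->max_io_len - max_len;

		if (len > max_len)
			len = max_len;
	}

	return len;
}
---
dm-stripe, for example, sets ti->max_io_len to its chunk size in the constructor
(via dm_set_target_max_io_len), so every split piece stays inside one chunk.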
dm_make_request
-> __dm_make_request
-> __split_and_process_bio
--
while (ci.sector_count && !error) {
error = __split_and_process_non_flush(&ci);
-> __clone_and_map_data_bio
-> __map_bio
-> generic_make_request
-> ci->sector += len
-> ci->sector_count -= len
...
}
--
cloned bio
dm_io
+------+
| | -> md (dm device)
| | -> io_count (count of cloned bio)
| | -> orig_bio
+------+- - - +-----+
| tio | | | -> io (father dm_io)
+------+, | | -> ti
\ | | -> inside_dm_io (true for embedded tio)
\ +-----+
\ |clone| (embedded cloned bio)
'-+-----+
Note: most of the time there is only one cloned bio, so the embedded tio is
used. Otherwise, a new tio is allocated and its parent dm_io's io_count is
increased to account for it.
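The corresponding structures look roughly like this (drivers/md/dm-core.h; the
layout varies slightly by version). Note how the clone bio is the last member of
dm_target_io, which itself can be embedded in dm_io:
---
struct dm_target_io {
	unsigned magic;
	struct dm_io *io;		/* parent dm_io */
	struct dm_target *ti;
	unsigned target_bio_nr;
	unsigned *len_ptr;
	bool inside_dm_io;		/* true for the embedded tio */
	struct bio clone;		/* the cloned bio itself */
};

struct dm_io {
	unsigned magic;
	struct mapped_device *md;
	blk_status_t status;
	atomic_t io_count;		/* number of outstanding clones */
	struct bio *orig_bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
	struct dm_target_io tio;	/* embedded tio for the common case */
};
---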
Cloned bios share the original bio's biovec, but each points to a different
part of it. This is done in clone_bio.
static int clone_bio(struct dm_target_io *tio, struct bio *bio,
sector_t sector, unsigned len)
{
struct bio *clone = &tio->clone;
__bio_clone_fast(clone, bio);
...
if (bio_op(bio) != REQ_OP_ZONE_REPORT)
bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
clone->bi_iter.bi_size = to_bytes(len);
...
return 0;
}
On the other hand, cloned bios may point to a different disk; this is done by
the ti->type->map() callback.
The original bio stays in the dm layer, while the cloned bios are issued to do
the real IO on its behalf. Only when all the cloned bios have completed can the
original bio be completed.
Before submitting a clone bio, __map_bio assigns it a bi_end_io, clone_endio.
clone_endio
-> dec_pending
--
if (atomic_dec_and_test(&io->io_count)) {
...
io_error = io->status;
bio = io->orig_bio;
end_io_acct(io);
free_io(md, io);
if (io_error == BLK_STS_DM_REQUEUE)
return;
if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
/*
* Preflush done for flush with data, reissue
* without REQ_PREFLUSH.
*/
bio->bi_opf &= ~REQ_PREFLUSH;
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
if (io_error)
bio->bi_status = io_error;
bio_endio(bio);
}
}
--
Yes, a flush has to be sent to all of the target devices.
__split_and_process_bio
-> __send_empty_flush
---
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
-> __clone_and_map_simple_bio
---
__bio_clone_fast(clone, ci->bio);
if (len)
bio_setup_sector(clone, ci->sector, *len);
// bio->bi_iter.bi_sector = sector;
// bio->bi_iter.bi_size = to_bytes(len);
// len is NULL here, so the bio here is an empty bio with only REQ_PREFLUSH
---
---
clone_endio
-> dec_pending
---
if (atomic_dec_and_test(&io->io_count)) {
...
if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
/*
* Preflush done for flush with data, reissue
* without REQ_PREFLUSH.
*/
bio->bi_opf &= ~REQ_PREFLUSH;
queue_io(md, bio);
} else {
...
}
}
---
For dm-linear, the dm device looks like this:
dm-0
+-----+
table->ti[0] | |
| |
+-----+ . ti[1].begin
| | \ sda
ti[1] | | \ +-----+
| | \ | |
+-----+. [a]'+-----+ ((struct linear_c *)ti[1].private)->start
| | \ | | \
ti[2] | | \ | | > ti[1].len
+-----+ \ | | /
' +-----+
| |
'''''''
[a] table->highs[1] (ti[1].begin + ti[1].len - 1)
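dm-linear's mapping is then just an offset calculation on top of this layout;
roughly (drivers/md/dm-linear.c):
---
static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector)
{
	struct linear_c *lc = ti->private;

	/* offset inside the target, plus the start recorded at ctr time */
	return lc->start + dm_target_offset(ti, bi_sector);
}
---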
populate_table is responsible for parsing the table parameters and constructing
the layout above.
populate_table
---
for (i = 0; i < param->target_count; i++) {
r = next_target(spec, next, end, &spec, &target_params);
...
r = dm_table_add_target(table, spec->target_type,
(sector_t) spec->sector_start,
(sector_t) spec->length,
target_params);
---
tgt = t->targets + t->num_targets;
memset(tgt, 0, sizeof(*tgt));
...
tgt->type = dm_get_target_type(type);
...
tgt->table = t;
tgt->begin = start;
tgt->len = len;
tgt->error = "Unknown error";
...
r = dm_split_args(&argc, &argv, params);
...
r = tgt->type->ctr(tgt, argc, argv);
...
t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
---
...
next = spec->next;
}
---
dm-linear is supposed to support changing its mapping and size at runtime.
How is this achieved ?
The key is dev_suspend.
if (!noflush && do_lockfs) {
r = lock_fs(md);
if (r) {
dm_table_presuspend_undo_targets(map);
return r;
}
}
/*
* Here we must make sure that no processes are submitting requests
* to target drivers i.e. no one may be executing
* __split_and_process_bio. This is called from dm_request and
* dm_wq_work.
*
* To get all processes out of __split_and_process_bio in dm_request,
* we take the write lock. To prevent any process from reentering
* __split_and_process_bio from dm_request and quiesce the thread
* (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
* flush_workqueue(md->wq).
*/
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
if (map)
synchronize_srcu(&md->io_barrier);
...
flush_workqueue(md->wq);
/*
* At this point no more requests are entering target request routines.
* We call dm_wait_for_completion to wait for all existing requests
* to finish.
*/
r = dm_wait_for_completion(md, task_state);
if (!r)
set_bit(dmf_suspended_flag, &md->flags);
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
if (map)
synchronize_srcu(&md->io_barrier);
----
DMF_BLOCK_IO_FOR_SUSPEND takes effect here:
dm_make_request
---
map = dm_get_live_table(md, &srcu_idx);
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
dm_put_live_table(md, srcu_idx);
if (!(bio->bi_opf & REQ_RAHEAD))
queue_io(md, bio);
else
bio_io_error(bio);
return ret;
}
ret = dm_process_bio(md, map, bio);
dm_put_live_table(md, srcu_idx);
---
And
dm_wq_work
---
map = dm_get_live_table(md, &srcu_idx);
while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
spin_lock_irq(&md->deferred_lock);
c = bio_list_pop(&md->deferred);
spin_unlock_irq(&md->deferred_lock);
if (!c)
break;
if (dm_request_based(md))
(void) generic_make_request(c);
else
(void) dm_process_bio(md, map, c);
}
dm_put_live_table(md, srcu_idx);
---
The method used to account for the in-flight IOs of bio-based dm is worth looking into.
md_in_flight
-> md_in_flight_bios
---
for_each_possible_cpu(cpu) {
sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
}
---
It uses per-cpu counters rather than a percpu_ref.
The old table will be destroyed. That is OK at this point, because the dm
device has already been suspended.
do_resume
---
do_resume
-> dm_swap_table
-> __bind
---
size = dm_table_get_size(t);
...
__set_size(md, size);
...
rcu_assign_pointer(md->map, (void *)t);
---
---
do_resume
-> dm_resume
-> __dm_resume
-> dm_queue_flush
---
clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
smp_mb__after_atomic();
queue_work(md->wq, &md->work);
---
The main goal of dm suspend/resume is:
We need to be able to change a mapping table under a mounted
filesystem. For example we might want to move some data in
the background. Before the table can be swapped with
dm_bind_table, dm_suspend must be called to flush any in
flight bios and ensure that any further io gets deferred.
How is this achieved ?
Code: __dm_suspend // under md->suspend_lock
Case 1 will be explained later; let's look at the 2nd and 3rd cases next.
case 2:
For bio-based dm
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
if (map)
synchronize_srcu(&md->io_barrier);
This SRCU io_barrier synchronizes with both __dm_make_request and dm_wq_work,
which read the live table under SRCU.
__dm_make_request
---
map = dm_get_live_table(md, &srcu_idx); //lock srcu md->io_barrier
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
dm_put_live_table(md, srcu_idx);
if (!(bio->bi_opf & REQ_RAHEAD))
queue_io(md, bio); //hand the bio to dm_wq_work
else
bio_io_error(bio); //readahead bios are simply failed while suspended
return ret;
}
ret = process_bio(md, map, bio);
dm_put_live_table(md, srcu_idx); //lock srcu md->io_barrier
---
dm_wq_work
---
map = dm_get_live_table(md, &srcu_idx);
while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
spin_lock_irq(&md->deferred_lock);
c = bio_list_pop(&md->deferred);
spin_unlock_irq(&md->deferred_lock);
if (!c)
break;
if (dm_request_based(md))
generic_make_request(c);
else
__split_and_process_bio(md, map, c);
}
dm_put_live_table(md, srcu_idx);
For request-based dm:
if (dm_request_based(md)) {
dm_stop_queue(md->queue);
-> dm_stop_queue
-> blk_mq_quiesce_queue
if (md->kworker_task)
kthread_flush_worker(&md->kworker);
md->kworker is the kthread worker that handles requests on the legacy request-based path.
refer to
dm_old_request_fn
map_tio_request
}
flush_workqueue(md->wq);
What is the flush_workqueue here for ?
The work can be queued by __dm_make_request->queue_io at any time.
case 3:
Wait for the in-flight IOs to be completed or requeued.
r = dm_wait_for_completion(md, task_state);
-> wait !md_in_flight(md) on md->wait
if (!r)
set_bit(dmf_suspended_flag, &md->flags);
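The wait loop itself is a plain wait on md->wait until md_in_flight() drops to
zero; approximately (drivers/md/dm.c, may differ by kernel version):
---
static int dm_wait_for_completion(struct mapped_device *md, long task_state)
{
	int r = 0;
	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(&md->wait, &wait, task_state);

		if (!md_in_flight(md))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	finish_wait(&md->wait, &wait);

	return r;
}
---
The wake-up side is end_io_acct(), which wakes md->wait after decrementing the
in-flight counters.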
add:
__split_and_process_bio
-> init_clone_info
-> alloc_io
-> start_io_acct
dec:
clone_endio
-> dec_pending
-> end_io_acct //dm_io->io_count reaches zero
A bit of explanation about the hash_cell in the dm code.
struct hash_cell {
struct list_head name_list;
struct list_head uuid_list;
char *name;
char *uuid;
struct mapped_device *md;
struct dm_table *new_map;
};
There are two hash lists in the dm code, keyed by the name and by the uuid of
the md respectively.
The parameter structure of a dm table, as passed in through the dm ioctl interface:
-- dm_ioctl {
| __u32 version[3];
| __u32 data_size; /* total size of data passed in, including this struct */
| __u32 data_start; /* offset to start of data relative to
| start of this struct */
| __u32 target_count; /* in/out */
data_start < ...
| char name[DM_NAME_LEN];
| char uuid[DM_UUID_LEN];
| char data[7];
| }
| ...
-- struct dm_target_spec {
-- __u64 sector_start;
| __u64 length;
| __s32 status; /* used when reading from kernel only */
next <
| __u32 next; //Location of the next dm_target_spec.
|
| char target_type[DM_MAX_TYPE_NAME];
| };
| other target parameters
-- struct dm_target_spec {
__u64 sector_start;
__u64 length;
__s32 status; /* used when reading from kernel only */
__u32 next; //Location of the next dm_target_spec.
char target_type[DM_MAX_TYPE_NAME];
};
other target parameters
populate_table will parse the parameters.
populate_table
-> dm_table_add_target(table,
spec->target_type,
spec->sector_start,
spec->length,
target_params);
-> dm_get_target_type
-> type->ctr
table_load does not install the new table; it just saves it into the hash_cell.
down_write(&_hash_lock);
hc = dm_get_mdptr(md);
...
if (hc->new_map)
old_map = hc->new_map;
hc->new_map = t;
up_write(&_hash_lock);
The new table is installed in do_resume:
down_write(&_hash_lock);
hc = __find_device_hash_cell(param);
...
md = hc->md;
new_map = hc->new_map;
hc->new_map = NULL;
param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
up_write(&_hash_lock);
/* Do we need to load a new map ? */
if (new_map) {
/* Suspend if it isn't already suspended */
if (param->flags & DM_SKIP_LOCKFS_FLAG)
suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended_md(md))
dm_suspend(md, suspend_flags);
old_map = dm_swap_table(md, new_map);
-> __bind
-> rcu_assign_pointer(md->map, (void *)t)
-> dm_sync_table(md)
...
}
struct entry {
unsigned hash_next:28;
unsigned prev:28;
unsigned next:28;
unsigned level:6;
bool dirty:1;
bool allocated:1;
bool sentinel:1;
bool pending_work:1;
dm_oblock_t oblock;
};
This is the per-block record of the dm-cache smq policy.
Besides a few attributes and the oblock, there is no cblock in it.
This looks like an effort to reduce its size.
The cblock can be derived through infer_cblock.
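infer_cblock works because cache entries are allocated from a fixed array, so an
entry's index in that array is its cblock; roughly (drivers/md/dm-cache-policy-smq.c):
---
static unsigned to_index(struct entry_space *es, struct entry *e)
{
	BUG_ON(e < es->begin || e >= es->end);
	return e - es->begin;
}

static unsigned get_index(struct entry_alloc *ea, struct entry *e)
{
	return to_index(ea->es, e) - ea->begin;
}

static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
{
	return to_cblock(get_index(&mq->cache_alloc, e));
}
---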
struct queue {
struct entry_space *es;
unsigned nr_elts;
unsigned nr_levels;
struct ilist qs[MAX_LEVELS];
/*
* We maintain a count of the number of entries we would like in each
* level.
*/
unsigned last_target_nr_elts;
unsigned nr_top_levels;
unsigned nr_in_top_levels;
unsigned target_count[MAX_LEVELS];
};
dm-cache map_bio
-> policy_lookup
-> smq_lookup
//under smq_policy->lock
-> __lookup
---
e = h_lookup(&mq->table, oblock);
if (e) {
stats_level_accessed(&mq->cache_stats, e->level);
requeue(mq, e);
*cblock = infer_cblock(mq, e);
return 0;
}
---
Entries are inserted into smq_policy->table by __complete_background_work.
smq_complete_background_work
//under smq_policy->lock
-> __complete_background_work
---
POLICY_PROMOTE
if (success) {
e->oblock = work->oblock;
e->level = NR_CACHE_LEVELS - 1; //Looks like the highest level.
push(mq, e);
-> h_insert(&mq->table, e)
}
POLICY_DEMOTE
if (success) {
h_remove(&mq->table, e);
free_entry(&mq->cache_alloc, e);
}
---
smq_policy->table tracks the blocks that are currently in the cache.
If the block is in the cache:
map_bio
---
if policy_lookup tells us the block is in the cache
if (passthrough_mode(cache)) {
...
} else {
if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
!is_dirty(cache, cblock)) {
remap_to_origin_and_cache(cache, bio, block, cblock);
---
struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
bio_chain(origin_bio, bio);
__remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
-> remap_to_origin
submit_bio(origin_bio);
remap_to_cache(cache, bio, cblock);
---
} else
remap_to_cache_dirty(cache, bio, block, cblock);
---
remap_to_cache(cache, bio, cblock);
if (bio_data_dir(bio) == WRITE) {
set_dirty(cache, cblock);
clear_discard(cache, oblock_to_dblock(cache, oblock));
}
---
}
...
return DM_MAPIO_REMAPPED;
---
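remap_to_cache just rewrites bi_sector from an origin-relative offset to a
cache-device offset; a rough sketch based on drivers/md/dm-cache-target.c:
---
static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_iter.bi_sector;
	sector_t block = from_cblock(cblock);

	bio_set_dev(bio, cache->cache_dev->bdev);
	if (!block_size_is_power_of_two(cache))
		bio->bi_iter.bi_sector =
			(block * cache->sectors_per_block) +
			sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_iter.bi_sector =
			(block << cache->sectors_per_block_shift) |
			(bi_sector & (cache->sectors_per_block - 1));
}
---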
First of all, on a cache miss the bio is remapped to the origin device.
map_bio
---
if (r == -ENOENT) {
struct per_bio_data *pb = get_per_bio_data(bio);
inc_miss_counter(cache, bio);
if (pb->req_nr == 0) {
accounted_begin(cache, bio);
remap_to_origin_clear_discard(cache, bio, block);
-> __remap_to_origin_clear_discard
-> remap_to_origin
} else {
/*
* This is a duplicate writethrough io that is no
* longer needed because the block has been demoted.
*/
bio_endio(bio);
return DM_MAPIO_SUBMITTED;
}
}
---
But what does the smq policy do on a miss ?
smq_lookup
-> __lookup
-> update_hotspot_queue // if no entry in smq_policy->table
---
struct entry *e = h_lookup(&mq->hotspot_table, hb);
if (e) {
stats_level_accessed(&mq->hotspot_stats, e->level);
hi = get_index(&mq->hotspot_alloc, e);
q_requeue(&mq->hotspot, e,
test_and_set_bit(hi, mq->hotspot_hit_bits) ?
0u : mq->hotspot_level_jump,
NULL, NULL);
}
---
There is another entry, in smq_policy->hotspot_table, which represents the
hotspot block containing this oblock.
After the entry is allocated and inserted on the first miss, its level is
upgraded when the block is accessed again; the upgrade step is
hotspot_level_jump.
The level of the entry decides whether the block can be promoted.
__lookup
---
e = h_lookup(&mq->table, oblock);
if (e) {
...
} else {
stats_miss(&mq->cache_stats);
/*
* The hotspot queue only gets updated with misses.
*/
hs_e = update_hotspot_queue(mq, oblock);
pr = should_promote(mq, hs_e, data_dir, fast_copy);
if (pr != PROMOTE_NOT) {
queue_promotion(mq, oblock, work);
*background_work = true;
}
return -ENOENT;
}
---
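should_promote, called above, simply compares the hotspot entry's level against
per-direction thresholds; approximately (dm-cache-policy-smq.c):
---
static enum promote_result maybe_promote(bool promote)
{
	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}

static enum promote_result should_promote(struct smq_policy *mq,
					  struct entry *hs_e,
					  int data_dir, bool fast_promote)
{
	if (data_dir == WRITE) {
		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
			return PROMOTE_TEMPORARY;

		return maybe_promote(hs_e->level >= mq->write_promote_level);
	} else
		return maybe_promote(hs_e->level >= mq->read_promote_level);
}
---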
But the entry level is only bumped when its bit is not already set in
mq->hotspot_hit_bits, so what about later accesses ?
See:
smq_tick
//under mq->lock
-> end_hotspot_period
-> clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
mq->hotspot_hit_bits is cleared here.
When is smq_tick invoked ?
The common path is:
clone_endio
-> cache_end_io
-> policy_tick
-> smq_tick
The promote/demote process needs to be exclusive with normal IO.
Promote:
copy data from origin_dev to cache_dev
modify metadata
cache block becomes active
Demote:
copy data from cache_dev to origin_dev
modify metadata
cache block becomes inactive
How is this done ?
mg_start
-> mg_lock_writes
-> mg_copy // no overwrite_bio
-> mg_full_copy
-> copy
copy
---
o_region.bdev = cache->origin_dev->bdev;
o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
o_region.count = cache->sectors_per_block;
c_region.bdev = cache->cache_dev->bdev;
c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
c_region.count = cache->sectors_per_block;
if (promote)
r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
else
r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
---
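For reference, the regions handed to dm_kcopyd_copy are plain
(bdev, sector, count) descriptors (include/linux/dm-io.h):
---
struct dm_io_region {
	struct block_device *bdev;
	sector_t sector;
	sector_t count;		/* If this is zero the region is ignored. */
};
---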
Before doing the copy, dm-cache tries to lock the block range.
mg_start
-> mg_lock_writes
---
/*
* Prevent writes to the block, but allow reads to continue.
* Unless we're using an overwrite bio, in which case we lock
* everything.
*/
build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
r = dm_cell_lock_v2(cache->prison, &key,
mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
prealloc, &mg->cell);
---
lock_level
#define WRITE_LOCK_LEVEL 0
#define READ_WRITE_LOCK_LEVEL 1
---
return bio_data_dir(bio) == WRITE ?
WRITE_LOCK_LEVEL :
READ_WRITE_LOCK_LEVEL;
---
Usually, the block range is locked with level WRITE_LOCK_LEVEL (0), so read IO
is still granted.
map_bio
-> bio_detain_shared
-> dm_cell_get_v2
-> __get //under prison->lock
---
if (__find_or_insert(prison, key, cell_prealloc, cell)) {
if ((*cell)->exclusive_lock) {
if (lock_level <= (*cell)->exclusive_level) {
bio_list_add(&(*cell)->bios, inmate);
return false;
}
}
(*cell)->shared_count++;
}
---
When an IO is blocked by an ongoing copy, the bio is added to cell->bios;
these bios will be handled by mg_complete.
mg_complete
---
bio_list_init(&bios);
if (mg->cell) {
if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
-> __unlock
-> get bios on cell->bios
free_prison_cell(cache, mg->cell);
}
free_migration(mg);
defer_bios(cache, &bios);
---
When a migration is blocked by ongoing IO:
mg_lock_writes
---
build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
r = dm_cell_lock_v2(cache->prison, &key,
mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
prealloc, &mg->cell);
...
if (r == 0)
mg_copy(&mg->k.ws);
else
quiesce(mg, mg_copy);
-> dm_cell_quiesce_v2
-> __quiesce
-> set cell->quiesce_continuation
---
cache_end_io
-> bio_drop_shared_lock
-> dm_cell_put_v2
-> __put
---
if (!cell->shared_count) {
if (cell->exclusive_lock){
if (cell->quiesce_continuation) {
queue_work(prison->wq, cell->quiesce_continuation);
cell->quiesce_continuation = NULL;
}
} else {
rb_erase(&cell->node, &prison->cells);
return true;
}
}
---
The process of updating the cache metadata is also done under the lock:
mg_full_copy
-> init_continuation(&mg->k, mg_upgrade_lock);
-> copy
-> dm_kcopyd_copy // completion callback is copy_complete
copy_complete
-> queue_continuation
-> queue_work mg_upgrade_lock
mg_upgrade_lock
---
/*
* Now we want the lock to prevent both reads and writes.
*/
r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
READ_WRITE_LOCK_LEVEL);
if (r < 0)
mg_complete(mg, false);
else if (r)
quiesce(mg, mg_update_metadata);
else
mg_update_metadata(ws);
---
mg_update_metadata modifies the cache metadata and finally invokes mg_complete,
which releases the lock.
https://www.clevernetsystems.com/lvm-snapshots-explained/
First, create a dummy device that we will initialize as a PV:
# dd if=/dev/zero of=test0 bs=8192 count=131072
# losetup /dev/loop0 test0
# pvcreate /dev/loop0
# pvs
PV VG Fmt Attr PSize PFree
/dev/loop0 lvm2 a-- 1.00g 1.00g
We now have a 1GB LVM2 Physical Volume.
# vgcreate vg0 /dev/loop0
# vgs
VG #PV #LV #SN Attr VSize VFree
vg0 1 0 0 wz--n- 1020.00m 1020.00m
# lvcreate -n lv0 -l 100 vg0
# lvs
LV VG Attr LSize Pool Origin Data% Move Log Copy% Convert
lv0 vg0 -wi-a---- 400.00m
We now have a Volume Group vg0 and a 400MB Logical Volume lv0. Let’s see what our device mapper looks like
# dmsetup table
vg0-lv0: 0 819200 linear 7:0 2048
We have a single device vg0-lv0, as expected.
Let’s take a snapshot of our Logical Volume:
# lvcreate -s -n snap1 -l 30 /dev/vg0/lv0
# lvs
LV VG Attr LSize Pool Origin Data% Move Log Copy% Convert
lv0 vg0 owi-a-s-- 400.00m
snap1 vg0 swi-a-s-- 120.00m lv0 0.00
We’ve created a 30 extents (120MB) snapshot of lv0 called snap1.
Let’s take a look at our device mapper:
# dmsetup table
vg0-lv0-real: 0 819200 linear 7:0 2048
vg0-snap1-cow: 0 245760 linear 7:0 821248
vg0-lv0: 0 819200 snapshot-origin 252:2
vg0-snap1: 0 819200 snapshot 252:2 252:3 P 8
What can we do with the snapshot ?
For example:
- make a filesystem on vg0-lv0 and create a file containing "will"
- take a snapshot of vg0-lv0 and mount the filesystem from the snapshot; we will also see the file containing "will"
- modify or even delete the file in the directory mounted from vg0-lv0
- we can still find the file containing "will" in the directory mounted from vg0-snap1
It looks like a fork.
Reading from the original volume
The user reads from vg0-lv0. The request is forwarded and the user retrieves data from vg0-lv0-real.
Writing to the original volume
The user writes to vg0-lv0. The original data is first copied from vg0-lv0-real to vg0-snap1-cow (This
is why it’s called COW, Copy On Write). Then the new data is written to vg0-lv0-real. If the user writes
again to vg0-lv0, modifying data that has already been copied to vg0-snap1-cow, the data from vg0-lv0-real
is simply overwritten. The data on vg0-snap1-cow remains the data that was valid at the time of the creation
of the snapshot.
Reading from the snapshot
When the user reads data from vg0-snap1, a lookup is done on vg0-snap1-cow to see if that particular piece of
data has been copied from vg0-lv0-real to vg0-snap1-cow. If that is the case, the value from vg0-snap1-cow is
returned. Otherwise, the data from vg0-lv0-real is returned. The user effectively sees the data that was valid
at the time of creation of the snapshot.
A few important things to note:
Time 0:
Origin Snap0
+--+ +--+
|A0| | |
+--+ +--+
|B0| | |
+--+ +--+
|C0| | |
+--+ +--+
|D0| | |
+--+ +--+
Time 1:
Origin Snap0 Snap1
+--+ +--+ +--+
|A0| | | | |
+--+ +--+ +--+
|B1| |B0| | | B0 is copied to Snap0
+--+ +--+ +--+
|C0| | | | |
+--+ +--+ +--+
|D0| | | | |
+--+ +--+ +--+
Time 2:
Origin Snap0 Snap1
+--+ +--+ +--+
|A0| | | | |
+--+ +--+ +--+
|B1| |B0| | | C0 is copied to Snap0/1
+--+ +--+ +--+
|C2| |C0| |C0|
+--+ +--+ +--+
|D0| | | | |
+--+ +--+ +--+
Time 3:
Origin Snap0 Snap1
+--+ +--+ +--+
|A0| | | | |
+--+ +--+ +--+
|B3| |B0| |B1| B1 is copied to Snap1
+--+ +--+ +--+
|C2| |C0| |C0|
+--+ +--+ +--+
|D0| | | | |
+--+ +--+ +--+
There are two points to note first.
Look at the exception table entry, dm_exception
struct dm_exception {
struct list_head hash_list;
chunk_t old_chunk;
chunk_t new_chunk;
};
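Lookup is a hash on the old (origin) chunk; runs of consecutive chunks are
folded into a single entry via the high bits of new_chunk. Roughly
(drivers/md/dm-snap.c, may differ by kernel version):
---
static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
						chunk_t chunk)
{
	struct list_head *slot;
	struct dm_exception *e;

	slot = &et->table[exception_hash(et, chunk)];
	list_for_each_entry (e, slot, hash_list)
		if (chunk >= e->old_chunk &&
		    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
			return e;

	return NULL;
}
---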
Look at the source code:
origin_map
---
bio_set_dev(bio, o->dev->bdev);
if (unlikely(bio->bi_opf & REQ_PREFLUSH))
return DM_MAPIO_REMAPPED;
if (bio_data_dir(bio) != WRITE)
return DM_MAPIO_REMAPPED;
...
/* Only tell snapshots if this is a write */
return do_origin(o->dev, bio);
-> __origin_write
---
__origin_write
---
Yes, tell every snapshot
list_for_each_entry (snap, snapshots, list) {
...
mutex_lock(&snap->lock);
...
chunk = sector_to_chunk(snap->store, sector);
//Check exception table to see if block is already remapped in
//this snapshot and trigger an exception if not.
e = dm_lookup_exception(&snap->complete, chunk);
if (e)
goto next_snapshot;
//Is there a pending one ?
pe = __lookup_pending_exception(snap, chunk);
if (!pe) {
mutex_unlock(&snap->lock);
pe = alloc_pending_exception(snap);
mutex_lock(&snap->lock);
if (!snap->valid) {
free_pending_exception(pe);
goto next_snapshot;
}
// we released snap->lock above; things may have changed since then
e = dm_lookup_exception(&snap->complete, chunk);
if (e) {
free_pending_exception(pe);
goto next_snapshot;
}
pe = __find_pending_exception(snap, pe, chunk);
if (!pe) {
__invalidate_snapshot(snap, -ENOMEM);
goto next_snapshot;
}
}
//dm will do nothing for the original bio
r = DM_MAPIO_SUBMITTED;
/*
* If an origin bio was supplied, queue it to wait for the
* completion of this exception, and start this one last,
* at the end of the function.
*/
if (bio) {
bio_list_add(&pe->origin_bios, bio);
bio = NULL;
if (!pe->started) {
pe->started = 1;
pe_to_start_last = pe;
}
}
if (!pe->started) {
pe->started = 1;
pe_to_start_now = pe;
}
next_snapshot:
mutex_unlock(&snap->lock);
if (pe_to_start_now) {
start_copy(pe_to_start_now);
pe_to_start_now = NULL;
}
}
...
---
snapshot_map
---
if (bio->bi_opf & REQ_PREFLUSH) {
bio_set_dev(bio, s->cow->bdev);
return DM_MAPIO_REMAPPED;
}
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
...
mutex_lock(&s->lock);
...
//If the block is already remapped - use that, else remap it
e = dm_lookup_exception(&s->complete, chunk);
if (e) {
remap_exception(s, e, bio, chunk);
goto out_unlock;
}
if (bio_data_dir(bio) == WRITE) {
//Writes to the snapshot device are not discussed here.
} else {
//If not remapped, the data on the origin device has not been modified since
//the snapshot was created.
bio_set_dev(bio, s->origin->bdev);
track_chunk(s, bio, chunk);
}
---
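remap_exception redirects the bio to the COW device at the remapped chunk,
keeping the offset within the chunk; roughly (drivers/md/dm-snap.c):
---
static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
			    struct bio *bio, chunk_t chunk)
{
	bio_set_dev(bio, s->cow->bdev);
	bio->bi_iter.bi_sector =
		chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
				(chunk - e->old_chunk)) +
		(bio->bi_iter.bi_sector & s->store->chunk_mask);
}
---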
In start_copy, copy_callback is hooked into the kcopyd copy job.
After the copy is completed, copy_callback is invoked.
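start_copy itself is a kcopyd job from the origin chunk to the newly allocated
COW chunk, with copy_callback as the completion; roughly (dm-snap.c, may differ
by kernel version):
---
static void start_copy(struct dm_snap_pending_exception *pe)
{
	struct dm_snapshot *s = pe->snap;
	struct dm_io_region src, dest;
	struct block_device *bdev = s->origin->bdev;
	sector_t dev_size;

	dev_size = get_dev_size(bdev);

	src.bdev = bdev;
	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
	src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);

	dest.bdev = s->cow->bdev;
	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
	dest.count = src.count;

	/* Hand over to kcopyd */
	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
}
---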
copy_callback
-> complete_exception
-> persistent_commit_exception
-> pending_complete
---
/* Submit any pending write bios */
if (error) {
if (full_bio)
bio_io_error(full_bio);
error_bios(snapshot_bios);
} else {
if (full_bio)
bio_endio(full_bio);
flush_bios(snapshot_bios);
}
retry_origin_bios(s, origin_bios);
free_pending_exception(pe);
---