[Sheepdog] [PACH, RFC] sheep: use atomic for the vnode_info refcount outside the main thread

Yunkai Zhang yunkai.me at gmail.com
Mon May 7 18:09:10 CEST 2012


On Tue, May 8, 2012 at 12:04 AM, Yunkai Zhang <yunkai.me at gmail.com> wrote:
> On Mon, May 7, 2012 at 11:33 PM, Christoph Hellwig <hch at infradead.org> wrote:
>> This is the minimally tested patch to use atomics to refcount
>> the vnode_info structure.  Because the GCC built-in are too ugly to live
>> I added a minimal Linux Kernel style atomic.h wrapper around them.  this
>> was thrown together really quickly, so careful review is welcome, as
>> are pointers to readily available userspace atomic_t implementations.
>>
>> Note that we still never grab the first reference to vnode_info
>> from a worker thread, and always put references from the main thread,
>> which simplifies the concurrency issues greatly.
>>
>>
>> ---
>>  include/atomic.h   |   31 +++++++++++++++++++++++++++++++
>>  sheep/farm/trunk.c |    2 +-
>>  sheep/group.c      |   50 +++++++++++++++++++++++++++++++++++++++++---------
>>  sheep/ops.c        |   10 ++++++----
>>  sheep/sdnet.c      |   10 ++++++----
>>  sheep/sheep_priv.h |   19 ++++---------------
>>  sheep/vdi.c        |   31 +++++++++++++------------------
>>  7 files changed, 102 insertions(+), 51 deletions(-)
>>
>> Index: sheepdog/include/atomic.h
>> ===================================================================
>> --- /dev/null   1970-01-01 00:00:00.000000000 +0000
>> +++ sheepdog/include/atomic.h   2012-05-07 17:13:46.781406382 +0200
>> @@ -0,0 +1,31 @@
>> +#ifndef _SHEEPDOG_ATOMIC_H
>> +#define _SHEEPDOG_ATOMIC_H
>> +
>> +#include <stdbool.h>
>> +
>> +typedef int atomic_t;
>> +
>> +static inline void atomic_init(atomic_t *v, int val)
>> +{
>> +       *v = val;
>> +       __sync_synchronize();
>> +}
>> +
>> +static inline int atomic_read(atomic_t *v)
>> +{
>> +       return *v;
> You can't directly read it! You should use __sync_xxx API to
> read/update in any time.
>
> Use __sync_add_and_fetch(v, 0) to read it here.
>> +}
>> +
>> +static inline void atomic_inc(atomic_t *v)
>> +{
>> +       __sync_add_and_fetch(v, 1);
>> +}
> I suggests use this form:
> static inline atomic_t *atomic_inc(atomic_t *v)
> {
>         return __sync_add_and_fetch(v, 1);
> }
> so that we can get the added result directly -- needn't to call
> atomic_read() again.
>
> I had tried to wrap __sync_xxx API some time ago, because I used them
> in zookeeper driver.
>
> Zookeeper driver three APIs like these:
> atomic_t *atomic_inc(atomic_t *v)
> {
>    return __sync_add_and_fetch(v, 1);
> }
>
> atomic_t *atomic_dec(atomic_t *v)
> {
>    return __sync_sub_and_fetch(v, 1);
> }
>
> atomic_t *atomic_read(atomic_t *v)
> {
>    return __sync_add_and_fetch(v, 0);
> }

Sorry, there were typos, let me correct them:
atomic_t atomic_inc(atomic_t *v)
{
   return __sync_add_and_fetch(v, 1);
}

atomic_t atomic_dec(atomic_t *v)
{
   return __sync_sub_and_fetch(v, 1);
}

atomic_t atomic_read(atomic_t *v)
{
   return __sync_add_and_fetch(v, 0);
}

>
>> +
>> +static inline bool atomic_dec_and_test(atomic_t *v)
>> +{
>> +       if (__sync_sub_and_fetch(v, 1) == 0)
>> +               return true;
>> +       return false;
>> +}
>> +
>> +#endif /* _SHEEPDOG_ATOMIC_H */
>> Index: sheepdog/sheep/farm/trunk.c
>> ===================================================================
>> --- sheepdog.orig/sheep/farm/trunk.c    2012-05-06 23:21:16.537321808 +0200
>> +++ sheepdog/sheep/farm/trunk.c 2012-05-07 17:10:15.005400960 +0200
>> @@ -240,7 +240,7 @@ static int oid_stale(uint64_t oid)
>>        struct sd_vnode *v;
>>        int ret = 1;
>>
>> -       vnodes = get_vnode_info();
>> +       vnodes = get_current_vnode_info();
>>        nr_copies = get_nr_copies(vnodes);
>>        for (i = 0; i < nr_copies; i++) {
>>                v = oid_to_vnode(vnodes, oid, i);
>> Index: sheepdog/sheep/group.c
>> ===================================================================
>> --- sheepdog.orig/sheep/group.c 2012-05-07 10:59:08.769468543 +0200
>> +++ sheepdog/sheep/group.c      2012-05-07 17:17:23.273411925 +0200
>> @@ -9,6 +9,7 @@
>>  * along with this program. If not, see <http://www.gnu.org/licenses/>.
>>  */
>>  #include <assert.h>
>> +#include <atomic.h>
>>  #include <stdio.h>
>>  #include <stdlib.h>
>>  #include <unistd.h>
>> @@ -134,6 +135,13 @@ int get_zones_nr_from(struct sd_node *no
>>        return nr_zones;
>>  }
>>
>> +struct vnode_info {
>> +       struct sd_vnode entries[SD_MAX_VNODES];
>> +       int nr_vnodes;
>> +       int nr_zones;
>> +       atomic_t refcnt;
>> +};
>> +
>>  /*
>>  * If we have less zones available than the desired redundancy we have to do
>>  * with nr_zones copies, sorry.
>> @@ -146,21 +154,44 @@ int get_max_nr_copies_from(struct sd_nod
>>        return min((int)sys->nr_copies, get_zones_nr_from(nodes, nr_nodes));
>>  }
>>
>> -struct vnode_info *get_vnode_info(void)
>> +/*
>> + * Grab an additional reference to the passed in vnode info.
>> + *
>> + * The caller must already hold a reference to vnode_info, this function must
>> + * only be used to grab an additional reference from code that wants the
>> + * vnode information to outlive the request structure.
>> + */
>> +struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info)
>> +{
>> +       assert(atomic_read(&vnode_info->refcnt) > 0);
>> +
>> +       atomic_inc(&vnode_info->refcnt);
>> +       return vnode_info;
>> +}
>> +
>> +/*
>> + * Release a reference to the current vnode information.
>> + *
>> + * Must be called from the main thread.
>> + */
>> +void put_vnode_info(struct vnode_info *vnode_info)
>>  {
>>        assert(is_main_thread());
>> -       assert(current_vnode_info);
>>
>> -       current_vnode_info->refcnt++;
>> -       return current_vnode_info;
>> +       if (atomic_dec_and_test(&vnode_info->refcnt))
>> +               free(vnode_info);
>>  }
>>
>> -void put_vnode_info(struct vnode_info *vnodes)
>> +/*
>> + * Get a reference to the currently active vnode information structure,
>> + * this must only be called from the main thread.
>> + */
>> +struct vnode_info *get_current_vnode_info(void)
>>  {
>>        assert(is_main_thread());
>> +       assert(current_vnode_info);
>>
>> -       if (vnodes && --vnodes->refcnt == 0)
>> -               free(vnodes);
>> +       return grab_vnode_info(current_vnode_info);
>>  }
>>
>>  struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
>> @@ -187,9 +218,10 @@ static int update_vnode_info(void)
>>        vnode_info->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes,
>>                                                vnode_info->entries);
>>        vnode_info->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
>> -       vnode_info->refcnt = 1;
>> +       atomic_init(&vnode_info->refcnt, 1);
>>
>> -       put_vnode_info(current_vnode_info);
>> +       if (current_vnode_info)
>> +               put_vnode_info(current_vnode_info);
>>        current_vnode_info = vnode_info;
>>        return 0;
>>  }
>> Index: sheepdog/sheep/ops.c
>> ===================================================================
>> --- sheepdog.orig/sheep/ops.c   2012-05-07 10:58:00.177466788 +0200
>> +++ sheepdog/sheep/ops.c        2012-05-07 17:17:01.573411373 +0200
>> @@ -63,7 +63,7 @@ struct sd_op_template {
>>
>>  struct flush_work {
>>        struct object_cache *cache;
>> -       struct vnode_info vnode_info;
>> +       struct vnode_info *vnode_info;
>>        struct work work;
>>  };
>>
>> @@ -582,7 +582,7 @@ static void flush_vdi_fn(struct work *wo
>>        struct flush_work *fw = container_of(work, struct flush_work, work);
>>
>>        dprintf("flush vdi %"PRIx32"\n", fw->cache->vid);
>> -       if (object_cache_push(&fw->vnode_info, fw->cache) != SD_RES_SUCCESS)
>> +       if (object_cache_push(fw->vnode_info, fw->cache) != SD_RES_SUCCESS)
>>                eprintf("failed to flush vdi %"PRIx32"\n", fw->cache->vid);
>>  }
>>
>> @@ -590,6 +590,8 @@ static void flush_vdi_done(struct work *
>>  {
>>        struct flush_work *fw = container_of(work, struct flush_work, work);
>>        dprintf("flush vdi %"PRIx32" done\n", fw->cache->vid);
>> +
>> +       put_vnode_info(fw->vnode_info);
>>        free(fw);
>>  }
>>
>> @@ -609,8 +611,8 @@ static int local_flush_vdi(struct reques
>>                        fw->work.fn = flush_vdi_fn;
>>                        fw->work.done = flush_vdi_done;
>>                        fw->cache = cache;
>> -                       memcpy(&fw->vnode_info, req->vnodes,
>> -                               sizeof(fw->vnode_info));
>> +                       fw->vnode_info = grab_vnode_info(req->vnodes);
>> +
>>                        queue_work(sys->flush_wqueue, &fw->work);
>>                }
>>        }
>> Index: sheepdog/sheep/sdnet.c
>> ===================================================================
>> --- sheepdog.orig/sheep/sdnet.c 2012-05-07 10:54:39.669461654 +0200
>> +++ sheepdog/sheep/sdnet.c      2012-05-07 17:20:04.521416055 +0200
>> @@ -150,8 +150,9 @@ static void io_op_done(struct work *work
>>  retry:
>>        req->rq.epoch = sys->epoch;
>>
>> -       put_vnode_info(req->vnodes);
>> -       req->vnodes = get_vnode_info();
>> +       if (req->vnodes)
>> +               put_vnode_info(req->vnodes);
>> +       req->vnodes = get_current_vnode_info();
>>        setup_access_to_local_objects(req);
>>        list_add_tail(&req->cev.event_list, &sys->request_queue);
>>
>> @@ -310,7 +311,7 @@ static void queue_request(struct request
>>         * called before we set up current_vnode_info
>>         */
>>        if (!is_force_op(req->op))
>> -               req->vnodes = get_vnode_info();
>> +               req->vnodes = get_current_vnode_info();
>>
>>        if (is_io_op(req->op)) {
>>                req->work.fn = do_io_request;
>> @@ -376,7 +377,8 @@ static void free_request(struct request
>>        sys->outstanding_data_size -= req->data_length;
>>
>>        list_del(&req->r_siblings);
>> -       put_vnode_info(req->vnodes);
>> +       if (req->vnodes)
>> +               put_vnode_info(req->vnodes);
>>        free(req->data);
>>        free(req);
>>  }
>> Index: sheepdog/sheep/sheep_priv.h
>> ===================================================================
>> --- sheepdog.orig/sheep/sheep_priv.h    2012-05-07 10:59:08.769468543 +0200
>> +++ sheepdog/sheep/sheep_priv.h 2012-05-07 17:11:01.197402145 +0200
>> @@ -63,19 +63,7 @@ struct client_info {
>>        int refcnt;
>>  };
>>
>> -/*
>> - * This should be private in group.c, but the asynchronous VDI deletion
>> - * prevents this currently.
>> - */
>> -struct vnode_info {
>> -       struct sd_vnode entries[SD_MAX_VNODES];
>> -       int nr_vnodes;
>> -       int nr_zones;
>> -       int refcnt;
>> -};
>> -
>> -
>> -struct request;
>> +struct vnode_info;
>>
>>  struct request {
>>        struct event_struct cev;
>> @@ -264,8 +252,9 @@ int get_vdi_attr(struct vnode_info *vnod
>>                int excl, int delete);
>>
>>  int get_zones_nr_from(struct sd_node *nodes, int nr_nodes);
>> -struct vnode_info *get_vnode_info(void);
>> -void put_vnode_info(struct vnode_info *vnodes);
>> +struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
>> +void put_vnode_info(struct vnode_info *vnode_info);
>> +struct vnode_info *get_current_vnode_info(void);
>>
>>  struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
>>                int copy_idx);
>> Index: sheepdog/sheep/vdi.c
>> ===================================================================
>> --- sheepdog.orig/sheep/vdi.c   2012-05-07 10:56:03.861463810 +0200
>> +++ sheepdog/sheep/vdi.c        2012-05-07 17:15:54.865409669 +0200
>> @@ -354,7 +354,7 @@ struct deletion_work {
>>        int count;
>>        uint32_t *buf;
>>
>> -       struct vnode_info vnode_info;
>> +       struct vnode_info *vnode_info;
>>        int delete_error;
>>  };
>>
>> @@ -372,9 +372,9 @@ static int delete_inode(struct deletion_
>>                goto out;
>>        }
>>
>> -       nr_copies = get_nr_copies(&dw->vnode_info);
>> +       nr_copies = get_nr_copies(dw->vnode_info);
>>
>> -       ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
>> +       ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
>>                          (char *)inode, SD_INODE_HEADER_SIZE, 0, nr_copies);
>>        if (ret != SD_RES_SUCCESS) {
>>                ret = SD_RES_EIO;
>> @@ -386,7 +386,7 @@ static int delete_inode(struct deletion_
>>        else
>>                memset(inode->name, 0, sizeof(inode->name));
>>
>> -       ret = write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
>> +       ret = write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
>>                           (char *)inode, SD_INODE_HEADER_SIZE, 0, 0,
>>                           nr_copies, 0);
>>        if (ret != 0) {
>> @@ -416,9 +416,9 @@ static void delete_one(struct work *work
>>                goto out;
>>        }
>>
>> -       nr_copies = get_nr_copies(&dw->vnode_info);
>> +       nr_copies = get_nr_copies(dw->vnode_info);
>>
>> -       ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
>> +       ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
>>                          (void *)inode, sizeof(*inode),
>>                          0, nr_copies);
>>
>> @@ -437,7 +437,7 @@ static void delete_one(struct work *work
>>                        continue;
>>                }
>>
>> -               ret = remove_object(&dw->vnode_info, dw->epoch,
>> +               ret = remove_object(dw->vnode_info, dw->epoch,
>>                              vid_to_data_oid(inode->data_vdi_id[i], i),
>>                              nr_copies);
>>
>> @@ -448,7 +448,7 @@ static void delete_one(struct work *work
>>        }
>>
>>        if (dw->delete_error) {
>> -               write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
>> +               write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
>>                             (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0);
>>        }
>>
>> @@ -470,6 +470,7 @@ static void delete_one_done(struct work
>>
>>        list_del(&dw->dw_siblings);
>>
>> +       put_vnode_info(dw->vnode_info);
>>        free(dw->buf);
>>        free(dw);
>>
>> @@ -489,7 +490,7 @@ static int fill_vdi_list(struct deletion
>>        uint32_t vid;
>>        int nr_copies;
>>
>> -       nr_copies = get_nr_copies(&dw->vnode_info);
>> +       nr_copies = get_nr_copies(dw->vnode_info);
>>
>>        inode = malloc(SD_INODE_HEADER_SIZE);
>>        if (!inode) {
>> @@ -500,7 +501,7 @@ static int fill_vdi_list(struct deletion
>>        dw->buf[dw->count++] = root_vid;
>>  again:
>>        vid = dw->buf[done++];
>> -       ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid),
>> +       ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid),
>>                          (char *)inode, SD_INODE_HEADER_SIZE, 0,
>>                          nr_copies);
>>
>> @@ -614,18 +615,12 @@ int del_vdi(struct vnode_info *vnode_inf
>>        dw->count = 0;
>>        dw->vid = *vid;
>>        dw->epoch = epoch;
>> +       dw->vnode_info = grab_vnode_info(vnode_info);
>>
>>        dw->work.fn = delete_one;
>>        dw->work.done = delete_one_done;
>>
>> -       /*
>> -        * Unfortunately we have to copy over the vnode_info array here,
>> -        * as we can't grab additional references to it outside the main
>> -        * thread.
>> -        */
>> -       memcpy(&dw->vnode_info, vnode_info, sizeof(*vnode_info));
>> -
>> -       root_vid = get_vdi_root(&dw->vnode_info, dw->epoch, dw->vid);
>> +       root_vid = get_vdi_root(dw->vnode_info, dw->epoch, dw->vid);
>>        if (!root_vid) {
>>                ret = SD_RES_EIO;
>>                goto out_free_buf;
>> --
>> sheepdog mailing list
>> sheepdog at lists.wpkg.org
>> http://lists.wpkg.org/mailman/listinfo/sheepdog
>
>
>
> --
> Yunkai Zhang
> Work at Taobao



-- 
Yunkai Zhang
Work at Taobao



More information about the sheepdog mailing list