[Sheepdog] [PACH, RFC] sheep: use atomic for the vnode_info refcount outside the main thread
Yunkai Zhang
yunkai.me at gmail.com
Mon May 7 18:04:09 CEST 2012
On Mon, May 7, 2012 at 11:33 PM, Christoph Hellwig <hch at infradead.org> wrote:
> This is the minimally tested patch to use atomics to refcount
> the vnode_info structure. Because the GCC built-in are too ugly to live
> I added a minimal Linux Kernel style atomic.h wrapper around them. this
> was thrown together really quickly, so careful review is welcome, as
> are pointers to readily available userspace atomic_t implementations.
>
> Note that we still never grab the first reference to vnode_info
> from a worker thread, and always put references from the main thread,
> which simplifies the concurrency issues greatly.
>
>
> ---
> include/atomic.h | 31 +++++++++++++++++++++++++++++++
> sheep/farm/trunk.c | 2 +-
> sheep/group.c | 50 +++++++++++++++++++++++++++++++++++++++++---------
> sheep/ops.c | 10 ++++++----
> sheep/sdnet.c | 10 ++++++----
> sheep/sheep_priv.h | 19 ++++---------------
> sheep/vdi.c | 31 +++++++++++++------------------
> 7 files changed, 102 insertions(+), 51 deletions(-)
>
> Index: sheepdog/include/atomic.h
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ sheepdog/include/atomic.h 2012-05-07 17:13:46.781406382 +0200
> @@ -0,0 +1,31 @@
> +#ifndef _SHEEPDOG_ATOMIC_H
> +#define _SHEEPDOG_ATOMIC_H
> +
> +#include <stdbool.h>
> +
> +typedef int atomic_t;
> +
> +static inline void atomic_init(atomic_t *v, int val)
> +{
> + *v = val;
> + __sync_synchronize();
> +}
> +
> +static inline int atomic_read(atomic_t *v)
> +{
> + return *v;
You can't directly read it! You should use __sync_xxx API to
read/update in any time.
Use __sync_add_and_fetch(v, 0) to read it here.
> +}
> +
> +static inline void atomic_inc(atomic_t *v)
> +{
> + __sync_add_and_fetch(v, 1);
> +}
I suggests use this form:
static inline atomic_t *atomic_inc(atomic_t *v)
{
return __sync_add_and_fetch(v, 1);
}
so that we can get the added result directly -- needn't to call
atomic_read() again.
I had tried to wrap __sync_xxx API some time ago, because I used them
in zookeeper driver.
Zookeeper driver three APIs like these:
atomic_t *atomic_inc(atomic_t *v)
{
return __sync_add_and_fetch(v, 1);
}
atomic_t *atomic_dec(atomic_t *v)
{
return __sync_sub_and_fetch(v, 1);
}
atomic_t *atomic_read(atomic_t *v)
{
return __sync_add_and_fetch(v, 0);
}
> +
> +static inline bool atomic_dec_and_test(atomic_t *v)
> +{
> + if (__sync_sub_and_fetch(v, 1) == 0)
> + return true;
> + return false;
> +}
> +
> +#endif /* _SHEEPDOG_ATOMIC_H */
> Index: sheepdog/sheep/farm/trunk.c
> ===================================================================
> --- sheepdog.orig/sheep/farm/trunk.c 2012-05-06 23:21:16.537321808 +0200
> +++ sheepdog/sheep/farm/trunk.c 2012-05-07 17:10:15.005400960 +0200
> @@ -240,7 +240,7 @@ static int oid_stale(uint64_t oid)
> struct sd_vnode *v;
> int ret = 1;
>
> - vnodes = get_vnode_info();
> + vnodes = get_current_vnode_info();
> nr_copies = get_nr_copies(vnodes);
> for (i = 0; i < nr_copies; i++) {
> v = oid_to_vnode(vnodes, oid, i);
> Index: sheepdog/sheep/group.c
> ===================================================================
> --- sheepdog.orig/sheep/group.c 2012-05-07 10:59:08.769468543 +0200
> +++ sheepdog/sheep/group.c 2012-05-07 17:17:23.273411925 +0200
> @@ -9,6 +9,7 @@
> * along with this program. If not, see <http://www.gnu.org/licenses/>.
> */
> #include <assert.h>
> +#include <atomic.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> @@ -134,6 +135,13 @@ int get_zones_nr_from(struct sd_node *no
> return nr_zones;
> }
>
> +struct vnode_info {
> + struct sd_vnode entries[SD_MAX_VNODES];
> + int nr_vnodes;
> + int nr_zones;
> + atomic_t refcnt;
> +};
> +
> /*
> * If we have less zones available than the desired redundancy we have to do
> * with nr_zones copies, sorry.
> @@ -146,21 +154,44 @@ int get_max_nr_copies_from(struct sd_nod
> return min((int)sys->nr_copies, get_zones_nr_from(nodes, nr_nodes));
> }
>
> -struct vnode_info *get_vnode_info(void)
> +/*
> + * Grab an additional reference to the passed in vnode info.
> + *
> + * The caller must already hold a reference to vnode_info, this function must
> + * only be used to grab an additional reference from code that wants the
> + * vnode information to outlive the request structure.
> + */
> +struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info)
> +{
> + assert(atomic_read(&vnode_info->refcnt) > 0);
> +
> + atomic_inc(&vnode_info->refcnt);
> + return vnode_info;
> +}
> +
> +/*
> + * Release a reference to the current vnode information.
> + *
> + * Must be called from the main thread.
> + */
> +void put_vnode_info(struct vnode_info *vnode_info)
> {
> assert(is_main_thread());
> - assert(current_vnode_info);
>
> - current_vnode_info->refcnt++;
> - return current_vnode_info;
> + if (atomic_dec_and_test(&vnode_info->refcnt))
> + free(vnode_info);
> }
>
> -void put_vnode_info(struct vnode_info *vnodes)
> +/*
> + * Get a reference to the currently active vnode information structure,
> + * this must only be called from the main thread.
> + */
> +struct vnode_info *get_current_vnode_info(void)
> {
> assert(is_main_thread());
> + assert(current_vnode_info);
>
> - if (vnodes && --vnodes->refcnt == 0)
> - free(vnodes);
> + return grab_vnode_info(current_vnode_info);
> }
>
> struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
> @@ -187,9 +218,10 @@ static int update_vnode_info(void)
> vnode_info->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes,
> vnode_info->entries);
> vnode_info->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
> - vnode_info->refcnt = 1;
> + atomic_init(&vnode_info->refcnt, 1);
>
> - put_vnode_info(current_vnode_info);
> + if (current_vnode_info)
> + put_vnode_info(current_vnode_info);
> current_vnode_info = vnode_info;
> return 0;
> }
> Index: sheepdog/sheep/ops.c
> ===================================================================
> --- sheepdog.orig/sheep/ops.c 2012-05-07 10:58:00.177466788 +0200
> +++ sheepdog/sheep/ops.c 2012-05-07 17:17:01.573411373 +0200
> @@ -63,7 +63,7 @@ struct sd_op_template {
>
> struct flush_work {
> struct object_cache *cache;
> - struct vnode_info vnode_info;
> + struct vnode_info *vnode_info;
> struct work work;
> };
>
> @@ -582,7 +582,7 @@ static void flush_vdi_fn(struct work *wo
> struct flush_work *fw = container_of(work, struct flush_work, work);
>
> dprintf("flush vdi %"PRIx32"\n", fw->cache->vid);
> - if (object_cache_push(&fw->vnode_info, fw->cache) != SD_RES_SUCCESS)
> + if (object_cache_push(fw->vnode_info, fw->cache) != SD_RES_SUCCESS)
> eprintf("failed to flush vdi %"PRIx32"\n", fw->cache->vid);
> }
>
> @@ -590,6 +590,8 @@ static void flush_vdi_done(struct work *
> {
> struct flush_work *fw = container_of(work, struct flush_work, work);
> dprintf("flush vdi %"PRIx32" done\n", fw->cache->vid);
> +
> + put_vnode_info(fw->vnode_info);
> free(fw);
> }
>
> @@ -609,8 +611,8 @@ static int local_flush_vdi(struct reques
> fw->work.fn = flush_vdi_fn;
> fw->work.done = flush_vdi_done;
> fw->cache = cache;
> - memcpy(&fw->vnode_info, req->vnodes,
> - sizeof(fw->vnode_info));
> + fw->vnode_info = grab_vnode_info(req->vnodes);
> +
> queue_work(sys->flush_wqueue, &fw->work);
> }
> }
> Index: sheepdog/sheep/sdnet.c
> ===================================================================
> --- sheepdog.orig/sheep/sdnet.c 2012-05-07 10:54:39.669461654 +0200
> +++ sheepdog/sheep/sdnet.c 2012-05-07 17:20:04.521416055 +0200
> @@ -150,8 +150,9 @@ static void io_op_done(struct work *work
> retry:
> req->rq.epoch = sys->epoch;
>
> - put_vnode_info(req->vnodes);
> - req->vnodes = get_vnode_info();
> + if (req->vnodes)
> + put_vnode_info(req->vnodes);
> + req->vnodes = get_current_vnode_info();
> setup_access_to_local_objects(req);
> list_add_tail(&req->cev.event_list, &sys->request_queue);
>
> @@ -310,7 +311,7 @@ static void queue_request(struct request
> * called before we set up current_vnode_info
> */
> if (!is_force_op(req->op))
> - req->vnodes = get_vnode_info();
> + req->vnodes = get_current_vnode_info();
>
> if (is_io_op(req->op)) {
> req->work.fn = do_io_request;
> @@ -376,7 +377,8 @@ static void free_request(struct request
> sys->outstanding_data_size -= req->data_length;
>
> list_del(&req->r_siblings);
> - put_vnode_info(req->vnodes);
> + if (req->vnodes)
> + put_vnode_info(req->vnodes);
> free(req->data);
> free(req);
> }
> Index: sheepdog/sheep/sheep_priv.h
> ===================================================================
> --- sheepdog.orig/sheep/sheep_priv.h 2012-05-07 10:59:08.769468543 +0200
> +++ sheepdog/sheep/sheep_priv.h 2012-05-07 17:11:01.197402145 +0200
> @@ -63,19 +63,7 @@ struct client_info {
> int refcnt;
> };
>
> -/*
> - * This should be private in group.c, but the asynchronous VDI deletion
> - * prevents this currently.
> - */
> -struct vnode_info {
> - struct sd_vnode entries[SD_MAX_VNODES];
> - int nr_vnodes;
> - int nr_zones;
> - int refcnt;
> -};
> -
> -
> -struct request;
> +struct vnode_info;
>
> struct request {
> struct event_struct cev;
> @@ -264,8 +252,9 @@ int get_vdi_attr(struct vnode_info *vnod
> int excl, int delete);
>
> int get_zones_nr_from(struct sd_node *nodes, int nr_nodes);
> -struct vnode_info *get_vnode_info(void);
> -void put_vnode_info(struct vnode_info *vnodes);
> +struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
> +void put_vnode_info(struct vnode_info *vnode_info);
> +struct vnode_info *get_current_vnode_info(void);
>
> struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
> int copy_idx);
> Index: sheepdog/sheep/vdi.c
> ===================================================================
> --- sheepdog.orig/sheep/vdi.c 2012-05-07 10:56:03.861463810 +0200
> +++ sheepdog/sheep/vdi.c 2012-05-07 17:15:54.865409669 +0200
> @@ -354,7 +354,7 @@ struct deletion_work {
> int count;
> uint32_t *buf;
>
> - struct vnode_info vnode_info;
> + struct vnode_info *vnode_info;
> int delete_error;
> };
>
> @@ -372,9 +372,9 @@ static int delete_inode(struct deletion_
> goto out;
> }
>
> - nr_copies = get_nr_copies(&dw->vnode_info);
> + nr_copies = get_nr_copies(dw->vnode_info);
>
> - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
> + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
> (char *)inode, SD_INODE_HEADER_SIZE, 0, nr_copies);
> if (ret != SD_RES_SUCCESS) {
> ret = SD_RES_EIO;
> @@ -386,7 +386,7 @@ static int delete_inode(struct deletion_
> else
> memset(inode->name, 0, sizeof(inode->name));
>
> - ret = write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
> + ret = write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
> (char *)inode, SD_INODE_HEADER_SIZE, 0, 0,
> nr_copies, 0);
> if (ret != 0) {
> @@ -416,9 +416,9 @@ static void delete_one(struct work *work
> goto out;
> }
>
> - nr_copies = get_nr_copies(&dw->vnode_info);
> + nr_copies = get_nr_copies(dw->vnode_info);
>
> - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
> + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
> (void *)inode, sizeof(*inode),
> 0, nr_copies);
>
> @@ -437,7 +437,7 @@ static void delete_one(struct work *work
> continue;
> }
>
> - ret = remove_object(&dw->vnode_info, dw->epoch,
> + ret = remove_object(dw->vnode_info, dw->epoch,
> vid_to_data_oid(inode->data_vdi_id[i], i),
> nr_copies);
>
> @@ -448,7 +448,7 @@ static void delete_one(struct work *work
> }
>
> if (dw->delete_error) {
> - write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
> + write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
> (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0);
> }
>
> @@ -470,6 +470,7 @@ static void delete_one_done(struct work
>
> list_del(&dw->dw_siblings);
>
> + put_vnode_info(dw->vnode_info);
> free(dw->buf);
> free(dw);
>
> @@ -489,7 +490,7 @@ static int fill_vdi_list(struct deletion
> uint32_t vid;
> int nr_copies;
>
> - nr_copies = get_nr_copies(&dw->vnode_info);
> + nr_copies = get_nr_copies(dw->vnode_info);
>
> inode = malloc(SD_INODE_HEADER_SIZE);
> if (!inode) {
> @@ -500,7 +501,7 @@ static int fill_vdi_list(struct deletion
> dw->buf[dw->count++] = root_vid;
> again:
> vid = dw->buf[done++];
> - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid),
> + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid),
> (char *)inode, SD_INODE_HEADER_SIZE, 0,
> nr_copies);
>
> @@ -614,18 +615,12 @@ int del_vdi(struct vnode_info *vnode_inf
> dw->count = 0;
> dw->vid = *vid;
> dw->epoch = epoch;
> + dw->vnode_info = grab_vnode_info(vnode_info);
>
> dw->work.fn = delete_one;
> dw->work.done = delete_one_done;
>
> - /*
> - * Unfortunately we have to copy over the vnode_info array here,
> - * as we can't grab additional references to it outside the main
> - * thread.
> - */
> - memcpy(&dw->vnode_info, vnode_info, sizeof(*vnode_info));
> -
> - root_vid = get_vdi_root(&dw->vnode_info, dw->epoch, dw->vid);
> + root_vid = get_vdi_root(dw->vnode_info, dw->epoch, dw->vid);
> if (!root_vid) {
> ret = SD_RES_EIO;
> goto out_free_buf;
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
--
Yunkai Zhang
Work at Taobao
More information about the sheepdog
mailing list