On Mon, May 7, 2012 at 11:33 PM, Christoph Hellwig <hch at infradead.org> wrote: > This is the minimally tested patch to use atomics to refcount > the vnode_info structure. Because the GCC built-in are too ugly to live > I added a minimal Linux Kernel style atomic.h wrapper around them. this > was thrown together really quickly, so careful review is welcome, as > are pointers to readily available userspace atomic_t implementations. > > Note that we still never grab the first reference to vnode_info > from a worker thread, and always put references from the main thread, > which simplifies the concurrency issues greatly. > > > --- > include/atomic.h | 31 +++++++++++++++++++++++++++++++ > sheep/farm/trunk.c | 2 +- > sheep/group.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- > sheep/ops.c | 10 ++++++---- > sheep/sdnet.c | 10 ++++++---- > sheep/sheep_priv.h | 19 ++++--------------- > sheep/vdi.c | 31 +++++++++++++------------------ > 7 files changed, 102 insertions(+), 51 deletions(-) > > Index: sheepdog/include/atomic.h > =================================================================== > --- /dev/null 1970-01-01 00:00:00.000000000 +0000 > +++ sheepdog/include/atomic.h 2012-05-07 17:13:46.781406382 +0200 > @@ -0,0 +1,31 @@ > +#ifndef _SHEEPDOG_ATOMIC_H > +#define _SHEEPDOG_ATOMIC_H > + > +#include <stdbool.h> > + > +typedef int atomic_t; > + > +static inline void atomic_init(atomic_t *v, int val) > +{ > + *v = val; > + __sync_synchronize(); > +} > + > +static inline int atomic_read(atomic_t *v) > +{ > + return *v; You can't directly read it! You should use __sync_xxx API to read/update in any time. Use __sync_add_and_fetch(v, 0) to read it here. > +} > + > +static inline void atomic_inc(atomic_t *v) > +{ > + __sync_add_and_fetch(v, 1); > +} I suggests use this form: static inline atomic_t *atomic_inc(atomic_t *v) { return __sync_add_and_fetch(v, 1); } so that we can get the added result directly -- needn't to call atomic_read() again. I had tried to wrap __sync_xxx API some time ago, because I used them in zookeeper driver. Zookeeper driver three APIs like these: atomic_t *atomic_inc(atomic_t *v) { return __sync_add_and_fetch(v, 1); } atomic_t *atomic_dec(atomic_t *v) { return __sync_sub_and_fetch(v, 1); } atomic_t *atomic_read(atomic_t *v) { return __sync_add_and_fetch(v, 0); } > + > +static inline bool atomic_dec_and_test(atomic_t *v) > +{ > + if (__sync_sub_and_fetch(v, 1) == 0) > + return true; > + return false; > +} > + > +#endif /* _SHEEPDOG_ATOMIC_H */ > Index: sheepdog/sheep/farm/trunk.c > =================================================================== > --- sheepdog.orig/sheep/farm/trunk.c 2012-05-06 23:21:16.537321808 +0200 > +++ sheepdog/sheep/farm/trunk.c 2012-05-07 17:10:15.005400960 +0200 > @@ -240,7 +240,7 @@ static int oid_stale(uint64_t oid) > struct sd_vnode *v; > int ret = 1; > > - vnodes = get_vnode_info(); > + vnodes = get_current_vnode_info(); > nr_copies = get_nr_copies(vnodes); > for (i = 0; i < nr_copies; i++) { > v = oid_to_vnode(vnodes, oid, i); > Index: sheepdog/sheep/group.c > =================================================================== > --- sheepdog.orig/sheep/group.c 2012-05-07 10:59:08.769468543 +0200 > +++ sheepdog/sheep/group.c 2012-05-07 17:17:23.273411925 +0200 > @@ -9,6 +9,7 @@ > * along with this program. If not, see <http://www.gnu.org/licenses/>. > */ > #include <assert.h> > +#include <atomic.h> > #include <stdio.h> > #include <stdlib.h> > #include <unistd.h> > @@ -134,6 +135,13 @@ int get_zones_nr_from(struct sd_node *no > return nr_zones; > } > > +struct vnode_info { > + struct sd_vnode entries[SD_MAX_VNODES]; > + int nr_vnodes; > + int nr_zones; > + atomic_t refcnt; > +}; > + > /* > * If we have less zones available than the desired redundancy we have to do > * with nr_zones copies, sorry. > @@ -146,21 +154,44 @@ int get_max_nr_copies_from(struct sd_nod > return min((int)sys->nr_copies, get_zones_nr_from(nodes, nr_nodes)); > } > > -struct vnode_info *get_vnode_info(void) > +/* > + * Grab an additional reference to the passed in vnode info. > + * > + * The caller must already hold a reference to vnode_info, this function must > + * only be used to grab an additional reference from code that wants the > + * vnode information to outlive the request structure. > + */ > +struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info) > +{ > + assert(atomic_read(&vnode_info->refcnt) > 0); > + > + atomic_inc(&vnode_info->refcnt); > + return vnode_info; > +} > + > +/* > + * Release a reference to the current vnode information. > + * > + * Must be called from the main thread. > + */ > +void put_vnode_info(struct vnode_info *vnode_info) > { > assert(is_main_thread()); > - assert(current_vnode_info); > > - current_vnode_info->refcnt++; > - return current_vnode_info; > + if (atomic_dec_and_test(&vnode_info->refcnt)) > + free(vnode_info); > } > > -void put_vnode_info(struct vnode_info *vnodes) > +/* > + * Get a reference to the currently active vnode information structure, > + * this must only be called from the main thread. > + */ > +struct vnode_info *get_current_vnode_info(void) > { > assert(is_main_thread()); > + assert(current_vnode_info); > > - if (vnodes && --vnodes->refcnt == 0) > - free(vnodes); > + return grab_vnode_info(current_vnode_info); > } > > struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid, > @@ -187,9 +218,10 @@ static int update_vnode_info(void) > vnode_info->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes, > vnode_info->entries); > vnode_info->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes); > - vnode_info->refcnt = 1; > + atomic_init(&vnode_info->refcnt, 1); > > - put_vnode_info(current_vnode_info); > + if (current_vnode_info) > + put_vnode_info(current_vnode_info); > current_vnode_info = vnode_info; > return 0; > } > Index: sheepdog/sheep/ops.c > =================================================================== > --- sheepdog.orig/sheep/ops.c 2012-05-07 10:58:00.177466788 +0200 > +++ sheepdog/sheep/ops.c 2012-05-07 17:17:01.573411373 +0200 > @@ -63,7 +63,7 @@ struct sd_op_template { > > struct flush_work { > struct object_cache *cache; > - struct vnode_info vnode_info; > + struct vnode_info *vnode_info; > struct work work; > }; > > @@ -582,7 +582,7 @@ static void flush_vdi_fn(struct work *wo > struct flush_work *fw = container_of(work, struct flush_work, work); > > dprintf("flush vdi %"PRIx32"\n", fw->cache->vid); > - if (object_cache_push(&fw->vnode_info, fw->cache) != SD_RES_SUCCESS) > + if (object_cache_push(fw->vnode_info, fw->cache) != SD_RES_SUCCESS) > eprintf("failed to flush vdi %"PRIx32"\n", fw->cache->vid); > } > > @@ -590,6 +590,8 @@ static void flush_vdi_done(struct work * > { > struct flush_work *fw = container_of(work, struct flush_work, work); > dprintf("flush vdi %"PRIx32" done\n", fw->cache->vid); > + > + put_vnode_info(fw->vnode_info); > free(fw); > } > > @@ -609,8 +611,8 @@ static int local_flush_vdi(struct reques > fw->work.fn = flush_vdi_fn; > fw->work.done = flush_vdi_done; > fw->cache = cache; > - memcpy(&fw->vnode_info, req->vnodes, > - sizeof(fw->vnode_info)); > + fw->vnode_info = grab_vnode_info(req->vnodes); > + > queue_work(sys->flush_wqueue, &fw->work); > } > } > Index: sheepdog/sheep/sdnet.c > =================================================================== > --- sheepdog.orig/sheep/sdnet.c 2012-05-07 10:54:39.669461654 +0200 > +++ sheepdog/sheep/sdnet.c 2012-05-07 17:20:04.521416055 +0200 > @@ -150,8 +150,9 @@ static void io_op_done(struct work *work > retry: > req->rq.epoch = sys->epoch; > > - put_vnode_info(req->vnodes); > - req->vnodes = get_vnode_info(); > + if (req->vnodes) > + put_vnode_info(req->vnodes); > + req->vnodes = get_current_vnode_info(); > setup_access_to_local_objects(req); > list_add_tail(&req->cev.event_list, &sys->request_queue); > > @@ -310,7 +311,7 @@ static void queue_request(struct request > * called before we set up current_vnode_info > */ > if (!is_force_op(req->op)) > - req->vnodes = get_vnode_info(); > + req->vnodes = get_current_vnode_info(); > > if (is_io_op(req->op)) { > req->work.fn = do_io_request; > @@ -376,7 +377,8 @@ static void free_request(struct request > sys->outstanding_data_size -= req->data_length; > > list_del(&req->r_siblings); > - put_vnode_info(req->vnodes); > + if (req->vnodes) > + put_vnode_info(req->vnodes); > free(req->data); > free(req); > } > Index: sheepdog/sheep/sheep_priv.h > =================================================================== > --- sheepdog.orig/sheep/sheep_priv.h 2012-05-07 10:59:08.769468543 +0200 > +++ sheepdog/sheep/sheep_priv.h 2012-05-07 17:11:01.197402145 +0200 > @@ -63,19 +63,7 @@ struct client_info { > int refcnt; > }; > > -/* > - * This should be private in group.c, but the asynchronous VDI deletion > - * prevents this currently. > - */ > -struct vnode_info { > - struct sd_vnode entries[SD_MAX_VNODES]; > - int nr_vnodes; > - int nr_zones; > - int refcnt; > -}; > - > - > -struct request; > +struct vnode_info; > > struct request { > struct event_struct cev; > @@ -264,8 +252,9 @@ int get_vdi_attr(struct vnode_info *vnod > int excl, int delete); > > int get_zones_nr_from(struct sd_node *nodes, int nr_nodes); > -struct vnode_info *get_vnode_info(void); > -void put_vnode_info(struct vnode_info *vnodes); > +struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info); > +void put_vnode_info(struct vnode_info *vnode_info); > +struct vnode_info *get_current_vnode_info(void); > > struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid, > int copy_idx); > Index: sheepdog/sheep/vdi.c > =================================================================== > --- sheepdog.orig/sheep/vdi.c 2012-05-07 10:56:03.861463810 +0200 > +++ sheepdog/sheep/vdi.c 2012-05-07 17:15:54.865409669 +0200 > @@ -354,7 +354,7 @@ struct deletion_work { > int count; > uint32_t *buf; > > - struct vnode_info vnode_info; > + struct vnode_info *vnode_info; > int delete_error; > }; > > @@ -372,9 +372,9 @@ static int delete_inode(struct deletion_ > goto out; > } > > - nr_copies = get_nr_copies(&dw->vnode_info); > + nr_copies = get_nr_copies(dw->vnode_info); > > - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid), > + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid), > (char *)inode, SD_INODE_HEADER_SIZE, 0, nr_copies); > if (ret != SD_RES_SUCCESS) { > ret = SD_RES_EIO; > @@ -386,7 +386,7 @@ static int delete_inode(struct deletion_ > else > memset(inode->name, 0, sizeof(inode->name)); > > - ret = write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid), > + ret = write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid), > (char *)inode, SD_INODE_HEADER_SIZE, 0, 0, > nr_copies, 0); > if (ret != 0) { > @@ -416,9 +416,9 @@ static void delete_one(struct work *work > goto out; > } > > - nr_copies = get_nr_copies(&dw->vnode_info); > + nr_copies = get_nr_copies(dw->vnode_info); > > - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id), > + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id), > (void *)inode, sizeof(*inode), > 0, nr_copies); > > @@ -437,7 +437,7 @@ static void delete_one(struct work *work > continue; > } > > - ret = remove_object(&dw->vnode_info, dw->epoch, > + ret = remove_object(dw->vnode_info, dw->epoch, > vid_to_data_oid(inode->data_vdi_id[i], i), > nr_copies); > > @@ -448,7 +448,7 @@ static void delete_one(struct work *work > } > > if (dw->delete_error) { > - write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id), > + write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id), > (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0); > } > > @@ -470,6 +470,7 @@ static void delete_one_done(struct work > > list_del(&dw->dw_siblings); > > + put_vnode_info(dw->vnode_info); > free(dw->buf); > free(dw); > > @@ -489,7 +490,7 @@ static int fill_vdi_list(struct deletion > uint32_t vid; > int nr_copies; > > - nr_copies = get_nr_copies(&dw->vnode_info); > + nr_copies = get_nr_copies(dw->vnode_info); > > inode = malloc(SD_INODE_HEADER_SIZE); > if (!inode) { > @@ -500,7 +501,7 @@ static int fill_vdi_list(struct deletion > dw->buf[dw->count++] = root_vid; > again: > vid = dw->buf[done++]; > - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid), > + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid), > (char *)inode, SD_INODE_HEADER_SIZE, 0, > nr_copies); > > @@ -614,18 +615,12 @@ int del_vdi(struct vnode_info *vnode_inf > dw->count = 0; > dw->vid = *vid; > dw->epoch = epoch; > + dw->vnode_info = grab_vnode_info(vnode_info); > > dw->work.fn = delete_one; > dw->work.done = delete_one_done; > > - /* > - * Unfortunately we have to copy over the vnode_info array here, > - * as we can't grab additional references to it outside the main > - * thread. > - */ > - memcpy(&dw->vnode_info, vnode_info, sizeof(*vnode_info)); > - > - root_vid = get_vdi_root(&dw->vnode_info, dw->epoch, dw->vid); > + root_vid = get_vdi_root(dw->vnode_info, dw->epoch, dw->vid); > if (!root_vid) { > ret = SD_RES_EIO; > goto out_free_buf; > -- > sheepdog mailing list > sheepdog at lists.wpkg.org > http://lists.wpkg.org/mailman/listinfo/sheepdog -- Yunkai Zhang Work at Taobao |