On Tue, May 8, 2012 at 12:04 AM, Yunkai Zhang <yunkai.me at gmail.com> wrote: > On Mon, May 7, 2012 at 11:33 PM, Christoph Hellwig <hch at infradead.org> wrote: >> This is the minimally tested patch to use atomics to refcount >> the vnode_info structure. Because the GCC built-in are too ugly to live >> I added a minimal Linux Kernel style atomic.h wrapper around them. this >> was thrown together really quickly, so careful review is welcome, as >> are pointers to readily available userspace atomic_t implementations. >> >> Note that we still never grab the first reference to vnode_info >> from a worker thread, and always put references from the main thread, >> which simplifies the concurrency issues greatly. >> >> >> --- >> include/atomic.h | 31 +++++++++++++++++++++++++++++++ >> sheep/farm/trunk.c | 2 +- >> sheep/group.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- >> sheep/ops.c | 10 ++++++---- >> sheep/sdnet.c | 10 ++++++---- >> sheep/sheep_priv.h | 19 ++++--------------- >> sheep/vdi.c | 31 +++++++++++++------------------ >> 7 files changed, 102 insertions(+), 51 deletions(-) >> >> Index: sheepdog/include/atomic.h >> =================================================================== >> --- /dev/null 1970-01-01 00:00:00.000000000 +0000 >> +++ sheepdog/include/atomic.h 2012-05-07 17:13:46.781406382 +0200 >> @@ -0,0 +1,31 @@ >> +#ifndef _SHEEPDOG_ATOMIC_H >> +#define _SHEEPDOG_ATOMIC_H >> + >> +#include <stdbool.h> >> + >> +typedef int atomic_t; >> + >> +static inline void atomic_init(atomic_t *v, int val) >> +{ >> + *v = val; >> + __sync_synchronize(); >> +} >> + >> +static inline int atomic_read(atomic_t *v) >> +{ >> + return *v; > You can't directly read it! You should use __sync_xxx API to > read/update in any time. > > Use __sync_add_and_fetch(v, 0) to read it here. >> +} >> + >> +static inline void atomic_inc(atomic_t *v) >> +{ >> + __sync_add_and_fetch(v, 1); >> +} > I suggests use this form: > static inline atomic_t *atomic_inc(atomic_t *v) > { > return __sync_add_and_fetch(v, 1); > } > so that we can get the added result directly -- needn't to call > atomic_read() again. > > I had tried to wrap __sync_xxx API some time ago, because I used them > in zookeeper driver. > > Zookeeper driver three APIs like these: > atomic_t *atomic_inc(atomic_t *v) > { > return __sync_add_and_fetch(v, 1); > } > > atomic_t *atomic_dec(atomic_t *v) > { > return __sync_sub_and_fetch(v, 1); > } > > atomic_t *atomic_read(atomic_t *v) > { > return __sync_add_and_fetch(v, 0); > } Sorry, there were typos, let me correct them: atomic_t atomic_inc(atomic_t *v) { return __sync_add_and_fetch(v, 1); } atomic_t atomic_dec(atomic_t *v) { return __sync_sub_and_fetch(v, 1); } atomic_t atomic_read(atomic_t *v) { return __sync_add_and_fetch(v, 0); } > >> + >> +static inline bool atomic_dec_and_test(atomic_t *v) >> +{ >> + if (__sync_sub_and_fetch(v, 1) == 0) >> + return true; >> + return false; >> +} >> + >> +#endif /* _SHEEPDOG_ATOMIC_H */ >> Index: sheepdog/sheep/farm/trunk.c >> =================================================================== >> --- sheepdog.orig/sheep/farm/trunk.c 2012-05-06 23:21:16.537321808 +0200 >> +++ sheepdog/sheep/farm/trunk.c 2012-05-07 17:10:15.005400960 +0200 >> @@ -240,7 +240,7 @@ static int oid_stale(uint64_t oid) >> struct sd_vnode *v; >> int ret = 1; >> >> - vnodes = get_vnode_info(); >> + vnodes = get_current_vnode_info(); >> nr_copies = get_nr_copies(vnodes); >> for (i = 0; i < nr_copies; i++) { >> v = oid_to_vnode(vnodes, oid, i); >> Index: sheepdog/sheep/group.c >> =================================================================== >> --- sheepdog.orig/sheep/group.c 2012-05-07 10:59:08.769468543 +0200 >> +++ sheepdog/sheep/group.c 2012-05-07 17:17:23.273411925 +0200 >> @@ -9,6 +9,7 @@ >> * along with this program. If not, see <http://www.gnu.org/licenses/>. >> */ >> #include <assert.h> >> +#include <atomic.h> >> #include <stdio.h> >> #include <stdlib.h> >> #include <unistd.h> >> @@ -134,6 +135,13 @@ int get_zones_nr_from(struct sd_node *no >> return nr_zones; >> } >> >> +struct vnode_info { >> + struct sd_vnode entries[SD_MAX_VNODES]; >> + int nr_vnodes; >> + int nr_zones; >> + atomic_t refcnt; >> +}; >> + >> /* >> * If we have less zones available than the desired redundancy we have to do >> * with nr_zones copies, sorry. >> @@ -146,21 +154,44 @@ int get_max_nr_copies_from(struct sd_nod >> return min((int)sys->nr_copies, get_zones_nr_from(nodes, nr_nodes)); >> } >> >> -struct vnode_info *get_vnode_info(void) >> +/* >> + * Grab an additional reference to the passed in vnode info. >> + * >> + * The caller must already hold a reference to vnode_info, this function must >> + * only be used to grab an additional reference from code that wants the >> + * vnode information to outlive the request structure. >> + */ >> +struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info) >> +{ >> + assert(atomic_read(&vnode_info->refcnt) > 0); >> + >> + atomic_inc(&vnode_info->refcnt); >> + return vnode_info; >> +} >> + >> +/* >> + * Release a reference to the current vnode information. >> + * >> + * Must be called from the main thread. >> + */ >> +void put_vnode_info(struct vnode_info *vnode_info) >> { >> assert(is_main_thread()); >> - assert(current_vnode_info); >> >> - current_vnode_info->refcnt++; >> - return current_vnode_info; >> + if (atomic_dec_and_test(&vnode_info->refcnt)) >> + free(vnode_info); >> } >> >> -void put_vnode_info(struct vnode_info *vnodes) >> +/* >> + * Get a reference to the currently active vnode information structure, >> + * this must only be called from the main thread. >> + */ >> +struct vnode_info *get_current_vnode_info(void) >> { >> assert(is_main_thread()); >> + assert(current_vnode_info); >> >> - if (vnodes && --vnodes->refcnt == 0) >> - free(vnodes); >> + return grab_vnode_info(current_vnode_info); >> } >> >> struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid, >> @@ -187,9 +218,10 @@ static int update_vnode_info(void) >> vnode_info->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes, >> vnode_info->entries); >> vnode_info->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes); >> - vnode_info->refcnt = 1; >> + atomic_init(&vnode_info->refcnt, 1); >> >> - put_vnode_info(current_vnode_info); >> + if (current_vnode_info) >> + put_vnode_info(current_vnode_info); >> current_vnode_info = vnode_info; >> return 0; >> } >> Index: sheepdog/sheep/ops.c >> =================================================================== >> --- sheepdog.orig/sheep/ops.c 2012-05-07 10:58:00.177466788 +0200 >> +++ sheepdog/sheep/ops.c 2012-05-07 17:17:01.573411373 +0200 >> @@ -63,7 +63,7 @@ struct sd_op_template { >> >> struct flush_work { >> struct object_cache *cache; >> - struct vnode_info vnode_info; >> + struct vnode_info *vnode_info; >> struct work work; >> }; >> >> @@ -582,7 +582,7 @@ static void flush_vdi_fn(struct work *wo >> struct flush_work *fw = container_of(work, struct flush_work, work); >> >> dprintf("flush vdi %"PRIx32"\n", fw->cache->vid); >> - if (object_cache_push(&fw->vnode_info, fw->cache) != SD_RES_SUCCESS) >> + if (object_cache_push(fw->vnode_info, fw->cache) != SD_RES_SUCCESS) >> eprintf("failed to flush vdi %"PRIx32"\n", fw->cache->vid); >> } >> >> @@ -590,6 +590,8 @@ static void flush_vdi_done(struct work * >> { >> struct flush_work *fw = container_of(work, struct flush_work, work); >> dprintf("flush vdi %"PRIx32" done\n", fw->cache->vid); >> + >> + put_vnode_info(fw->vnode_info); >> free(fw); >> } >> >> @@ -609,8 +611,8 @@ static int local_flush_vdi(struct reques >> fw->work.fn = flush_vdi_fn; >> fw->work.done = flush_vdi_done; >> fw->cache = cache; >> - memcpy(&fw->vnode_info, req->vnodes, >> - sizeof(fw->vnode_info)); >> + fw->vnode_info = grab_vnode_info(req->vnodes); >> + >> queue_work(sys->flush_wqueue, &fw->work); >> } >> } >> Index: sheepdog/sheep/sdnet.c >> =================================================================== >> --- sheepdog.orig/sheep/sdnet.c 2012-05-07 10:54:39.669461654 +0200 >> +++ sheepdog/sheep/sdnet.c 2012-05-07 17:20:04.521416055 +0200 >> @@ -150,8 +150,9 @@ static void io_op_done(struct work *work >> retry: >> req->rq.epoch = sys->epoch; >> >> - put_vnode_info(req->vnodes); >> - req->vnodes = get_vnode_info(); >> + if (req->vnodes) >> + put_vnode_info(req->vnodes); >> + req->vnodes = get_current_vnode_info(); >> setup_access_to_local_objects(req); >> list_add_tail(&req->cev.event_list, &sys->request_queue); >> >> @@ -310,7 +311,7 @@ static void queue_request(struct request >> * called before we set up current_vnode_info >> */ >> if (!is_force_op(req->op)) >> - req->vnodes = get_vnode_info(); >> + req->vnodes = get_current_vnode_info(); >> >> if (is_io_op(req->op)) { >> req->work.fn = do_io_request; >> @@ -376,7 +377,8 @@ static void free_request(struct request >> sys->outstanding_data_size -= req->data_length; >> >> list_del(&req->r_siblings); >> - put_vnode_info(req->vnodes); >> + if (req->vnodes) >> + put_vnode_info(req->vnodes); >> free(req->data); >> free(req); >> } >> Index: sheepdog/sheep/sheep_priv.h >> =================================================================== >> --- sheepdog.orig/sheep/sheep_priv.h 2012-05-07 10:59:08.769468543 +0200 >> +++ sheepdog/sheep/sheep_priv.h 2012-05-07 17:11:01.197402145 +0200 >> @@ -63,19 +63,7 @@ struct client_info { >> int refcnt; >> }; >> >> -/* >> - * This should be private in group.c, but the asynchronous VDI deletion >> - * prevents this currently. >> - */ >> -struct vnode_info { >> - struct sd_vnode entries[SD_MAX_VNODES]; >> - int nr_vnodes; >> - int nr_zones; >> - int refcnt; >> -}; >> - >> - >> -struct request; >> +struct vnode_info; >> >> struct request { >> struct event_struct cev; >> @@ -264,8 +252,9 @@ int get_vdi_attr(struct vnode_info *vnod >> int excl, int delete); >> >> int get_zones_nr_from(struct sd_node *nodes, int nr_nodes); >> -struct vnode_info *get_vnode_info(void); >> -void put_vnode_info(struct vnode_info *vnodes); >> +struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info); >> +void put_vnode_info(struct vnode_info *vnode_info); >> +struct vnode_info *get_current_vnode_info(void); >> >> struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid, >> int copy_idx); >> Index: sheepdog/sheep/vdi.c >> =================================================================== >> --- sheepdog.orig/sheep/vdi.c 2012-05-07 10:56:03.861463810 +0200 >> +++ sheepdog/sheep/vdi.c 2012-05-07 17:15:54.865409669 +0200 >> @@ -354,7 +354,7 @@ struct deletion_work { >> int count; >> uint32_t *buf; >> >> - struct vnode_info vnode_info; >> + struct vnode_info *vnode_info; >> int delete_error; >> }; >> >> @@ -372,9 +372,9 @@ static int delete_inode(struct deletion_ >> goto out; >> } >> >> - nr_copies = get_nr_copies(&dw->vnode_info); >> + nr_copies = get_nr_copies(dw->vnode_info); >> >> - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid), >> + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid), >> (char *)inode, SD_INODE_HEADER_SIZE, 0, nr_copies); >> if (ret != SD_RES_SUCCESS) { >> ret = SD_RES_EIO; >> @@ -386,7 +386,7 @@ static int delete_inode(struct deletion_ >> else >> memset(inode->name, 0, sizeof(inode->name)); >> >> - ret = write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid), >> + ret = write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid), >> (char *)inode, SD_INODE_HEADER_SIZE, 0, 0, >> nr_copies, 0); >> if (ret != 0) { >> @@ -416,9 +416,9 @@ static void delete_one(struct work *work >> goto out; >> } >> >> - nr_copies = get_nr_copies(&dw->vnode_info); >> + nr_copies = get_nr_copies(dw->vnode_info); >> >> - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id), >> + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id), >> (void *)inode, sizeof(*inode), >> 0, nr_copies); >> >> @@ -437,7 +437,7 @@ static void delete_one(struct work *work >> continue; >> } >> >> - ret = remove_object(&dw->vnode_info, dw->epoch, >> + ret = remove_object(dw->vnode_info, dw->epoch, >> vid_to_data_oid(inode->data_vdi_id[i], i), >> nr_copies); >> >> @@ -448,7 +448,7 @@ static void delete_one(struct work *work >> } >> >> if (dw->delete_error) { >> - write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id), >> + write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id), >> (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0); >> } >> >> @@ -470,6 +470,7 @@ static void delete_one_done(struct work >> >> list_del(&dw->dw_siblings); >> >> + put_vnode_info(dw->vnode_info); >> free(dw->buf); >> free(dw); >> >> @@ -489,7 +490,7 @@ static int fill_vdi_list(struct deletion >> uint32_t vid; >> int nr_copies; >> >> - nr_copies = get_nr_copies(&dw->vnode_info); >> + nr_copies = get_nr_copies(dw->vnode_info); >> >> inode = malloc(SD_INODE_HEADER_SIZE); >> if (!inode) { >> @@ -500,7 +501,7 @@ static int fill_vdi_list(struct deletion >> dw->buf[dw->count++] = root_vid; >> again: >> vid = dw->buf[done++]; >> - ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid), >> + ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid), >> (char *)inode, SD_INODE_HEADER_SIZE, 0, >> nr_copies); >> >> @@ -614,18 +615,12 @@ int del_vdi(struct vnode_info *vnode_inf >> dw->count = 0; >> dw->vid = *vid; >> dw->epoch = epoch; >> + dw->vnode_info = grab_vnode_info(vnode_info); >> >> dw->work.fn = delete_one; >> dw->work.done = delete_one_done; >> >> - /* >> - * Unfortunately we have to copy over the vnode_info array here, >> - * as we can't grab additional references to it outside the main >> - * thread. >> - */ >> - memcpy(&dw->vnode_info, vnode_info, sizeof(*vnode_info)); >> - >> - root_vid = get_vdi_root(&dw->vnode_info, dw->epoch, dw->vid); >> + root_vid = get_vdi_root(dw->vnode_info, dw->epoch, dw->vid); >> if (!root_vid) { >> ret = SD_RES_EIO; >> goto out_free_buf; >> -- >> sheepdog mailing list >> sheepdog at lists.wpkg.org >> http://lists.wpkg.org/mailman/listinfo/sheepdog > > > > -- > Yunkai Zhang > Work at Taobao -- Yunkai Zhang Work at Taobao |