On 05/24/2012 11:10 PM, Christoph Hellwig wrote: > With the current sheepdog code a test case like the following pseudocode > fails: > > for host in 1..4: > for shep in 1..5: > sheep > > collie cluster format --copies=2 > collie vdi create test-vdi 100M > collie vdi write test vdi <100M of random data> > > for host in 1: > for shep in 1..5: > kill sheep > > sleep 60 # wait for recovery to finish > > for host in 2: > for shep in 1..5: > kill sheep > > sleep 60 # wait for recovery to finish > > collie vdi write test-vdi -a host3 <any data> > > > fails because some objects are only replicated once, not twice. Debugging > showed that the problem was that the SD_OP_GET_OBJ_LIST command for some > of the remaining sheep did not return the full object list. > > Reverting the object list cached fixed the bug, and when I noticed that > the buffer scheme recently introduced could be applied directly to the > farm trunk active list I decided to go down that route instead of debugging > the object cache more. I can't see how the additional rbtree can provide > better performance than just using the farm data structures directly, > but I'm open for arguments. > > The other benefit is that almost 150 lines of code can be removed by this > patch, and I also could trivially check for too small buffers in the > SD_OP_GET_OBJ_LIST implementation. > > > Signed-off-by: Christoph Hellwig <hch at lst.de> > > diff --git a/sheep/Makefile.am b/sheep/Makefile.am > index 1cb2ebf..4ad5a76 100644 > --- a/sheep/Makefile.am > +++ b/sheep/Makefile.am > @@ -26,7 +26,7 @@ sbin_PROGRAMS = sheep > > sheep_SOURCES = sheep.c group.c sdnet.c gateway.c store.c vdi.c work.c \ > journal.c ops.c recovery.c cluster/local.c \ > - object_cache.c object_list_cache.c > + object_cache.c > > if BUILD_COROSYNC > sheep_SOURCES += cluster/corosync.c > diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c > index 1575d24..97e510b 100644 > --- a/sheep/farm/farm.c > +++ b/sheep/farm/farm.c > @@ -331,18 +331,11 @@ err: > return SD_RES_EIO; > } > > -static int farm_get_objlist(struct siocb *iocb) > +static int farm_get_objlist(uint64_t *list, uint32_t *length) > { > - uint64_t *objlist = (uint64_t *)iocb->buf; > - > - iocb->length = trunk_get_working_objlist(objlist); > - > - dprintf("%"PRIu32"\n", iocb->length); > - > - return SD_RES_SUCCESS; > + return trunk_get_working_objlist(list, length); > } > > - > static void *read_working_object(uint64_t oid, uint64_t offset, > uint32_t length) > { > diff --git a/sheep/farm/farm.h b/sheep/farm/farm.h > index cc829cd..75e0177 100644 > --- a/sheep/farm/farm.h > +++ b/sheep/farm/farm.h > @@ -67,7 +67,7 @@ extern int trunk_update_entry(uint64_t oid); > extern void trunk_reset(void); > extern void trunk_put_entry(uint64_t oid); > extern void trunk_get_entry(uint64_t oid); > -extern int trunk_get_working_objlist(uint64_t *list); > +extern int trunk_get_working_objlist(uint64_t *list, uint32_t *length); > > /* snap.c */ > extern int snap_init(void); > diff --git a/sheep/farm/trunk.c b/sheep/farm/trunk.c > index d47c31b..11b42a1 100644 > --- a/sheep/farm/trunk.c > +++ b/sheep/farm/trunk.c > @@ -34,6 +34,7 @@ > #define HASH_SIZE (1 << HASH_BITS) > > static LIST_HEAD(trunk_active_list); > +static int trunk_entry_active_list_version; > static pthread_mutex_t active_list_lock = PTHREAD_MUTEX_INITIALIZER; > static struct hlist_head trunk_hashtable[HASH_SIZE]; > static pthread_mutex_t hashtable_lock[HASH_SIZE] = { [0 ... HASH_SIZE - 1] = PTHREAD_MUTEX_INITIALIZER }; > @@ -72,6 +73,7 @@ static inline void get_entry(struct trunk_entry_incore *entry, struct hlist_head > hlist_add_head(&entry->hash, head); > pthread_mutex_lock(&active_list_lock); > list_add(&entry->active_list, &trunk_active_list); > + trunk_entry_active_list_version++; > trunk_entry_active_nr++; > pthread_mutex_unlock(&active_list_lock); > } > @@ -179,6 +181,7 @@ static inline void put_entry(struct trunk_entry_incore *entry) > > pthread_mutex_lock(&active_list_lock); > list_del(&entry->active_list); > + trunk_entry_active_list_version--; > trunk_entry_active_nr--; > pthread_mutex_unlock(&active_list_lock); > free(entry); > @@ -399,16 +402,50 @@ void trunk_reset(void) > "clean"); > } > > -int trunk_get_working_objlist(uint64_t *list) > +/* > + * Cache of the last get_objlist result. > + * > + * Right now these variables are protected by active_list_lock, but we could > + * add a separate mutex just for the objlist state if contention becomes a > + * problem. > + */ > +static uint64_t *trunc_objlist_buf; > +static uint32_t trunc_objlist_buf_len; > +static int trunk_objlist_version; > + > +int trunk_get_working_objlist(uint64_t *list, uint32_t *length) > { > - int nr = 0; > - struct trunk_entry_incore *entry; > + int i; > > pthread_mutex_lock(&active_list_lock); > - list_for_each_entry(entry, &trunk_active_list, active_list) { > - list[nr++] = entry->raw.oid; > + if (trunk_objlist_version != trunk_entry_active_list_version) { > + struct trunk_entry_incore *entry; > + int nr = 0; > + > + trunc_objlist_buf_len = > + trunk_entry_active_nr * sizeof(uint64_t); > + trunc_objlist_buf = > + xrealloc(trunc_objlist_buf, trunc_objlist_buf_len); > + > + list_for_each_entry(entry, &trunk_active_list, active_list) > + trunc_objlist_buf[nr++] = entry->raw.oid; > + } > + > + if (*length < trunc_objlist_buf_len) { > + eprintf("get_objlist buffer to small. Req: %d, list: %d\n", > + *length, trunc_objlist_buf_len); > + goto out_unlock; > } > + > + memcpy(list, trunc_objlist_buf, trunc_objlist_buf_len); > + *length = trunc_objlist_buf_len; > pthread_mutex_unlock(&active_list_lock); > > - return nr; > + dprintf("%ld\n", trunc_objlist_buf_len / sizeof(uint64_t)); > + for (i = 0; i < trunc_objlist_buf_len / sizeof(uint64_t); i++) > + dprintf("oid %"PRIx64"\n", trunc_objlist_buf[i]); > + return SD_RES_SUCCESS; > +out_unlock: > + pthread_mutex_unlock(&active_list_lock); > + return SD_RES_EIO; > } > diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c > index 0eb5223..e69de29 100644 > --- a/sheep/object_list_cache.c > +++ b/sheep/object_list_cache.c > @@ -1,164 +0,0 @@ > -/* > - * Copyright (C) 2012 Taobao Inc. > - * > - * Levin Li <xingke.lwp at taobao.com> > - * > - * This program is free software; you can redistribute it and/or > - * modify it under the terms of the GNU General Public License version > - * 2 as published by the Free Software Foundation. > - * > - * You should have received a copy of the GNU General Public License > - * along with this program. If not, see <http://www.gnu.org/licenses/>. > - */ > - > -#include <stdio.h> > -#include <stdlib.h> > -#include <unistd.h> > -#include <sys/types.h> > -#include <pthread.h> > - > -#include "sheep_priv.h" > -#include "strbuf.h" > -#include "util.h" > - > -struct objlist_cache_entry { > - uint64_t oid; > - struct rb_node node; > -}; > - > -struct objlist_cache obj_list_cache = { > - 1, /* tree_version */ > - 0, /* buf_version */ > - 0, /* cache_size */ > - STRBUF_INIT, /* buffer */ > - RB_ROOT, /* root */ > - }; > - > -int init_objlist_cache(void) > -{ > - int i; > - struct siocb iocb = { 0 }; > - uint64_t *buf; > - > - pthread_rwlock_init(&obj_list_cache.lock, NULL); > - > - if (sd_store) { > - buf = zalloc(1 << 22); > - if (!buf) { > - eprintf("no memory to allocate.\n"); > - return -1; > - } > - > - iocb.length = 0; > - iocb.buf = buf; > - sd_store->get_objlist(&iocb); > - > - for (i = 0; i < iocb.length; i++) > - check_and_insert_objlist_cache(buf[i]); > - > - free(buf); > - } > - > - return 0; > -} > - > -static struct objlist_cache_entry *objlist_cache_rb_insert(struct rb_root *root, > - struct objlist_cache_entry *new) > -{ > - struct rb_node **p = &root->rb_node; > - struct rb_node *parent = NULL; > - struct objlist_cache_entry *entry; > - > - while (*p) { > - parent = *p; > - entry = rb_entry(parent, struct objlist_cache_entry, node); > - > - if (new->oid < entry->oid) > - p = &(*p)->rb_left; > - else if (new->oid > entry->oid) > - p = &(*p)->rb_right; > - else > - return entry; /* already has this entry */ > - } > - rb_link_node(&new->node, parent, p); > - rb_insert_color(&new->node, root); > - > - return NULL; /* insert successfully */ > -} > - > -int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid) > -{ > - struct rb_node **p = &root->rb_node; > - struct rb_node *parent = NULL; > - struct objlist_cache_entry *entry; > - > - while (*p) { > - parent = *p; > - entry = rb_entry(parent, struct objlist_cache_entry, node); > - > - if (oid < entry->oid) > - p = &(*p)->rb_left; > - else if (oid > entry->oid) > - p = &(*p)->rb_right; > - else { > - rb_erase(parent, root); > - return 0; > - } > - } > - > - return -1; /* fail to remove */ > -} > - > -int check_and_insert_objlist_cache(uint64_t oid) > -{ > - struct objlist_cache_entry *entry, *p; > - > - entry = zalloc(sizeof(*entry)); > - > - if (!entry) { > - eprintf("no memory to allocate cache entry.\n"); > - return -1; > - } > - > - entry->oid = oid; > - rb_init_node(&entry->node); > - > - pthread_rwlock_wrlock(&obj_list_cache.lock); > - p = objlist_cache_rb_insert(&obj_list_cache.root, entry); > - if (p) > - free(entry); > - else { > - obj_list_cache.cache_size++; > - obj_list_cache.tree_version++; > - } > - pthread_rwlock_unlock(&obj_list_cache.lock); > - > - return 0; > -} > - > -int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *data) > -{ > - uint64_t *list = (uint64_t *)data; > - int nr = 0; > - struct objlist_cache_entry *entry; > - struct rb_node *p; > - > - rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t); > - > - pthread_rwlock_rdlock(&obj_list_cache.lock); > - if (obj_list_cache.tree_version == obj_list_cache.buf_version) > - memcpy(list, obj_list_cache.buffer.buf, rsp->data_length); > - else { > - for (p = rb_first(&obj_list_cache.root); p; p = rb_next(p)) { > - entry = rb_entry(p, struct objlist_cache_entry, node); > - list[nr++] = entry->oid; > - } > - > - strbuf_reset(&obj_list_cache.buffer); > - strbuf_add(&obj_list_cache.buffer, list, nr * sizeof(uint64_t)); > - obj_list_cache.buf_version = obj_list_cache.tree_version; > - } > - pthread_rwlock_unlock(&obj_list_cache.lock); > - > - return SD_RES_SUCCESS; > -} > diff --git a/sheep/ops.c b/sheep/ops.c > index b63955c..b2ea387 100644 > --- a/sheep/ops.c > +++ b/sheep/ops.c > @@ -428,8 +428,18 @@ static int local_kill_node(struct request *req) > > static int local_get_obj_list(struct request *req) > { > - return get_obj_list((const struct sd_list_req *)&req->rq, > - (struct sd_list_rsp *)&req->rp, req->data); > + uint32_t length; > + int ret; > + > + length = req->rq.data_length; > + ret = sd_store->get_objlist(req->data, &length); > + if (ret != 0) { > + eprintf("get_objlist failed: %d\n", ret); > + return ret; > + } > + > + req->rp.data_length = length; > + return SD_RES_SUCCESS; > } > > static int local_get_epoch(struct request *req) > @@ -724,12 +734,6 @@ static int store_remove_obj(struct request *req) > eprintf("%m\n"); > ret = SD_RES_EIO; > } > - pthread_rwlock_wrlock(&obj_list_cache.lock); > - if (!objlist_cache_rb_remove(&obj_list_cache.root, oid)) { > - obj_list_cache.cache_size--; > - obj_list_cache.tree_version++; > - } > - pthread_rwlock_unlock(&obj_list_cache.lock); > out: > strbuf_release(&buf); > return ret; > @@ -848,8 +852,6 @@ static int store_create_and_write_obj(struct request *req) > } else > ret = do_write_obj(&iocb, hdr, epoch, req->data, 1); > > - if (SD_RES_SUCCESS == ret) > - check_and_insert_objlist_cache(oid); > out: > if (buf) > free(buf); > diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h > index 3d5a964..7937b11 100644 > --- a/sheep/sheep_priv.h > +++ b/sheep/sheep_priv.h > @@ -173,7 +173,7 @@ struct store_driver { > int (*read)(uint64_t oid, struct siocb *); > int (*format)(struct siocb *); > /* Operations in recovery */ > - int (*get_objlist)(struct siocb *); > + int (*get_objlist)(uint64_t *list, uint32_t *length); > int (*link)(uint64_t oid, struct siocb *, uint32_t tgt_epoch); > int (*atomic_put)(uint64_t oid, struct siocb *); > int (*begin_recover)(struct siocb *); > @@ -207,7 +207,7 @@ struct objlist_cache { > int tree_version; > int buf_version; > int cache_size; > - struct strbuf buffer; > + uint64_t *oids; > struct rb_root root; > pthread_rwlock_t lock; > }; > @@ -220,7 +220,6 @@ extern char *jrnl_path; > extern char *epoch_path; > extern mode_t def_fmode; > extern mode_t def_dmode; > -extern struct objlist_cache obj_list_cache; > > /* One should call this function to get sys->epoch outside main thread */ > static inline uint32_t sys_epoch(void) > @@ -320,10 +319,6 @@ int get_sheep_fd(uint8_t *addr, uint16_t port, int node_idx, uint32_t epoch); > > int prealloc(int fd, uint32_t size); > > -int init_objlist_cache(void); > -int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid); > -int check_and_insert_objlist_cache(uint64_t oid); > - > void req_done(struct request *req); > > /* Operations */ > diff --git a/sheep/store.c b/sheep/store.c > index 3030adb..00f8064 100644 > --- a/sheep/store.c > +++ b/sheep/store.c > @@ -503,10 +503,6 @@ int init_store(const char *d, int enable_write_cache) > if (ret) > return ret; > > - ret = init_objlist_cache(); > - if (ret) > - return ret; > - > if (enable_write_cache) { > sys->enable_write_cache = 1; > ret = object_cache_init(d); Object list cache has been involved since we have simple store, which does not have a list or others to keep the full object list. Though we removed simple store, I suggest to keep object list cache for good of future consideration, maybe there would be another store driver involved into sheepdog which doesn't keep a global object list as simple store. Another problem as you mentioned, the object should not be removed from the list before all the nodes finish recovering, except that we delete the VDI, farm can not do that currently, because we need to clear the active_list after recovery complete. thanks, levin |