Currently the post-join event handler tries to contact one or more sheep to get the current vdi inuse bitmap. In addition to adding network round trips this has the problem that we need to properly block all vdi lookups until this work has completed, which doesn't work properly, as well as writing to the vdi bitmap from both the event worker thread and the main thread. The simplest fix (so simple that it removes 90 lines of code) is to add the vdi inuse bitmap to the join message, and simply use the bitmap from the join message in any newly joining node. The downside is that this increases the size of the join message by a large amount, which requires increasing the size of the event buffer. So far I've only tested this with the local driver as my test cluster is busy, but I'd love to get some feedback on this design. Signed-off-by: Christoph Hellwig <hch at lst.de> --- sheep/cluster.h | 2 - sheep/group.c | 103 +++----------------------------------------------------- 2 files changed, 7 insertions(+), 98 deletions(-) Index: sheepdog/sheep/group.c =================================================================== --- sheepdog.orig/sheep/group.c 2012-05-23 19:51:21.516139460 +0200 +++ sheepdog/sheep/group.c 2012-05-23 19:57:18.328148597 +0200 @@ -50,6 +50,7 @@ struct join_message { uint32_t result; uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */ uint8_t store[STORE_LEN]; + DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); union { struct sd_node nodes[0]; struct sd_node leave_nodes[0]; @@ -73,9 +74,6 @@ struct work_notify { struct work_join { struct event_struct cev; - - struct sd_node *member_list; - size_t member_list_entries; struct sd_node joined; struct join_message *jm; @@ -502,53 +500,6 @@ out: return ret; } -static int get_vdi_bitmap_from(struct sd_node *node) -{ - struct sd_req hdr; - struct sd_rsp *rsp = (struct sd_rsp *)&hdr; - static DECLARE_BITMAP(tmp_vdi_inuse, SD_NR_VDIS); - int fd, i, ret = SD_RES_SUCCESS; - unsigned int rlen, 
wlen; - char host[128]; - - if (is_myself(node->addr, node->port)) - goto out; - - addr_to_str(host, sizeof(host), node->addr, 0); - - fd = connect_to(host, node->port); - if (fd < 0) { - vprintf(SDOG_ERR, "unable to get the VDI bitmap from %s: %m\n", host); - ret = -SD_RES_EIO; - goto out; - } - - vprintf(SDOG_ERR, "%s:%d\n", host, node->port); - - memset(&hdr, 0, sizeof(hdr)); - hdr.opcode = SD_OP_READ_VDIS; - hdr.epoch = sys->epoch; - hdr.data_length = sizeof(tmp_vdi_inuse); - rlen = hdr.data_length; - wlen = 0; - - ret = exec_req(fd, &hdr, (char *)tmp_vdi_inuse, - &wlen, &rlen); - - close(fd); - - if (ret || rsp->result != SD_RES_SUCCESS) { - vprintf(SDOG_ERR, "unable to get the VDI bitmap (%d, %d)\n", ret, - rsp->result); - goto out; - } - - for (i = 0; i < ARRAY_SIZE(sys->vdi_inuse); i++) - sys->vdi_inuse[i] |= tmp_vdi_inuse[i]; -out: - return ret; -} - static void update_node_info(struct sd_node *nodes, size_t nr_nodes) { print_node_list(nodes, nr_nodes); @@ -664,10 +615,6 @@ static void update_cluster_info(struct j } } -static void __sd_notify(struct event_struct *cevent) -{ -} - static void __sd_notify_done(struct event_struct *cevent) { struct work_notify *w = container_of(cevent, struct work_notify, cev); @@ -765,35 +712,6 @@ static int check_majority(struct sd_node return 0; } -static void __sd_join(struct event_struct *cevent) -{ - struct work_join *w = container_of(cevent, struct work_join, cev); - struct join_message *msg = w->jm; - int i; - - if (msg->cluster_status != SD_STATUS_OK && - msg->cluster_status != SD_STATUS_HALT) - return; - - if (sys_stat_ok()) - return; - - for (i = 0; i < w->member_list_entries; i++) { - /* We should not fetch vdi_bitmap from myself */ - if (node_eq(w->member_list + i, &sys->this_node)) - continue; - - get_vdi_bitmap_from(w->member_list + i); - - /* - * If a new comer try to join the running cluster, it only - * need read one copy of bitmap from one of other members. 
- */ - if (sys_stat_wait_format()) - break; - } -} - static void __sd_leave(struct event_struct *cevent) { struct work_leave *w = container_of(cevent, struct work_leave, cev); @@ -851,6 +769,7 @@ enum cluster_join_result sd_check_join_c jm->cluster_flags = sys->flags; jm->ctime = get_cluster_ctime(); jm->nr_leave_nodes = 0; + memcpy(&jm->vdi_inuse, &sys->vdi_inuse, sizeof(sys->vdi_inuse)); if (sd_store) strcpy((char *)jm->store, sd_store->name); @@ -916,6 +835,9 @@ static void __sd_join_done(struct event_ print_node_list(sys->nodes, sys->nr_nodes); + if (node_eq(&w->joined, &sys->this_node)) + memcpy(&sys->vdi_inuse, &jm->vdi_inuse, sizeof(sys->vdi_inuse)); + sys_stat_set(jm->cluster_status); if (sys_can_recover() && jm->inc_epoch) { @@ -951,7 +873,6 @@ static void event_free(struct event_stru switch (cevent->ctype) { case EVENT_JOIN: { struct work_join *w = container_of(cevent, struct work_join, cev); - free(w->member_list); free(w->jm); free(w); break; @@ -986,14 +907,10 @@ static void event_fn(struct work *work) switch (cevent->ctype) { case EVENT_JOIN: - __sd_join(cevent); + case EVENT_NOTIFY: break; case EVENT_LEAVE: __sd_leave(cevent); - break; - case EVENT_NOTIFY: - __sd_notify(cevent); - break; default: vprintf(SDOG_ERR, "unknown event %d\n", cevent->ctype); } @@ -1194,14 +1111,6 @@ void sd_join_handler(struct sd_node *joi vprintf(SDOG_DEBUG, "allow new confchg %p\n", cevent); - size = sizeof(struct sd_node) * nr_members; - w->member_list = zalloc(size); - if (!w->member_list) - panic("failed to allocate memory"); - - memcpy(w->member_list, members, size); - w->member_list_entries = nr_members; - w->joined = *joined; size = get_join_message_size(opaque); Index: sheepdog/sheep/cluster.h =================================================================== --- sheepdog.orig/sheep/cluster.h 2012-05-23 19:51:21.516139460 +0200 +++ sheepdog/sheep/cluster.h 2012-05-23 19:54:34.952144414 +0200 @@ -23,7 +23,7 @@ #include "logger.h" /* maximum payload size sent in 
->notify and ->unblock */ -#define SD_MAX_EVENT_BUF_SIZE (64 * 1024) +#define SD_MAX_EVENT_BUF_SIZE (4096 * 1024) enum cluster_join_result { CJ_RES_SUCCESS, /* Success */ |