[sheepdog] [Sheepdog] [PATCH 0/4] fix a race when multiple sheep join a cluster very quickly
Christoph Hellwig
hch at infradead.org
Wed May 16 21:47:36 CEST 2012
On Wed, May 16, 2012 at 01:20:23PM -0400, Christoph Hellwig wrote:
> On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote:
> > On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote:
> > > Hi Hellwig, Do you have any comments on my latest unregister patch
> > > which have been updated to reflect your previous comments?
> >
> > I like it, but I ran into an issue where a VDI created on one
> > nodes doesn't seem to to be found on another node created later.
>
> The issues was that sys_stat_ok is now always true in __sd_join. The
> patch below records the state before we joined and stores it in the
> join work to restore the old behavior. It also filters the joining node
> out to match the old behaviour, just as your earlier versions of the
> patches did. The patch passes all my testing, but it still introduces
> the problem with I/O concurrent to epoch/node list changes vs recovery
> that Kazutaka mentioned in his recent mail, so it will need more work.
>
> Also note that it is based on top of my three patches as my working tree
> has them included.
This one still had a stupid brown paperbag bug in the leave handler,
updated version below:
diff --git a/sheep/group.c b/sheep/group.c
index 6921f5e..beb10d8 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -9,6 +9,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -78,6 +79,7 @@ struct work_join {
struct sd_node *member_list;
size_t member_list_entries;
struct sd_node joined;
+ bool get_vdi_bitmap;
struct join_message *jm;
};
@@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes)
static void __sd_join(struct event_struct *cevent)
{
struct work_join *w = container_of(cevent, struct work_join, cev);
- struct join_message *msg = w->jm;
int i;
- if (msg->cluster_status != SD_STATUS_OK &&
- msg->cluster_status != SD_STATUS_HALT)
- return;
-
- if (sys_stat_ok())
+ if (!w->get_vdi_bitmap)
return;
for (i = 0; i < w->member_list_entries; i++) {
-
- get_vdi_bitmap_from(w->member_list + i);
+ if (node_eq(&w->member_list[i], &w->joined))
+ continue;
+ get_vdi_bitmap_from(&w->member_list[i]);
/*
* If a new comer try to join the running cluster, it only
@@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent)
print_node_list(sys->nodes, sys->nr_nodes);
- if (!sys_stat_join_failed()) {
- update_cluster_info(jm, &w->joined, w->member_list,
- w->member_list_entries);
- }
-
if (sys_can_recover() && jm->inc_epoch) {
list_for_each_entry_safe(node, t, &sys->leave_list, list) {
list_del(&node->list);
@@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent)
static void __sd_leave_done(struct event_struct *cevent)
{
- struct work_leave *w = container_of(cevent, struct work_leave, cev);
-
- update_node_info(w->member_list, w->member_list_entries);
-
- if (sys_can_recover()) {
- sys->epoch++;
- update_epoch_store(sys->epoch);
- update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
-
+ if (sys_can_recover())
start_recovery(sys->epoch);
- }
if (sys_can_halt()) {
if (current_vnode_info->nr_zones < sys->nr_copies)
@@ -965,7 +949,6 @@ static void event_fn(struct work *work)
static void event_done(struct work *work)
{
struct event_struct *cevent;
- int ret;
if (!sys->cur_cevent)
vprintf(SDOG_ERR, "bug\n");
@@ -992,9 +975,6 @@ static void event_done(struct work *work)
vprintf(SDOG_DEBUG, "free %p\n", cevent);
event_free(cevent);
event_running = 0;
- ret = register_event(cdrv_fd, group_handler, NULL);
- if (ret)
- panic("failed to register event fd");
process_request_event_queues();
}
@@ -1106,7 +1086,6 @@ static inline void process_event_queue(void)
event_work.fn = event_fn;
event_work.done = event_done;
- unregister_event(cdrv_fd);
queue_work(sys->event_wqueue, &event_work);
}
@@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
int i, size;
int nr, nr_local, nr_leave;
struct node *n;
- struct join_message *jm;
+ struct join_message *jm = opaque;
uint32_t le = get_latest_epoch();
if (node_eq(joined, &sys->this_node)) {
@@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
w->joined = *joined;
+ /*
+ * Only get the VDI list if we were waiting for a format or
+ * join before, and the join was successful.
+ */
+ if (sys_stat_ok() ||
+ (jm->cluster_status != SD_STATUS_OK &&
+ jm->cluster_status != SD_STATUS_HALT))
+ w->get_vdi_bitmap = false;
+ else
+ w->get_vdi_bitmap = true;
+
size = get_join_message_size(opaque);
w->jm = zalloc(size);
if (!w->jm)
panic("failed to allocate memory\n");
memcpy(w->jm, opaque, size);
+
+ update_cluster_info(jm, joined, members, nr_members);
list_add_tail(&cevent->event_list, &sys->event_queue);
process_request_event_queues();
@@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
}
break;
case CJ_RES_MASTER_TRANSFER:
- jm = (struct join_message *)opaque;
nr = jm->nr_leave_nodes;
for (i = 0; i < nr; i++) {
if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list)
@@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members,
if (sys_stat_shutdown())
return;
+ update_node_info(members, nr_members);
+
+ if (sys_can_recover()) {
+ sys->epoch++;
+ update_epoch_store(sys->epoch);
+ update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
+ }
+
w = zalloc(sizeof(*w));
if (!w)
goto oom;
More information about the sheepdog
mailing list