[sheepdog] [Sheepdog] [PATCH 0/4] fix a race when multiple sheep join a cluster very quickly
Yunkai Zhang
yunkai.me at gmail.com
Thu May 17 04:43:15 CEST 2012
On Thu, May 17, 2012 at 1:20 AM, Christoph Hellwig <hch at infradead.org> wrote:
> On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote:
>> On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote:
>> > Hi Hellwig, Do you have any comments on my latest unregister patch
>> > which have been updated to reflect your previous comments?
>>
>> I like it, but I ran into an issue where a VDI created on one
>> nodes doesn't seem to to be found on another node created later.
>
> The issues was that sys_stat_ok is now always true in __sd_join. The
> patch below records the state before we joined and stores it in the
> join work to restore the old behavior. It also filters the joining node
> out to match the old behaviour, just as your earlier versions of the
> patches did. The patch passes all my testing, but it still introduces
> the problem with I/O concurrent to epoch/node list changes vs recovery
> that Kazutaka mentioned in his recent mail, so it will need more work.
>
> Also note that it is based on top of my three patches as my working tree
> has them included.
>
>
> diff --git a/sheep/group.c b/sheep/group.c
> index 6921f5e..8c684f7 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -9,6 +9,7 @@
> * along with this program. If not, see <http://www.gnu.org/licenses/>.
> */
> #include <assert.h>
> +#include <stdbool.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> @@ -78,6 +79,7 @@ struct work_join {
> struct sd_node *member_list;
> size_t member_list_entries;
> struct sd_node joined;
> + bool get_vdi_bitmap;
>
> struct join_message *jm;
> };
> @@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes)
> static void __sd_join(struct event_struct *cevent)
> {
> struct work_join *w = container_of(cevent, struct work_join, cev);
> - struct join_message *msg = w->jm;
> int i;
>
> - if (msg->cluster_status != SD_STATUS_OK &&
> - msg->cluster_status != SD_STATUS_HALT)
> - return;
> -
> - if (sys_stat_ok())
> + if (!w->get_vdi_bitmap)
> return;
>
> for (i = 0; i < w->member_list_entries; i++) {
> -
> - get_vdi_bitmap_from(w->member_list + i);
> + if (node_eq(&w->member_list[i], &w->joined))
> + continue;
Hi Hellwig, it seems that this is not my latest unregister patch. I
have removed this node_eq() condition in it.
> + get_vdi_bitmap_from(&w->member_list[i]);
>
> /*
> * If a new comer try to join the running cluster, it only
> @@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent)
>
> print_node_list(sys->nodes, sys->nr_nodes);
>
> - if (!sys_stat_join_failed()) {
> - update_cluster_info(jm, &w->joined, w->member_list,
> - w->member_list_entries);
> - }
> -
> if (sys_can_recover() && jm->inc_epoch) {
> list_for_each_entry_safe(node, t, &sys->leave_list, list) {
> list_del(&node->list);
> @@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent)
>
> static void __sd_leave_done(struct event_struct *cevent)
> {
> - struct work_leave *w = container_of(cevent, struct work_leave, cev);
> -
> - update_node_info(w->member_list, w->member_list_entries);
> -
> - if (sys_can_recover()) {
> - sys->epoch++;
> - update_epoch_store(sys->epoch);
> - update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
> -
> + if (sys_can_recover())
> start_recovery(sys->epoch);
> - }
>
> if (sys_can_halt()) {
> if (current_vnode_info->nr_zones < sys->nr_copies)
> @@ -965,7 +949,6 @@ static void event_fn(struct work *work)
> static void event_done(struct work *work)
> {
> struct event_struct *cevent;
> - int ret;
>
> if (!sys->cur_cevent)
> vprintf(SDOG_ERR, "bug\n");
> @@ -992,9 +975,6 @@ static void event_done(struct work *work)
> vprintf(SDOG_DEBUG, "free %p\n", cevent);
> event_free(cevent);
> event_running = 0;
> - ret = register_event(cdrv_fd, group_handler, NULL);
> - if (ret)
> - panic("failed to register event fd");
>
> process_request_event_queues();
> }
> @@ -1106,7 +1086,6 @@ static inline void process_event_queue(void)
> event_work.fn = event_fn;
> event_work.done = event_done;
>
> - unregister_event(cdrv_fd);
> queue_work(sys->event_wqueue, &event_work);
> }
>
> @@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
> int i, size;
> int nr, nr_local, nr_leave;
> struct node *n;
> - struct join_message *jm;
> + struct join_message *jm = opaque;
> uint32_t le = get_latest_epoch();
>
> if (node_eq(joined, &sys->this_node)) {
> @@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>
> w->joined = *joined;
>
> + /*
> + * Only get the VDI list if we were waiting for a format or
> + * join before, and the join was successful.
> + */
> + if (sys_stat_ok() ||
> + (jm->cluster_status != SD_STATUS_OK &&
> + jm->cluster_status != SD_STATUS_HALT))
> + w->get_vdi_bitmap = false;
> + else
> + w->get_vdi_bitmap = true;
> +
> size = get_join_message_size(opaque);
> w->jm = zalloc(size);
> if (!w->jm)
> panic("failed to allocate memory\n");
> memcpy(w->jm, opaque, size);
> +
> + update_cluster_info(jm, joined, members, nr_members);
>
> list_add_tail(&cevent->event_list, &sys->event_queue);
> process_request_event_queues();
> @@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
> }
> break;
> case CJ_RES_MASTER_TRANSFER:
> - jm = (struct join_message *)opaque;
> nr = jm->nr_leave_nodes;
> for (i = 0; i < nr; i++) {
> if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list)
> @@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members,
> if (sys_stat_shutdown())
> return;
>
> + update_node_info(w->member_list, w->member_list_entries);
> +
> + if (sys_can_recover()) {
> + sys->epoch++;
> + update_epoch_store(sys->epoch);
> + update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
> + }
> +
> w = zalloc(sizeof(*w));
> if (!w)
> goto oom;
--
Yunkai Zhang
Work at Taobao
More information about the sheepdog
mailing list