[sheepdog] [Sheepdog] [PATCH 0/4] fix a race when multiple sheep join a cluster very quickly
Yunkai Zhang
yunkai.me at gmail.com
Thu May 17 05:05:49 CEST 2012
On Thu, May 17, 2012 at 10:43 AM, Yunkai Zhang <yunkai.me at gmail.com> wrote:
> On Thu, May 17, 2012 at 1:20 AM, Christoph Hellwig <hch at infradead.org> wrote:
>> On Wed, May 16, 2012 at 10:49:44AM -0400, Christoph Hellwig wrote:
>>> On Wed, May 16, 2012 at 10:44:14PM +0800, Yunkai Zhang wrote:
>>> > Hi Hellwig, Do you have any comments on my latest unregister patch
>>> > which have been updated to reflect your previous comments?
>>>
>>> I like it, but I ran into an issue where a VDI created on one
>>> nodes doesn't seem to to be found on another node created later.
>>
>> The issues was that sys_stat_ok is now always true in __sd_join. The
I'll try to fix it.
>> patch below records the state before we joined and stores it in the
>> join work to restore the old behavior. It also filters the joining node
>> out to match the old behaviour, just as your earlier versions of the
>> patches did. The patch passes all my testing, but it still introduces
>> the problem with I/O concurrent to epoch/node list changes vs recovery
>> that Kazutaka mentioned in his recent mail, so it will need more work.
>>
>> Also note that it is based on top of my three patches as my working tree
>> has them included.
>>
>>
>> diff --git a/sheep/group.c b/sheep/group.c
>> index 6921f5e..8c684f7 100644
>> --- a/sheep/group.c
>> +++ b/sheep/group.c
>> @@ -9,6 +9,7 @@
>> * along with this program. If not, see <http://www.gnu.org/licenses/>.
>> */
>> #include <assert.h>
>> +#include <stdbool.h>
>> #include <stdio.h>
>> #include <stdlib.h>
>> #include <unistd.h>
>> @@ -78,6 +79,7 @@ struct work_join {
>> struct sd_node *member_list;
>> size_t member_list_entries;
>> struct sd_node joined;
>> + bool get_vdi_bitmap;
>>
>> struct join_message *jm;
>> };
>> @@ -722,19 +724,15 @@ static int check_majority(struct sd_node *nodes, int nr_nodes)
>> static void __sd_join(struct event_struct *cevent)
>> {
>> struct work_join *w = container_of(cevent, struct work_join, cev);
>> - struct join_message *msg = w->jm;
>> int i;
>>
>> - if (msg->cluster_status != SD_STATUS_OK &&
>> - msg->cluster_status != SD_STATUS_HALT)
>> - return;
>> -
>> - if (sys_stat_ok())
>> + if (!w->get_vdi_bitmap)
>> return;
>>
>> for (i = 0; i < w->member_list_entries; i++) {
>> -
>> - get_vdi_bitmap_from(w->member_list + i);
>> + if (node_eq(&w->member_list[i], &w->joined))
>> + continue;
>
> Hi Hellwig, it seems that this is not my latest unregister patch. I
> have removed this node_eq() condition in it.
Sorry, I have tested your case with latest unregister patch, it also
has the some bug.
>
>> + get_vdi_bitmap_from(&w->member_list[i]);
>>
>> /*
>> * If a new comer try to join the running cluster, it only
>> @@ -867,11 +865,6 @@ static void __sd_join_done(struct event_struct *cevent)
>>
>> print_node_list(sys->nodes, sys->nr_nodes);
>>
>> - if (!sys_stat_join_failed()) {
>> - update_cluster_info(jm, &w->joined, w->member_list,
>> - w->member_list_entries);
>> - }
>> -
>> if (sys_can_recover() && jm->inc_epoch) {
>> list_for_each_entry_safe(node, t, &sys->leave_list, list) {
>> list_del(&node->list);
>> @@ -891,17 +884,8 @@ static void __sd_join_done(struct event_struct *cevent)
>>
>> static void __sd_leave_done(struct event_struct *cevent)
>> {
>> - struct work_leave *w = container_of(cevent, struct work_leave, cev);
>> -
>> - update_node_info(w->member_list, w->member_list_entries);
>> -
>> - if (sys_can_recover()) {
>> - sys->epoch++;
>> - update_epoch_store(sys->epoch);
>> - update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
>> -
>> + if (sys_can_recover())
>> start_recovery(sys->epoch);
>> - }
>>
>> if (sys_can_halt()) {
>> if (current_vnode_info->nr_zones < sys->nr_copies)
>> @@ -965,7 +949,6 @@ static void event_fn(struct work *work)
>> static void event_done(struct work *work)
>> {
>> struct event_struct *cevent;
>> - int ret;
>>
>> if (!sys->cur_cevent)
>> vprintf(SDOG_ERR, "bug\n");
>> @@ -992,9 +975,6 @@ static void event_done(struct work *work)
>> vprintf(SDOG_DEBUG, "free %p\n", cevent);
>> event_free(cevent);
>> event_running = 0;
>> - ret = register_event(cdrv_fd, group_handler, NULL);
>> - if (ret)
>> - panic("failed to register event fd");
>>
>> process_request_event_queues();
>> }
>> @@ -1106,7 +1086,6 @@ static inline void process_event_queue(void)
>> event_work.fn = event_fn;
>> event_work.done = event_done;
>>
>> - unregister_event(cdrv_fd);
>> queue_work(sys->event_wqueue, &event_work);
>> }
>>
>> @@ -1128,7 +1107,7 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>> int i, size;
>> int nr, nr_local, nr_leave;
>> struct node *n;
>> - struct join_message *jm;
>> + struct join_message *jm = opaque;
>> uint32_t le = get_latest_epoch();
>>
>> if (node_eq(joined, &sys->this_node)) {
>> @@ -1171,11 +1150,24 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>>
>> w->joined = *joined;
>>
>> + /*
>> + * Only get the VDI list if we were waiting for a format or
>> + * join before, and the join was successful.
>> + */
>> + if (sys_stat_ok() ||
>> + (jm->cluster_status != SD_STATUS_OK &&
>> + jm->cluster_status != SD_STATUS_HALT))
>> + w->get_vdi_bitmap = false;
>> + else
>> + w->get_vdi_bitmap = true;
>> +
>> size = get_join_message_size(opaque);
>> w->jm = zalloc(size);
>> if (!w->jm)
>> panic("failed to allocate memory\n");
>> memcpy(w->jm, opaque, size);
>> +
>> + update_cluster_info(jm, joined, members, nr_members);
>>
>> list_add_tail(&cevent->event_list, &sys->event_queue);
>> process_request_event_queues();
>> @@ -1210,7 +1202,6 @@ void sd_join_handler(struct sd_node *joined, struct sd_node *members,
>> }
>> break;
>> case CJ_RES_MASTER_TRANSFER:
>> - jm = (struct join_message *)opaque;
>> nr = jm->nr_leave_nodes;
>> for (i = 0; i < nr; i++) {
>> if (find_entry_list(&jm->leave_nodes[i], &sys->leave_list)
>> @@ -1268,6 +1259,14 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members,
>> if (sys_stat_shutdown())
>> return;
>>
>> + update_node_info(w->member_list, w->member_list_entries);
>> +
>> + if (sys_can_recover()) {
>> + sys->epoch++;
>> + update_epoch_store(sys->epoch);
>> + update_epoch_log(sys->epoch, sys->nodes, sys->nr_nodes);
>> + }
>> +
>> w = zalloc(sizeof(*w));
>> if (!w)
>> goto oom;
>
>
>
> --
> Yunkai Zhang
> Work at Taobao
--
Yunkai Zhang
Work at Taobao
More information about the sheepdog
mailing list