[Sheepdog] [RFC PATCH 1/2] sheep: remove vdi lock feature

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Tue Dec 28 09:34:14 CET 2010


The vdi lock feature causes the following problems:

 - To support the lock feature, all sheepdog nodes have a list of
   locked images in memory.  When nodes are newly joined to the
   cluster, Sheepdog sends the list to them with a corosync
   multicast. However, the size of the list can be large when we open
   many images, and in that case, we cannot send the list with one
   mcast message because of the restriction of corosync.  Currently,
   sheepdog sends the list with multiple mcast messages, but it makes
   the codes hard to read.

 - When doing a live migration, qemu needs to open its image on the
   source host and the destination host at the same time, but the
   locking feature prevents it.

 - When qemu crashes, sheepdog needs to detect it and release the
   lock.  However, it is difficult to reliably detect whether VMs are
   still alive if they run outside the cluster.

This patch removes the lock feature and solves the above problems.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 include/sheep.h    |    7 --
 sheep/group.c      |  182 +--------------------------------------------------
 sheep/sdnet.c      |    1 -
 sheep/sheep_priv.h |    3 +-
 sheep/store.c      |   55 ++++------------
 5 files changed, 19 insertions(+), 229 deletions(-)

diff --git a/include/sheep.h b/include/sheep.h
index 47ffd4c..551a9ce 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -120,13 +120,6 @@ struct sd_node_rsp {
 	uint64_t	store_free;
 };
 
-struct sheepdog_vm_list_entry {
-	uint8_t         name[SD_MAX_VDI_LEN];
-	uint8_t         host_addr[16];
-	uint16_t        host_port;
-	uint8_t	        pad[6];
-};
-
 struct sheepdog_node_list_entry {
 	uint64_t        id;
 	uint8_t         addr[16];
diff --git a/sheep/group.c b/sheep/group.c
index ed50390..5516a19 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -24,11 +24,6 @@
 #include "logger.h"
 #include "work.h"
 
-struct vm {
-	struct sheepdog_vm_list_entry ent;
-	struct list_head list;
-};
-
 struct node {
 	uint32_t nodeid;
 	uint32_t pid;
@@ -93,8 +88,6 @@ struct work_confchg {
 	struct cpg_address *joined_list;
 	size_t joined_list_entries;
 
-	uint32_t *failed_vdis;
-	int nr_failed_vdis;
 	int first_cpg_node;
 	int sd_node_left;
 };
@@ -229,20 +222,6 @@ static void get_node_list(struct sd_node_req *req,
 	rsp->master_idx = get_node_idx(&node->ent, data, nr_nodes);
 }
 
-static void get_vm_list(struct sd_rsp *rsp, void *data)
-{
-	int nr_vms;
-	struct vm *vm;
-
-	struct sheepdog_vm_list_entry *p = data;
-	list_for_each_entry(vm, &sys->vm_list, list) {
-		*p++ = vm->ent;
-	}
-
-	nr_vms = p - (struct sheepdog_vm_list_entry *)data;
-	rsp->data_length = nr_vms * sizeof(struct sheepdog_vm_list_entry);
-}
-
 void cluster_queue_request(struct work *work, int idx)
 {
 	struct request *req = container_of(work, struct request, work);
@@ -259,9 +238,6 @@ void cluster_queue_request(struct work *work, int idx)
 		get_node_list((struct sd_node_req *)hdr,
 			      (struct sd_node_rsp *)rsp, req->data);
 		break;
-	case SD_OP_GET_VM_LIST:
-		get_vm_list(rsp, req->data);
-		break;
 	case SD_OP_STAT_CLUSTER:
 		log = (struct epoch_log *)req->data;
 
@@ -329,18 +305,6 @@ forward:
 	free(msg);
 }
 
-static struct vm *lookup_vm(struct list_head *entries, char *name)
-{
-	struct vm *vm;
-
-	list_for_each_entry(vm, entries, list) {
-		if (!strcmp((char *)vm->ent.name, name))
-			return vm;
-	}
-
-	return NULL;
-}
-
 static void group_handler(int listen_fd, int events, void *data)
 {
 	cpg_dispatch(sys->handle, CPG_DISPATCH_ALL);
@@ -677,10 +641,6 @@ static void vdi_op(struct vdi_op_message *msg)
 			      hdr->snapid);
 		break;
 	case SD_OP_DEL_VDI:
-		if (lookup_vm(&sys->vm_list, (char *)data)) {
-			ret = SD_RES_VDI_LOCKED;
-			break;
-		}
 		ret = del_vdi(hdr->epoch, data, hdr->data_length, &vid, hdr->snapid);
 		break;
 	case SD_OP_LOCK_VDI:
@@ -715,7 +675,6 @@ static void vdi_op_done(struct vdi_op_message *msg)
 	const struct sd_vdi_req *hdr = &msg->req;
 	struct sd_vdi_rsp *rsp = &msg->rsp;
 	void *data = msg->data;
-	struct vm *vm;
 	struct request *req;
 	int ret = msg->rsp.result;
 	int i, latest_epoch, nr_nodes;
@@ -741,33 +700,7 @@ static void vdi_op_done(struct vdi_op_message *msg)
 		break;
 	}
 	case SD_OP_LOCK_VDI:
-		if (lookup_vm(&sys->vm_list, (char *)data)) {
-			ret = SD_RES_VDI_LOCKED;
-			break;
-		}
-
-		vm = zalloc(sizeof(*vm));
-		if (!vm) {
-			ret = SD_RES_UNKNOWN;
-			break;
-		}
-		strcpy((char *)vm->ent.name, (char *)data);
-		memcpy(vm->ent.host_addr, msg->header.from.addr,
-		       sizeof(vm->ent.host_addr));
-		vm->ent.host_port = msg->header.from.port;
-
-		list_add(&vm->list, &sys->vm_list);
-		break;
 	case SD_OP_RELEASE_VDI:
-		vm = lookup_vm(&sys->vm_list, (char *)data);
-		if (!vm) {
-			ret = SD_RES_VDI_NOT_LOCKED;
-			break;
-		}
-
-		list_del(&vm->list);
-		free(vm);
-		break;
 	case SD_OP_GET_VDI_INFO:
 		break;
 	case SD_OP_MAKE_FS:
@@ -818,36 +751,6 @@ out:
 	req->done(req);
 }
 
-static void update_running_vm_state(struct cpg_event *cevent)
-{
-	struct work_deliver *w = container_of(cevent, struct work_deliver, cev);
-	struct message_header *m = w->msg;
-	struct sheepdog_vm_list_entry *e;
-	int nr, i;
-	struct vm *vm;
-
-	if (sys->join_finished)
-		goto out;
-
-	/* This is my JOIN message. */
-	vprintf(SDOG_DEBUG "we update the vm list\n");
-
-	nr = (m->msg_length - sizeof(*m)) / sizeof(*e);
-	e = (struct sheepdog_vm_list_entry *)(m + 1);
-
-	for (i = 0; i < nr; i++) {
-		vm = zalloc(sizeof(*vm));
-		if (!vm)
-			panic("failed to allocate memory for a vm\n");
-
-		vm->ent = e[i];
-		vprintf(SDOG_DEBUG "%d, got %s\n", i, e[i].name);
-		list_add(&vm->list, &sys->vm_list);
-	}
-out:
-	cevent->skip = 1;
-}
-
 static void __sd_deliver(struct cpg_event *cevent)
 {
 	struct work_deliver *w = container_of(cevent, struct work_deliver, cev);
@@ -898,9 +801,7 @@ static void __sd_deliver(struct cpg_event *cevent)
 		}
 	}
 
-	if (m->state == DM_CONT)
-		update_running_vm_state(cevent);
-	else if (m->state == DM_FIN) {
+	if (m->state == DM_FIN) {
 		switch (m->op) {
 		case SD_MSG_JOIN:
 			update_cluster_info((struct join_message *)m);
@@ -915,45 +816,6 @@ static void __sd_deliver(struct cpg_event *cevent)
 static void send_join_response(struct work_deliver *w)
 {
 	struct message_header *m;
-	struct vm *vm;
-	struct sheepdog_vm_list_entry *e;
-	int i, nr = 2000;
-	char *buf;
-
-	/*
-	 * FIXME: we need to inform the node of the JOIN failure in
-	 * the case of OOM.
-	 */
-	buf = malloc(sizeof(*m) + sizeof(*e) * nr);
-	m = (struct message_header *)buf;
-	e = (struct sheepdog_vm_list_entry *)(buf + sizeof(*m));
-
-	i = 0;
-	m->state = DM_CONT;
-	m->pid = w->msg->pid;
-	m->nodeid = w->msg->nodeid;
-
-	vprintf(SDOG_DEBUG "%u %u\n", m->pid, m->nodeid);
-
-	list_for_each_entry(vm, &sys->vm_list, list) {
-		*e = vm->ent;
-		vprintf(SDOG_DEBUG "%d %s\n", i, e->name);
-		e++;
-		i++;
-
-		if (!(i % nr)) {
-			m->msg_length = sizeof(*m) + i * sizeof(*e);
-			send_message(sys->handle, m);
-			e = (struct sheepdog_vm_list_entry *)(buf + sizeof(*m));
-			i = 0;
-		}
-	}
-
-	if (i) {
-		m->msg_length = sizeof(*m) + i * sizeof(*e);
-		vprintf(SDOG_DEBUG "%d %d\n", i, m->msg_length);
-		send_message(sys->handle, m);
-	}
 
 	m = w->msg;
 	join((struct join_message *)m);
@@ -992,7 +854,7 @@ static void __sd_deliver_done(struct cpg_event *cevent)
 	}
 
 	if (do_recovery && sys->status == SD_STATUS_OK)
-		start_recovery(sys->epoch, NULL, 0);
+		start_recovery(sys->epoch);
 }
 
 static void sd_deliver(cpg_handle_t handle, const struct cpg_name *group_name,
@@ -1065,43 +927,9 @@ static void del_node(struct cpg_address *addr, struct work_confchg *w)
 	if (node) {
 		int nr;
 		struct sheepdog_node_list_entry e[SD_MAX_NODES];
-		struct vm *vm, *n;
-		int ret, size;
-		uint32_t vid;
-		void *buf;
 
 		w->sd_node_left++;
 
-		size = sizeof(*w->failed_vdis) * 64;
-		w->failed_vdis = malloc(size);
-		list_for_each_entry_safe(vm, n, &sys->vm_list, list) {
-			if (memcmp(vm->ent.host_addr, node->ent.addr,
-				   sizeof(node->ent.addr)) != 0)
-				continue;
-			if (vm->ent.host_port != node->ent.port)
-				continue;
-
-			if (w->nr_failed_vdis * sizeof(*w->failed_vdis) >= size) {
-				size *= 2;
-				buf = realloc(w->failed_vdis, size);
-				if (!buf) {
-					eprintf("out of memory, %d\n", size);
-					break;
-				}
-				w->failed_vdis = buf;
-			}
-
-			ret = lookup_vdi(sys->epoch, (char *)vm->ent.name,
-					 sizeof(vm->ent.name), &vid, 0);
-			if (ret == SD_RES_SUCCESS)
-				w->failed_vdis[w->nr_failed_vdis++] = vid;
-			else
-				eprintf("cannot find vdi %s\n", vm->ent.name);
-
-			list_del(&vm->list);
-			free(vm);
-		}
-
 		list_del(&node->list);
 		free(node);
 
@@ -1224,7 +1052,7 @@ static void __sd_confchg_done(struct cpg_event *cevent)
 		update_cluster_info(&msg);
 
 		if (sys->status == SD_STATUS_OK) /* sheepdog starts with one node */
-			start_recovery(sys->epoch, NULL, 0);
+			start_recovery(sys->epoch);
 
 		return;
 	}
@@ -1243,7 +1071,7 @@ skip_join:
 			panic("we can't handle the departure of multiple nodes %d, %Zd\n",
 			      w->sd_node_left, w->left_list_entries);
 
-		start_recovery(sys->epoch, w->failed_vdis, w->nr_failed_vdis);
+		start_recovery(sys->epoch);
 	}
 }
 
@@ -1255,7 +1083,6 @@ static void cpg_event_free(struct cpg_event *cevent)
 		free(w->member_list);
 		free(w->left_list);
 		free(w->joined_list);
-		free(w->failed_vdis);
 		free(w);
 		break;
 	}
@@ -1732,7 +1559,6 @@ join_retry:
 		sys->status = SD_STATUS_WAIT_FOR_JOIN;
 	INIT_LIST_HEAD(&sys->sd_node_list);
 	INIT_LIST_HEAD(&sys->cpg_node_list);
-	INIT_LIST_HEAD(&sys->vm_list);
 	INIT_LIST_HEAD(&sys->pending_list);
 
 	INIT_LIST_HEAD(&sys->outstanding_req_list);
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 2a6b706..1d38fd9 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -218,7 +218,6 @@ static void queue_request(struct request *req)
 		req->work.fn = store_queue_request;
 		break;
 	case SD_OP_GET_NODE_LIST:
-	case SD_OP_GET_VM_LIST:
 	case SD_OP_NEW_VDI:
 	case SD_OP_DEL_VDI:
 	case SD_OP_LOCK_VDI:
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 32a1bb2..e2b79be 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -116,7 +116,6 @@ struct cluster_info {
 	struct list_head cpg_node_list;
 	struct list_head sd_node_list;
 
-	struct list_head vm_list;
 	struct list_head pending_list;
 
 	DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
@@ -183,7 +182,7 @@ int remove_epoch(int epoch);
 int set_cluster_ctime(uint64_t ctime);
 uint64_t get_cluster_ctime(void);
 
-int start_recovery(uint32_t epoch, uint32_t *failed_vdis, int nr_failed_vdis);
+int start_recovery(uint32_t epoch);
 void resume_recovery_work(void);
 int is_recoverying_oid(uint64_t oid);
 
diff --git a/sheep/store.c b/sheep/store.c
index 4e94afc..69da41a 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -1016,9 +1016,6 @@ struct recovery_work {
 	struct work work;
 	struct list_head rw_siblings;
 
-	unsigned long *failed_vdis;
-	int nr_failed_vdis;
-
 	int count;
 	char *buf;
 };
@@ -1221,7 +1218,6 @@ static void recover_one(struct work *work, int idx)
 	int old_nr, cur_nr;
 	uint32_t epoch = rw->epoch;
 	int i, my_idx = -1, copy_idx = 0, cur_idx = -1;
-	int is_failed_oid = 0;
 
 	eprintf("%"PRIu32" %"PRIu32", %16"PRIx64"\n", rw->done, rw->count, oid);
 
@@ -1249,30 +1245,23 @@ static void recover_one(struct work *work, int idx)
 
 	cur_idx = obj_to_sheep(cur_entry, cur_nr, oid, 0);
 
-	for (i = 0; i < rw->nr_failed_vdis; i++) {
-		if (rw->failed_vdis[i] == oid_to_vid(oid))
-			is_failed_oid = 1;
-	}
-
-	if (!is_failed_oid) {
-		for (i = 0; i < cur_nr; i++) {
-			if (cur_entry[i].id == sys->this_node.id) {
-				my_idx = i;
-				break;
-			}
+	for (i = 0; i < cur_nr; i++) {
+		if (cur_entry[i].id == sys->this_node.id) {
+			my_idx = i;
+			break;
 		}
-		copy_idx = node_distance(my_idx, cur_idx, cur_nr);
-		dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", my_idx, cur_idx, cur_nr, copy_idx);
-
-		ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr,
-				    cur_idx, copy_idx, epoch, epoch - 1, oid,
-				    buf, SD_DATA_OBJ_SIZE);
-		if (ret == 0)
-			goto out;
 	}
+	copy_idx = node_distance(my_idx, cur_idx, cur_nr);
+	dprintf("%"PRIu32", %"PRIu32", %"PRIu32", %"PRIu32"\n", my_idx, cur_idx, cur_nr, copy_idx);
+
+	ret = __recover_one(rw, old_entry, old_nr, cur_entry, cur_nr,
+			    cur_idx, copy_idx, epoch, epoch - 1, oid,
+			    buf, SD_DATA_OBJ_SIZE);
+	if (ret == 0)
+		goto out;
 
 	for (i = 0; i < sys->nr_sobjs; i++) {
-		if (!is_failed_oid && i == copy_idx)
+		if (i == copy_idx)
 			continue;
 		ret = __recover_one(rw, old_entry, old_nr,
 				    cur_entry, cur_nr, cur_idx, i,
@@ -1374,7 +1363,6 @@ static void recover_done(struct work *work, int idx)
 	resume_pending_requests();
 
 	free(rw->buf);
-	free(rw->failed_vdis);
 	free(rw);
 
 	if (!list_empty(&recovery_work_list)) {
@@ -1559,7 +1547,7 @@ fail:
 	return;
 }
 
-int start_recovery(uint32_t epoch, uint32_t *failed_vdis, int nr_failed_vdis)
+int start_recovery(uint32_t epoch)
 {
 	struct recovery_work *rw;
 
@@ -1571,16 +1559,6 @@ int start_recovery(uint32_t epoch, uint32_t *failed_vdis, int nr_failed_vdis)
 	rw->epoch = epoch;
 	rw->count = 0;
 
-	if (failed_vdis) {
-		rw->failed_vdis = malloc(nr_failed_vdis * sizeof(*failed_vdis));
-		if (!rw->failed_vdis) {
-			eprintf("out of memory\n");
-			goto fail;
-		}
-		memcpy(rw->failed_vdis, failed_vdis,
-		       nr_failed_vdis * sizeof(*failed_vdis));
-	}
-
 	rw->work.fn = __start_recovery;
 	rw->work.done = recover_done;
 
@@ -1592,11 +1570,6 @@ int start_recovery(uint32_t epoch, uint32_t *failed_vdis, int nr_failed_vdis)
 	}
 
 	return 0;
-fail:
-	free(rw->buf);
-	free(rw->failed_vdis);
-	free(rw);
-	return -1;
 }
 
 static int init_path(const char *d, int *new)
-- 
1.5.6.5




More information about the sheepdog mailing list