[sheepdog] [PATCH 3/4] sheep: implement coherence protocol of inode objects in address spaces of clients

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Tue Aug 5 06:53:52 CEST 2014


The MSI states introduced by the previous patch should be updated via
gateway read and write of inode objects (a read makes the state of
every node shared; a write makes the writer's state modified and
invalidates the others). If its state is invalidated, a client is
forced to reload the inode, as in the snapshot case (a new return code,
SD_RES_INODE_INVALIDATED, is used for this purpose). This patch
implements the coherence protocol.

The MSI states are updated via the atomic broadcast provided by the
cluster managers. The update is therefore a costly operation, but it
happens rarely, so it shouldn't be problematic.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 include/internal_proto.h |   1 +
 include/sheepdog_proto.h |   6 ++
 sheep/gateway.c          |  24 ++++++-
 sheep/ops.c              |  18 +++++
 sheep/sheep_priv.h       |   5 ++
 sheep/vdi.c              | 170 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 222 insertions(+), 2 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index 9f3bd6d..c3dc3de 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -109,6 +109,7 @@
 #define SD_OP_REPAIR_REPLICA	0xC5
 #define SD_OP_OIDS_EXIST	0xC6
 #define SD_OP_VDI_STATE_SNAPSHOT_CTL  0xC7
+#define SD_OP_INODE_COHERENCE 0xC8
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index df0667f..654d7ac 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -84,6 +84,7 @@
 #define SD_RES_READONLY      0x1A /* Object is read-only */
 #define SD_RES_INCOMPLETE    0x1B /* Object (in kv) is incomplete uploading */
 #define SD_RES_COLLECTING_CINFO 0x1C /* sheep is collecting cluster wide status, not ready for operation */
+#define SD_RES_INODE_INVALIDATED 0x1D /* inode object in client is invalidated, refreshing is required */
 
 /* errors above 0x80 are sheepdog-internal */
 
@@ -194,6 +195,11 @@ struct sd_req {
 			uint32_t        get; /* 0 means free, 1 means get */
 			uint32_t        tgt_epoch;
 		} vdi_state_snapshot;
+		struct {
+			uint32_t        vid;
+			/* 1 means validate, 0 means invalidate */
+			uint32_t        validate;
+		} inode_coherence;
 
 
 		uint32_t		__pad[8];
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 8868bce..062bd90 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -616,14 +616,27 @@ static int update_obj_refcnt(const struct sd_req *hdr, uint32_t *vids,
 int gateway_read_obj(struct request *req)
 {
 	uint64_t oid = req->rq.obj.oid;
+	int ret;
+
+	/*
+	 * Refuse reads of data objects while this node's view of the inode
+	 * is invalidated: the client must reload the inode first.  Reads of
+	 * the inode object itself are allowed -- they are the reload.
+	 */
+	if (!is_vdi_obj(oid) && is_refresh_required(oid_to_vid(oid))) {
+		sd_debug("refresh is required: %"PRIx64, oid);
+		return SD_RES_INODE_INVALIDATED;
+	}
 
+	/* XXX: object cache and iSCSI multipath cannot coexist */
 	if (!bypass_object_cache(req))
 		return object_cache_handle_request(req);
 
 	if (is_erasure_oid(oid))
-		return gateway_forward_request(req);
+		ret = gateway_forward_request(req);
 	else
-		return gateway_replication_read(req);
+		ret = gateway_replication_read(req);
+
+	if (ret != SD_RES_SUCCESS)
+		return ret;
+
+	/* a successful read moves this node's MSI state back to shared */
+	validate_myself(oid_to_vid(oid));
+	return ret;
 }
 
 int gateway_write_obj(struct request *req)
@@ -634,6 +647,11 @@ int gateway_write_obj(struct request *req)
 	uint32_t *vids = NULL, *new_vids = req->data;
 	struct generation_reference *refs = NULL;
 
+	if (is_refresh_required(oid_to_vid(oid))) {
+		sd_debug("refresh is required: %"PRIx64, oid);
+		return SD_RES_INODE_INVALIDATED;
+	}
+
 	if (oid_is_readonly(oid))
 		return SD_RES_READONLY;
 
@@ -660,6 +678,8 @@ int gateway_write_obj(struct request *req)
 		update_obj_refcnt(hdr, vids, new_vids, refs);
 	}
 out:
+	invalidate_other_nodes(oid_to_vid(oid));
+
 	free(vids);
 	free(refs);
 	return ret;
diff --git a/sheep/ops.c b/sheep/ops.c
index 259616d..9688991 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -1401,6 +1401,18 @@ static int local_vdi_state_snapshot_ctl(const struct sd_req *req,
 	return SD_RES_SUCCESS;
 }
 
+/*
+ * Handler for SD_OP_INODE_COHERENCE, executed on every node via the
+ * cluster manager's atomic broadcast.  Applies a validate/invalidate
+ * event originating from @sender to the local MSI state of the VID.
+ */
+static int cluster_inode_coherence(const struct sd_req *req,
+				   struct sd_rsp *rsp, void *data,
+				   const struct sd_node *sender)
+{
+	sd_debug("inode coherence: %s %"PRIx32" from %s",
+		 req->inode_coherence.validate ? "validate" : "invalidate",
+		 req->inode_coherence.vid, node_to_str(sender));
+
+	return inode_coherence_update(req->inode_coherence.vid,
+			       !!req->inode_coherence.validate, &sender->nid);
+}
+
 static struct sd_op_template sd_ops[] = {
 
 	/* cluster operations */
@@ -1540,6 +1552,12 @@ static struct sd_op_template sd_ops[] = {
 		.process_main = cluster_alter_vdi_copy,
 	},
 
+	[SD_OP_INODE_COHERENCE] = {
+		.name = "INODE_COHERENCE",
+		.type = SD_OP_TYPE_CLUSTER,
+		.process_main = cluster_inode_coherence,
+	},
+
 	/* local operations */
 
 	[SD_OP_GET_STORE_LIST] = {
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 1fa5274..94a3a4f 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -347,6 +347,11 @@ void free_vdi_state_snapshot(int epoch);
 void log_vdi_op_lock(uint32_t vid, const struct node_id *owner, int type);
 void log_vdi_op_unlock(uint32_t vid, const struct node_id *owner, int type);
 void play_logged_vdi_ops(void);
+bool is_refresh_required(uint32_t vid);
+void validate_myself(uint32_t vid);
+void invalidate_other_nodes(uint32_t vid);
+int inode_coherence_update(uint32_t vid, bool validate,
+			   const struct node_id *sender);
 
 extern int ec_max_data_strip;
 
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 5f41e74..b8212d9 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -643,6 +643,176 @@ void play_logged_vdi_ops(void)
 	}
 }
 
+/*
+ * Return true when this node's copy of VID's inode object has been
+ * invalidated by a write on another node, i.e. the client must reload
+ * the inode before further I/O.  Only meaningful for non-snapshot VDIs
+ * locked in shared (multipath) mode; every other case reports false.
+ */
+worker_fn bool is_refresh_required(uint32_t vid)
+{
+	struct vdi_state_entry *entry;
+
+	sd_read_lock(&vdi_state_lock);
+	entry = vdi_state_search(&vdi_state_root, vid);
+	/*
+	 * NOTE(review): entry is dereferenced below after the lock is
+	 * dropped -- presumably vdi_state entries are never freed while the
+	 * VDI is in use; confirm, otherwise this is a use-after-free window.
+	 */
+	sd_rw_unlock(&vdi_state_lock);
+
+	if (!entry) {
+		sd_alert("VID: %"PRIx32" doesn't exist", vid);
+		return false;
+	}
+
+	if (entry->snapshot)
+		return false;
+
+	if (entry->lock_state != LOCK_STATE_SHARED)
+		return false;
+
+	/* nothing to refresh unless some node holds the modified state */
+	if (!is_modified(entry))
+		return false;
+
+	/* find this node among the participants and inspect its own state */
+	for (int i = 0; i < entry->nr_participants; i++) {
+		if (node_id_cmp(&entry->participants[i], &sys->this_node.nid))
+			continue;
+
+		if (entry->participants_state[i] ==
+		    SHARED_LOCK_STATE_INVALIDATED)
+			return true;
+		else
+			return false;
+	}
+
+	sd_alert("this node isn't locking VID: %"PRIx32, vid);
+	return false;
+}
+
+/*
+ * If this node's MSI state for VID is invalidated, broadcast an
+ * SD_OP_INODE_COHERENCE "validate" event so every participant's state
+ * becomes shared again.  Called from the gateway read path after a
+ * successful read; a no-op for snapshots, non-shared locks, and when
+ * this node's state is not invalidated.
+ */
+worker_fn void validate_myself(uint32_t vid)
+{
+	struct vdi_state_entry *entry;
+	struct sd_req hdr;
+	int ret;
+
+	sd_read_lock(&vdi_state_lock);
+	entry = vdi_state_search(&vdi_state_root, vid);
+	/* NOTE(review): entry used after unlock -- see is_refresh_required() */
+	sd_rw_unlock(&vdi_state_lock);
+
+	if (!entry) {
+		sd_alert("VID: %"PRIx32" doesn't exist", vid);
+		return;
+	}
+
+	if (entry->snapshot)
+		return;
+
+	if (entry->lock_state != LOCK_STATE_SHARED)
+		return;
+
+	for (int i = 0; i < entry->nr_participants; i++) {
+		if (node_id_cmp(&entry->participants[i], &sys->this_node.nid))
+			continue;
+
+		/* broadcast only when our own state actually is invalidated */
+		if (entry->participants_state[i] !=
+		    SHARED_LOCK_STATE_INVALIDATED)
+			return;
+		goto validate;
+	}
+
+	sd_alert("this node isn't locking VID: %"PRIx32, vid);
+	return;
+
+validate:
+	sd_init_req(&hdr, SD_OP_INODE_COHERENCE);
+	hdr.inode_coherence.vid = vid;
+	hdr.inode_coherence.validate = 1;
+	ret = sheep_exec_req(&sys->this_node.nid, &hdr, NULL);
+	if (ret == SD_RES_SUCCESS)
+		return;
+
+	sd_err("failed to validate VID: %"PRIx32" by %s",
+	       vid, node_id_to_str(&sys->this_node.nid));
+}
+
+/*
+ * Broadcast an SD_OP_INODE_COHERENCE "invalidate" event for VID so that
+ * every participant except this node moves to the invalidated state and
+ * this node becomes the modified owner (MSI write transition).  Called
+ * from the gateway write path; a no-op unless the VDI is locked in
+ * shared (multipath) mode, or when this node already owns the modified
+ * state.
+ */
+worker_fn void invalidate_other_nodes(uint32_t vid)
+{
+	struct vdi_state_entry *entry;
+	struct sd_req hdr;
+	int ret;
+
+	sd_read_lock(&vdi_state_lock);
+	entry = vdi_state_search(&vdi_state_root, vid);
+	/* NOTE(review): entry used after unlock -- see is_refresh_required() */
+	sd_rw_unlock(&vdi_state_lock);
+
+	if (!entry) {
+		sd_alert("VID: %"PRIx32" doesn't exist", vid);
+		return;
+	}
+
+	if (entry->lock_state != LOCK_STATE_SHARED)
+		return;
+
+	for (int i = 0; i < entry->nr_participants; i++) {
+		if (node_id_cmp(&entry->participants[i], &sys->this_node.nid))
+			continue;
+
+		/* broadcast unless we already own the modified state */
+		if (entry->participants_state[i] !=
+		    SHARED_LOCK_STATE_MODIFIED)
+			goto invalidate;
+
+		/* already owned by myself */
+		return;
+	}
+
+	sd_alert("this node isn't locking VID: %"PRIx32, vid);
+	return;
+
+invalidate:
+	sd_init_req(&hdr, SD_OP_INODE_COHERENCE);
+	hdr.inode_coherence.vid = vid;
+	hdr.inode_coherence.validate = 0;
+	ret = sheep_exec_req(&sys->this_node.nid, &hdr, NULL);
+	if (ret == SD_RES_SUCCESS)
+		return;
+
+	sd_err("failed to invalidate VID: %"PRIx32" by %s",
+	       vid, node_id_to_str(&sys->this_node.nid));
+}
+
+/*
+ * Apply a coherence event to VID's per-participant MSI states.
+ *
+ * @validate true: a read completed somewhere -- every participant
+ * becomes shared.  @validate false: @sender wrote -- it becomes the
+ * modified owner and all other participants are invalidated.  Returns
+ * SD_RES_NO_VDI when the VID is unknown or @sender isn't a participant.
+ */
+main_fn int inode_coherence_update(uint32_t vid, bool validate,
+				   const struct node_id *sender)
+{
+	struct vdi_state_entry *entry;
+	/* despite the name, this tracks "sender found among participants" */
+	bool invalidated = false;
+
+	sd_read_lock(&vdi_state_lock);
+	entry = vdi_state_search(&vdi_state_root, vid);
+	/*
+	 * NOTE(review): participants_state is written below after the read
+	 * lock is released -- presumably safe because this runs in the main
+	 * thread (main_fn); confirm against concurrent worker readers.
+	 */
+	sd_rw_unlock(&vdi_state_lock);
+
+	if (!entry) {
+		sd_alert("VID: %"PRIx32" doesn't exist", vid);
+		return SD_RES_NO_VDI;
+	}
+
+	assert(entry->lock_state == LOCK_STATE_SHARED);
+
+	if (validate) {
+		/* read transition: everyone shares the inode again */
+		for (int i = 0; i < entry->nr_participants; i++)
+			entry->participants_state[i] = SHARED_LOCK_STATE_SHARED;
+	} else {
+		/* write transition: sender owns, everyone else invalidated */
+		for (int i = 0; i < entry->nr_participants; i++) {
+			if (node_id_cmp(&entry->participants[i], sender))
+				entry->participants_state[i] =
+					SHARED_LOCK_STATE_INVALIDATED;
+			else {
+				entry->participants_state[i] =
+					SHARED_LOCK_STATE_MODIFIED;
+				invalidated = true;
+			}
+		}
+
+		if (!invalidated) {
+			sd_err("%s isn't participating in VID: %"PRIx32,
+			       node_id_to_str(sender), vid);
+			return SD_RES_NO_VDI;
+		}
+	}
+
+	return SD_RES_SUCCESS;
+}
+
 static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
 				    uint32_t new_snapid, uint32_t new_vid,
 				    uint32_t *data_vdi_id,
-- 
1.8.3.2




More information about the sheepdog mailing list