[sheepdog] [PATCH V2 07/11] sheep: mark dirty object that belongs to the leaving nodes

Yunkai Zhang yunkai.me at gmail.com
Thu Aug 9 10:43:45 CEST 2012


From: Yunkai Zhang <qiushu.zyk at taobao.com>

When nodes that have left the cluster join it again, we must recover the dirty
objects; otherwise those nodes may miss the updates made to the objects' data.

So we should mark as dirty each object that belongs to a leaving node and is
modified after that node has left the cluster but before recovery starts.

obj_list_tree is a good place to hold this marking information, so let's store it there.

Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
 include/internal_proto.h  |  3 +++
 include/sheep.h           | 23 ++++++++++++++++++++
 sheep/cluster.h           | 23 --------------------
 sheep/farm/trunk.c        |  2 +-
 sheep/gateway.c           | 34 ++++++++++++++++++++++++++----
 sheep/object_list_cache.c | 53 +++++++++++++++++++++++++++++++++++++++++++----
 sheep/ops.c               | 23 +++++++++++++++++---
 sheep/recovery.c          |  2 +-
 sheep/sheep_priv.h        |  4 ++--
 9 files changed, 129 insertions(+), 38 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index e0ea5cd..742aa1b 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -72,6 +72,9 @@
 #define SD_FLAG_CMD_EXCL     0x0200
 #define SD_FLAG_CMD_DEL      0x0400
 
+/* flag indicating that the object needs to be marked dirty */
+#define SD_FLAG_MARK_DIRTY_OBJECT 0x0800
+
 /* internal error return values, must be above 0x80 */
 #define SD_RES_OLD_NODE_VER  0x81 /* Remote node has an old epoch */
 #define SD_RES_NEW_NODE_VER  0x82 /* Remote node has a new epoch */
diff --git a/include/sheep.h b/include/sheep.h
index 719d18f..6908b00 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -279,4 +279,27 @@ static inline int nodes_to_vnodes(struct sd_node *nodes, int nr,
 	return nr_vnodes;
 }
 
+static inline char *node_to_str(struct sd_node *id)
+{
+	static char str[256];
+	char name[256];
+	int af = AF_INET6;
+	uint8_t *addr = id->nid.addr;
+
+	/* Find address family type */
+	if (addr[12]) {
+		int  oct_no = 0;
+		while (!addr[oct_no] && oct_no++ < 12)
+			;
+		if (oct_no == 12)
+			af = AF_INET;
+	}
+
+	snprintf(str, sizeof(str), "%s ip:%s port:%d",
+		(af == AF_INET) ? "IPv4" : "IPv6",
+		addr_to_str(name, sizeof(name), id->nid.addr, 0), id->nid.port);
+
+	return str;
+}
+
 #endif
diff --git a/sheep/cluster.h b/sheep/cluster.h
index b5c3a30..8b5acd1 100644
--- a/sheep/cluster.h
+++ b/sheep/cluster.h
@@ -150,29 +150,6 @@ static inline const char *get_cdrv_option(struct cluster_driver *cdrv,
 		return NULL;
 }
 
-static inline char *node_to_str(struct sd_node *id)
-{
-	static char str[256];
-	char name[256];
-	int af = AF_INET6;
-	uint8_t *addr = id->nid.addr;
-
-	/* Find address family type */
-	if (addr[12]) {
-		int  oct_no = 0;
-		while (!addr[oct_no] && oct_no++ < 12)
-			;
-		if (oct_no == 12)
-			af = AF_INET;
-	}
-
-	snprintf(str, sizeof(str), "%s ip:%s port:%d",
-		(af == AF_INET) ? "IPv4" : "IPv6",
-		addr_to_str(name, sizeof(name), id->nid.addr, 0), id->nid.port);
-
-	return str;
-}
-
 static inline struct sd_node *str_to_node(const char *str, struct sd_node *id)
 {
 	int port, af = AF_INET6;
diff --git a/sheep/farm/trunk.c b/sheep/farm/trunk.c
index 8a53f9f..c82188a 100644
--- a/sheep/farm/trunk.c
+++ b/sheep/farm/trunk.c
@@ -119,7 +119,7 @@ int trunk_init(void)
 		oid = strtoull(d->d_name, NULL, 16);
 		if (oid == 0 || oid == ULLONG_MAX)
 			continue;
-		objlist_cache_insert(oid);
+		objlist_cache_insert(oid, 0);
 		lookup_trunk_entry(oid, 1);
 	}
 
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 201ce5f..59a4cf5 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -238,7 +238,7 @@ static int gateway_forward_request(struct request *req)
 	struct sd_vnode *v;
 	struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
 	uint64_t oid = req->rq.obj.oid;
-	int nr_copies;
+	int nr_copies, has_left_node = 0;
 	struct write_info wi;
 	struct sd_op_template *op;
 	struct sd_req hdr;
@@ -254,6 +254,32 @@ static int gateway_forward_request(struct request *req)
 	oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
 		      nr_copies, obj_vnodes);
 
+	if (sys->disable_recovery) {
+		int idx, n = 0;
+		struct sd_vnode *vn[SD_MAX_COPIES];
+
+		memcpy(vn, obj_vnodes, nr_copies * sizeof(*vn));
+
+		for (i = 0; i < nr_copies; i++) {
+			idx = vn[i]->node_idx;
+
+			dprintf("obj_vnodes[%d], node_idx:%d, left:%d, %s\n",
+				i, idx, req->vinfo->nodes[idx].left,
+				node_to_str(&req->vinfo->nodes[idx]));
+
+			if (!vnode_has_left(req->vinfo, vn[i]))
+				obj_vnodes[n++] = vn[i];
+		}
+
+		if (n < nr_copies) {
+			nr_copies = n;
+			has_left_node = 1;
+		}
+	}
+
+	if (has_left_node)
+		hdr.flags |= SD_FLAG_MARK_DIRTY_OBJECT;
+
 	for (i = 0; i < nr_copies; i++) {
 		struct sockfd *sfd;
 
@@ -263,9 +289,6 @@ static int gateway_forward_request(struct request *req)
 			continue;
 		}
 
-		if (sys->disable_recovery && vnode_has_left(req->vinfo, v))
-			continue;
-
 		sfd = sheep_get_sockfd(&v->nid);
 		if (!sfd) {
 			err_ret = SD_RES_NETWORK_ERROR;
@@ -285,6 +308,9 @@ static int gateway_forward_request(struct request *req)
 	if (local != -1 && err_ret == SD_RES_SUCCESS) {
 		v = obj_vnodes[local];
 
+		if (has_left_node)
+			req->rq.flags |= SD_FLAG_MARK_DIRTY_OBJECT;
+
 		assert(op);
 		ret = sheep_do_op_work(op, req);
 
diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
index df94dce..fe6b5bd 100644
--- a/sheep/object_list_cache.c
+++ b/sheep/object_list_cache.c
@@ -25,6 +25,12 @@ struct objlist_cache_entry {
 	uint64_t oid;
 	struct list_head list;
 	struct rb_node node;
+
+	/* This flag indicates whether this object became
+	 * dirty while cluster recovery was disabled:
+	 * 0: clean, 1: updated, 2: deleted
+	 */
+	uint8_t dirty_flag;
 };
 
 struct objlist_cache {
@@ -49,6 +55,28 @@ struct objlist_cache obj_list_cache = {
 	.lock		= PTHREAD_RWLOCK_INITIALIZER,
 };
 
+static struct objlist_cache_entry *objlist_cache_rb_find(struct rb_root *root,
+							 uint64_t oid)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct objlist_cache_entry *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct objlist_cache_entry, node);
+
+		if (oid < entry->oid)
+			p = &(*p)->rb_left;
+		else if (oid > entry->oid)
+			p = &(*p)->rb_right;
+		else
+			return entry; /* found it */
+	}
+
+	return NULL;
+}
+
 static struct objlist_cache_entry *objlist_cache_rb_insert(struct rb_root *root,
 		struct objlist_cache_entry *new)
 {
@@ -64,8 +92,10 @@ static struct objlist_cache_entry *objlist_cache_rb_insert(struct rb_root *root,
 			p = &(*p)->rb_left;
 		else if (new->oid > entry->oid)
 			p = &(*p)->rb_right;
-		else
+		else {
+			entry->dirty_flag = new->dirty_flag;
 			return entry; /* already has this entry */
+		}
 	}
 	rb_link_node(&new->node, parent, p);
 	rb_insert_color(&new->node, root);
@@ -98,17 +128,31 @@ static int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid)
 	return -1; /* fail to remove */
 }
 
-void objlist_cache_remove(uint64_t oid)
+void objlist_cache_remove(uint64_t oid, int has_left_node)
 {
+	struct objlist_cache_entry *entry;
+
 	pthread_rwlock_wrlock(&obj_list_cache.lock);
-	if (!objlist_cache_rb_remove(&obj_list_cache.root, oid)) {
+	if (has_left_node) {
+		/*
+		 * Leaving nodes need this entry to record the deletion, so
+		 * we must not remove it directly; it will be cleaned up when
+		 * recovery is enabled.
+		 */
+		entry = objlist_cache_rb_find(&obj_list_cache.root, oid);
+		if (entry) {
+			entry->dirty_flag = 2;
+			obj_list_cache.cache_size--;
+			obj_list_cache.tree_version++;
+		}
+	} else if (!objlist_cache_rb_remove(&obj_list_cache.root, oid)) {
 		obj_list_cache.cache_size--;
 		obj_list_cache.tree_version++;
 	}
 	pthread_rwlock_unlock(&obj_list_cache.lock);
 }
 
-int objlist_cache_insert(uint64_t oid)
+int objlist_cache_insert(uint64_t oid, int dirty_flag)
 {
 	struct objlist_cache_entry *entry, *p;
 
@@ -120,6 +164,7 @@ int objlist_cache_insert(uint64_t oid)
 	}
 
 	entry->oid = oid;
+	entry->dirty_flag = dirty_flag;
 	rb_init_node(&entry->node);
 
 	pthread_rwlock_wrlock(&obj_list_cache.lock);
diff --git a/sheep/ops.c b/sheep/ops.c
index 38634f5..de65b5d 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -731,8 +731,12 @@ out:
 int peer_remove_obj(struct request *req)
 {
 	uint64_t oid = req->rq.obj.oid;
+	int has_left_node = 0;
 
-	objlist_cache_remove(oid);
+	if (req->rq.flags & SD_FLAG_MARK_DIRTY_OBJECT)
+		has_left_node = 1;
+
+	objlist_cache_remove(oid, has_left_node);
 	object_cache_remove(oid);
 
 	return sd_store->remove_object(oid);
@@ -799,11 +803,20 @@ int peer_write_obj(struct request *req)
 	struct sd_req *hdr = &req->rq;
 	uint32_t epoch = hdr->epoch;
 	struct siocb iocb;
+	int ret, has_left_node = 0;
+
+	if (req->rq.flags & SD_FLAG_MARK_DIRTY_OBJECT)
+		has_left_node = 1;
 
 	memset(&iocb, 0, sizeof(iocb));
 	iocb.epoch = epoch;
 	iocb.flags = hdr->flags;
-	return do_write_obj(&iocb, hdr, epoch, req->data, 0);
+	ret = do_write_obj(&iocb, hdr, epoch, req->data, 0);
+
+	if (SD_RES_SUCCESS == ret)
+		objlist_cache_insert(req->rq.obj.oid, has_left_node);
+
+	return ret;
 }
 
 int peer_create_and_write_obj(struct request *req)
@@ -815,6 +828,10 @@ int peer_create_and_write_obj(struct request *req)
 	char *buf = NULL;
 	struct siocb iocb;
 	int ret = SD_RES_SUCCESS;
+	int has_left_node = 0;
+
+	if (req->rq.flags & SD_FLAG_MARK_DIRTY_OBJECT)
+		has_left_node = 1;
 
 	memset(&iocb, 0, sizeof(iocb));
 	iocb.epoch = epoch;
@@ -847,7 +864,7 @@ int peer_create_and_write_obj(struct request *req)
 		ret = do_write_obj(&iocb, hdr, epoch, req->data, 1);
 
 	if (SD_RES_SUCCESS == ret)
-		objlist_cache_insert(oid);
+		objlist_cache_insert(oid, has_left_node);
 out:
 	if (buf)
 		free(buf);
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 5164aa7..d74810f 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -135,7 +135,7 @@ done:
 	dprintf("recovered oid %"PRIx64" from %d to epoch %d\n", oid, tgt_epoch, epoch);
 out:
 	if (ret == SD_RES_SUCCESS)
-		objlist_cache_insert(oid);
+		objlist_cache_insert(oid, 0);
 	free(buf);
 	return ret;
 }
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index b530f71..2c3e195 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -276,8 +276,8 @@ void local_req_init(void);
 
 int prealloc(int fd, uint32_t size);
 
-int objlist_cache_insert(uint64_t oid);
-void objlist_cache_remove(uint64_t oid);
+int objlist_cache_insert(uint64_t oid, int has_left_node);
+void objlist_cache_remove(uint64_t oid, int has_left_node);
 
 void put_request(struct request *req);
 
-- 
1.7.11.2




More information about the sheepdog mailing list