[sheepdog] [PATCH v2] sheep: support disk more than 2T

Thu Jan 17 12:11:27 CET 2013

From: Liu Yuan <tailai.ly at taobao.com>

Currently we only support backend storage disks smaller than 2T because of the
hardcoded 4M list buffer. The underlying object list cache already returns an
error code when the passed buffer is too small. Simply retrying with a larger
buffer in this case lets us use disks of any size and removes this limit.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 v2: correct xmalloc size in finish_schedule_oids()

 include/sheepdog_proto.h  |    1 +
 sheep/object_list_cache.c |    2 +-
 sheep/recovery.c          |   61 +++++++++++++++++++++++++++------------------
 3 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 865f451..5942c76 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -74,6 +74,7 @@
 #define SD_RES_KILLED           0x24 /* Node is killed */
 #define SD_RES_OID_EXIST        0x25 /* Object ID exists already */
 #define SD_RES_AGAIN            0x26 /* Ask to try again */
+#define SD_RES_BUFFER_SMALL     0x27 /* The buffer is too small */
 
 /* errors above 0x80 are sheepdog-internal */
 
diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
index 0ad505e..c6b85ee 100644
--- a/sheep/object_list_cache.c
+++ b/sheep/object_list_cache.c
@@ -164,7 +164,7 @@ out:
 	if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) {
 		pthread_rwlock_unlock(&obj_list_cache.lock);
 		eprintf("GET_OBJ_LIST buffer too small\n");
-		return SD_RES_EIO;
+		return SD_RES_BUFFER_SMALL;
 	}
 
 	rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t);
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 94c5501..464ef99 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -380,7 +380,7 @@ static inline void finish_schedule_oids(struct recovery_work *rw)
 	if (nr_recovered == rw->count - 1)
 		goto done;
 
-	new_oids = xmalloc(1 << 20); /* FIXME */
+	new_oids = xmalloc(list_buffer_size);
 	memcpy(new_oids, rw->oids, nr_recovered * sizeof(uint64_t));
 	memcpy(new_oids + nr_recovered, rw->prio_oids,
 	       rw->nr_prio_oids * sizeof(uint64_t));
@@ -502,36 +502,49 @@ static void finish_object_list(struct work *work)
 	return;
 }
 
+/* Dynamically grown list buffer default as 4M (2T storage) */
+#define DEFAULT_LIST_BUFFER_SIZE (UINT64_C(1) << 22)
+static size_t list_buffer_size = DEFAULT_LIST_BUFFER_SIZE;
 /* Fetch the object list from all the nodes in the cluster */
-static int fetch_object_list(struct sd_node *e, uint32_t epoch,
-			     uint8_t *buf, size_t buf_size)
+static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch,
+				   size_t *nr_oids)
 {
 	char name[128];
 	struct sd_list_req hdr;
 	struct sd_list_rsp *rsp = (struct sd_list_rsp *)&hdr;
+	size_t buf_size = list_buffer_size;
+	uint64_t *buf = xmalloc(buf_size);
 	int ret;
 
 	addr_to_str(name, sizeof(name), e->nid.addr, 0);
-
 	dprintf("%s %"PRIu32"\n", name, e->nid.port);
 
+retry:
 	sd_init_req((struct sd_req *)&hdr, SD_OP_GET_OBJ_LIST);
 	hdr.tgt_epoch = epoch - 1;
 	hdr.data_length = buf_size;
-
 	ret = sheep_exec_req(&e->nid, (struct sd_req *)&hdr, buf);
 
-	if (ret != SD_RES_SUCCESS)
-		return -1;
-
-	dprintf("%zu\n", rsp->data_length / sizeof(uint64_t));
+	switch (ret) {
+	case SD_RES_SUCCESS:
+		break;
+	case SD_RES_BUFFER_SMALL:
+		buf_size *= 2;
+		buf = xrealloc(buf, buf_size);
+		goto retry;
+	default:
+		free(buf);
+		return NULL;
+	}
 
-	return rsp->data_length / sizeof(uint64_t);
+	*nr_oids = rsp->data_length / sizeof(uint64_t);
+	dprintf("%zu\n", *nr_oids);
+	return buf;
 }
 
 /* Screen out objects that don't belong to this node */
 static void screen_object_list(struct recovery_work *rw,
-			       uint64_t *oids, int nr_oids)
+			       uint64_t *oids, size_t nr_oids)
 {
 	const struct sd_vnode *vnodes[SD_MAX_COPIES];
 	int old_count = rw->count;
@@ -555,6 +568,11 @@ static void screen_object_list(struct recovery_work *rw,
 				continue;
 
 			rw->oids[rw->count++] = oids[i];
+			/* enlarge the list buffer if full */
+			if (rw->count == list_buffer_size / sizeof(uint64_t)) {
+				list_buffer_size *= 2;
+				rw->oids = xrealloc(rw->oids, list_buffer_size);
+			}
 			break;
 		}
 	}
@@ -575,35 +593,32 @@ static void prepare_object_list(struct work *work)
 {
 	struct recovery_work *rw = container_of(work, struct recovery_work,
 						work);
-	uint8_t *buf = NULL;
-	size_t buf_size = SD_DATA_OBJ_SIZE; /* FIXME */
 	struct sd_node *cur = rw->cur_vinfo->nodes;
 	int cur_nr = rw->cur_vinfo->nr_nodes;
 	int start = random() % cur_nr, i, end = cur_nr;
+	uint64_t *oids;
 
 	dprintf("%u\n", rw->epoch);
-
 	wait_get_vdis_done();
-
-	buf = xmalloc(buf_size);
 again:
 	/* We need to start at random node for better load balance */
 	for (i = start; i < end; i++) {
-		int buf_nr;
+		size_t nr_oids;
 		struct sd_node *node = cur + i;
 
 		if (next_rw) {
 			dprintf("go to the next recovery\n");
-			goto out;
+			return;
 		}
 		if (newly_joined(node, rw))
 			/* new node doesn't have a list file */
 			continue;
 
-		buf_nr = fetch_object_list(node, rw->epoch, buf, buf_size);
-		if (buf_nr < 0)
+		oids = fetch_object_list(node, rw->epoch, &nr_oids);
+		if (!oids)
 			continue;
-		screen_object_list(rw, (uint64_t *)buf, buf_nr);
+		screen_object_list(rw, oids, nr_oids);
+		free(oids);
 	}
 
 	if (start != 0) {
@@ -613,8 +628,6 @@ again:
 	}
 
 	dprintf("%d\n", rw->count);
-out:
-	free(buf);
 }
 
 static inline bool node_is_gateway_only(void)
@@ -636,7 +649,7 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
 	}
 
 	rw->state = RW_INIT;
-	rw->oids = xmalloc(1 << 20); /* FIXME */
+	rw->oids = xmalloc(list_buffer_size);
 	rw->epoch = sys->epoch;
 	rw->count = 0;
 
-- 
1.7.9.5