[sheepdog] [PATCH v3] sheep: support disks larger than 2T

Liu Yuan namei.unix at gmail.com
Thu Jan 17 16:43:13 CET 2013


From: Liu Yuan <tailai.ly at taobao.com>

Currently we only support disks smaller than 2T as backend storage, due to a
hardcoded 4M list buffer. The underlying object list cache already returns an
error code when the passed buffer is too small. Simply retrying with a larger
buffer in this case allows us to use arbitrarily large disks and removes this
limit.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 include/internal_proto.h  |    1 +
 sheep/object_list_cache.c |    2 +-
 sheep/recovery.c          |   61 +++++++++++++++++++++++++++------------------
 3 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index 364a478..3677405 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -86,6 +86,7 @@
 #define SD_RES_INVALID_EPOCH 0x85 /* Invalid epoch */
 #define SD_RES_NETWORK_ERROR 0x86 /* Network error between sheep */
 #define SD_RES_NO_CACHE      0x87 /* No cache object found */
+#define SD_RES_BUFFER_SMALL  0x88 /* The buffer is too small */
 
 #define SD_FLAG_NOHALT       0x0004 /* Serve the IO rquest even lack of nodes */
 #define SD_FLAG_QUORUM       0x0008 /* Serve the IO rquest as long we are quorate */
diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
index 0ad505e..c6b85ee 100644
--- a/sheep/object_list_cache.c
+++ b/sheep/object_list_cache.c
@@ -164,7 +164,7 @@ out:
 	if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) {
 		pthread_rwlock_unlock(&obj_list_cache.lock);
 		eprintf("GET_OBJ_LIST buffer too small\n");
-		return SD_RES_EIO;
+		return SD_RES_BUFFER_SMALL;
 	}
 
 	rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t);
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 94c5501..fcfc33a 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -45,6 +45,9 @@ struct recovery_work {
 
 static struct recovery_work *next_rw;
 static struct recovery_work *recovering_work;
+/* Dynamically grown list buffer default as 4M (2T storage) */
+#define DEFAULT_LIST_BUFFER_SIZE (UINT64_C(1) << 22)
+static size_t list_buffer_size = DEFAULT_LIST_BUFFER_SIZE;
 
 static int obj_cmp(const void *oid1, const void *oid2)
 {
@@ -380,7 +383,7 @@ static inline void finish_schedule_oids(struct recovery_work *rw)
 	if (nr_recovered == rw->count - 1)
 		goto done;
 
-	new_oids = xmalloc(1 << 20); /* FIXME */
+	new_oids = xmalloc(list_buffer_size);
 	memcpy(new_oids, rw->oids, nr_recovered * sizeof(uint64_t));
 	memcpy(new_oids + nr_recovered, rw->prio_oids,
 	       rw->nr_prio_oids * sizeof(uint64_t));
@@ -503,35 +506,45 @@ static void finish_object_list(struct work *work)
 }
 
 /* Fetch the object list from all the nodes in the cluster */
-static int fetch_object_list(struct sd_node *e, uint32_t epoch,
-			     uint8_t *buf, size_t buf_size)
+static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch,
+				   size_t *nr_oids)
 {
 	char name[128];
 	struct sd_list_req hdr;
 	struct sd_list_rsp *rsp = (struct sd_list_rsp *)&hdr;
+	size_t buf_size = list_buffer_size;
+	uint64_t *buf = xmalloc(buf_size);
 	int ret;
 
 	addr_to_str(name, sizeof(name), e->nid.addr, 0);
-
 	dprintf("%s %"PRIu32"\n", name, e->nid.port);
 
+retry:
 	sd_init_req((struct sd_req *)&hdr, SD_OP_GET_OBJ_LIST);
 	hdr.tgt_epoch = epoch - 1;
 	hdr.data_length = buf_size;
-
 	ret = sheep_exec_req(&e->nid, (struct sd_req *)&hdr, buf);
 
-	if (ret != SD_RES_SUCCESS)
-		return -1;
-
-	dprintf("%zu\n", rsp->data_length / sizeof(uint64_t));
+	switch (ret) {
+	case SD_RES_SUCCESS:
+		break;
+	case SD_RES_BUFFER_SMALL:
+		buf_size *= 2;
+		buf = xrealloc(buf, buf_size);
+		goto retry;
+	default:
+		free(buf);
+		return NULL;
+	}
 
-	return rsp->data_length / sizeof(uint64_t);
+	*nr_oids = rsp->data_length / sizeof(uint64_t);
+	dprintf("%zu\n", *nr_oids);
+	return buf;
 }
 
 /* Screen out objects that don't belong to this node */
 static void screen_object_list(struct recovery_work *rw,
-			       uint64_t *oids, int nr_oids)
+			       uint64_t *oids, size_t nr_oids)
 {
 	const struct sd_vnode *vnodes[SD_MAX_COPIES];
 	int old_count = rw->count;
@@ -555,6 +568,11 @@ static void screen_object_list(struct recovery_work *rw,
 				continue;
 
 			rw->oids[rw->count++] = oids[i];
+			/* enlarge the list buffer if full */
+			if (rw->count == list_buffer_size / sizeof(uint64_t)) {
+				list_buffer_size *= 2;
+				rw->oids = xrealloc(rw->oids, list_buffer_size);
+			}
 			break;
 		}
 	}
@@ -575,35 +593,32 @@ static void prepare_object_list(struct work *work)
 {
 	struct recovery_work *rw = container_of(work, struct recovery_work,
 						work);
-	uint8_t *buf = NULL;
-	size_t buf_size = SD_DATA_OBJ_SIZE; /* FIXME */
 	struct sd_node *cur = rw->cur_vinfo->nodes;
 	int cur_nr = rw->cur_vinfo->nr_nodes;
 	int start = random() % cur_nr, i, end = cur_nr;
+	uint64_t *oids;
 
 	dprintf("%u\n", rw->epoch);
-
 	wait_get_vdis_done();
-
-	buf = xmalloc(buf_size);
 again:
 	/* We need to start at random node for better load balance */
 	for (i = start; i < end; i++) {
-		int buf_nr;
+		size_t nr_oids;
 		struct sd_node *node = cur + i;
 
 		if (next_rw) {
 			dprintf("go to the next recovery\n");
-			goto out;
+			return;
 		}
 		if (newly_joined(node, rw))
 			/* new node doesn't have a list file */
 			continue;
 
-		buf_nr = fetch_object_list(node, rw->epoch, buf, buf_size);
-		if (buf_nr < 0)
+		oids = fetch_object_list(node, rw->epoch, &nr_oids);
+		if (!oids)
 			continue;
-		screen_object_list(rw, (uint64_t *)buf, buf_nr);
+		screen_object_list(rw, oids, nr_oids);
+		free(oids);
 	}
 
 	if (start != 0) {
@@ -613,8 +628,6 @@ again:
 	}
 
 	dprintf("%d\n", rw->count);
-out:
-	free(buf);
 }
 
 static inline bool node_is_gateway_only(void)
@@ -636,7 +649,7 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
 	}
 
 	rw->state = RW_INIT;
-	rw->oids = xmalloc(1 << 20); /* FIXME */
+	rw->oids = xmalloc(list_buffer_size);
 	rw->epoch = sys->epoch;
 	rw->count = 0;
 
-- 
1.7.9.5




More information about the sheepdog mailing list