[sheepdog] [RFC PATCH] object cache: revert object_cache_pull() to older version

Mon Jun 4 08:04:02 CEST 2012

From: Liu Yuan <tailai.ly at taobao.com>

The current object_cache_pull() causes the bug below:
...
do_gateway_request(288) 2, 80d6d76e00000000 , 1
Jun 04 10:16:37 connect_to(241) 2126, 10.232.134.3:7000
Jun 04 10:16:37 client_handler(747) closed connection 2116
Jun 04 10:16:37 destroy_client(678) connection from: 127.0.0.1:60214
Jun 04 10:16:37 listen_handler(797) accepted a new connection: 2116
Jun 04 10:16:37 client_rx_handler(586) connection from: 127.0.0.1:60216
Jun 04 10:16:37 queue_request(385) 2
Jun 04 10:16:37 do_gateway_request(288) 2, 80d6d76e00000000 , 1
Jun 04 10:16:37 do_gateway_request(308) failed: 2, 80d6d76e00000000 , 1, 54014b01
...

This is because we use forward_read_obj_req(), which tries to multiplex a socket
FD if concurrent requests access the same object and are unfortunately routed to
the same node.

Object cache is under very high pressure from concurrent requests accessing the
same COW object from cloned VMs, so this problem emerges. It looks to me that,
besides object cache, QEMU requests are also subject to this problem,
because QEMU's sheepdog block layer can issue multiple requests in one go.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/object_cache.c |   85 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 64 insertions(+), 21 deletions(-)

diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 533769c..c70bc31 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -466,15 +466,19 @@ out:
 }
 
 /* Fetch the object, cache it in success */
-static int object_cache_pull(struct vnode_info *vnodes, struct object_cache *oc,
-		      uint32_t idx)
+static int object_cache_pull(struct vnode_info *vnode_info,
+			     struct object_cache *oc,
+			     uint32_t idx)
 {
-	struct request read_req;
-	struct sd_req *hdr = &read_req.rq;
-	int ret = SD_RES_NO_MEM;
+	int i, fd, ret = SD_RES_NO_MEM;
+	unsigned wlen = 0, rlen, data_length, read_len;
 	uint64_t oid;
-	uint32_t data_length;
+	struct sd_req hdr = { 0 };
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+	struct sd_vnode *v;
+	struct sd_vnode *vnodes[SD_MAX_COPIES];
 	void *buf;
+	int nr_copies;
 
 	if (idx & CACHE_VDI_BIT) {
 		oid = vid_to_vdi_oid(oc->vid);
@@ -489,27 +493,66 @@ static int object_cache_pull(struct vnode_info *vnodes, struct object_cache *oc,
 		eprintf("failed to allocate memory\n");
 		goto out;
 	}
-	memset(&read_req, 0, sizeof(read_req));
-	hdr->opcode = SD_OP_READ_OBJ;
-	hdr->data_length = data_length;
-	hdr->epoch = sys_epoch();
 
-	hdr->obj.oid = oid;
-	hdr->obj.offset = 0;
-	hdr->obj.copies = get_nr_copies(vnodes);
+	/* Check if we can read locally */
+	nr_copies = get_nr_copies(vnode_info);
+	oid_to_vnodes(vnode_info, oid, nr_copies, vnodes);
+
+	for (i = 0; i < nr_copies; i++) {
+		v = vnodes[i];
+		if (vnode_is_local(v)) {
+			struct siocb iocb = { 0 };
+			iocb.epoch = sys_epoch();
+			iocb.buf = buf;
+			iocb.length = data_length;
+			iocb.offset = 0;
+			ret = sd_store->read(oid, &iocb);
+			if (ret != SD_RES_SUCCESS)
+				goto pull_remote;
+			/* read succeed */
+			read_len = iocb.length;
+			dprintf("[local] %08"PRIx32"\n", idx);
+			goto out;
+		}
+	}
+
+pull_remote:
+	/* Okay, no luck, let's read remotely */
+	for (i = 0; i < nr_copies; i++) {
+		v = vnodes[i];
 
-	read_req.data = buf;
-	read_req.op = get_sd_op(hdr->opcode);
-	read_req.vnodes = vnodes;
+		if (vnode_is_local(v))
+			continue;
+
+		hdr.opcode = SD_OP_READ_OBJ;
+		hdr.epoch = sys_epoch();
+		hdr.data_length = rlen = data_length;
+		hdr.flags = SD_FLAG_CMD_IO_LOCAL;
+		hdr.obj.oid = oid;
 
-	ret = forward_read_obj_req(&read_req);
+		fd = get_sheep_fd(v->addr, v->port, v->node_idx,
+				  hdr.epoch);
+		if (fd < 0)
+			continue;
 
-	if (ret == SD_RES_SUCCESS) {
-		dprintf("oid %"PRIx64" pulled successfully\n", oid);
-		ret = create_cache_object(oc, idx, buf, data_length);
+		ret = exec_req(fd, &hdr, buf, &wlen, &rlen);
+		if (ret) { /* network errors */
+			del_sheep_fd(fd);
+			ret = SD_RES_NETWORK_ERROR;
+		} else
+			ret = rsp->result;
+		read_len = rlen;
+
+		dprintf("[remote] %08"PRIx32", res:%"PRIx32"\n", idx, ret);
+		if (ret != SD_RES_SUCCESS)
+			continue;
+		else
+			break;
 	}
-	free(buf);
 out:
+	if (ret == SD_RES_SUCCESS)
+		ret = create_cache_object(oc, idx, buf, read_len);
+	free(buf);
 	return ret;
 }
 
-- 
1.7.10.2