From: Liu Yuan <tailai.ly at taobao.com> The current object_cache_pull() causes the bug below: ... do_gateway_request(288) 2, 80d6d76e00000000 , 1 Jun 04 10:16:37 connect_to(241) 2126, 10.232.134.3:7000 Jun 04 10:16:37 client_handler(747) closed connection 2116 Jun 04 10:16:37 destroy_client(678) connection from: 127.0.0.1:60214 Jun 04 10:16:37 listen_handler(797) accepted a new connection: 2116 Jun 04 10:16:37 client_rx_handler(586) connection from: 127.0.0.1:60216 Jun 04 10:16:37 queue_request(385) 2 Jun 04 10:16:37 do_gateway_request(288) 2, 80d6d76e00000000 , 1 Jun 04 10:16:37 do_gateway_request(308) failed: 2, 80d6d76e00000000 , 1, 54014b01 ... This is because we use forward_read_obj_req(), which tries to multiplex a socket FD if concurrent requests access the same object and are unfortunately routed to the same node. The object cache is under very high pressure from concurrent requests accessing the same COW object from cloned VMs, so this problem emerges. It looks to me that, besides the object cache, QEMU requests are also subject to this problem, because QEMU's sheepdog block layer can issue multiple requests in one go. 
Signed-off-by: Liu Yuan <tailai.ly at taobao.com> --- sheep/object_cache.c | 85 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 21 deletions(-) diff --git a/sheep/object_cache.c b/sheep/object_cache.c index 533769c..c70bc31 100644 --- a/sheep/object_cache.c +++ b/sheep/object_cache.c @@ -466,15 +466,19 @@ out: } /* Fetch the object, cache it in success */ -static int object_cache_pull(struct vnode_info *vnodes, struct object_cache *oc, - uint32_t idx) +static int object_cache_pull(struct vnode_info *vnode_info, + struct object_cache *oc, + uint32_t idx) { - struct request read_req; - struct sd_req *hdr = &read_req.rq; - int ret = SD_RES_NO_MEM; + int i, fd, ret = SD_RES_NO_MEM; + unsigned wlen = 0, rlen, data_length, read_len; uint64_t oid; - uint32_t data_length; + struct sd_req hdr = { 0 }; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + struct sd_vnode *v; + struct sd_vnode *vnodes[SD_MAX_COPIES]; void *buf; + int nr_copies; if (idx & CACHE_VDI_BIT) { oid = vid_to_vdi_oid(oc->vid); @@ -489,27 +493,66 @@ static int object_cache_pull(struct vnode_info *vnodes, struct object_cache *oc, eprintf("failed to allocate memory\n"); goto out; } - memset(&read_req, 0, sizeof(read_req)); - hdr->opcode = SD_OP_READ_OBJ; - hdr->data_length = data_length; - hdr->epoch = sys_epoch(); - hdr->obj.oid = oid; - hdr->obj.offset = 0; - hdr->obj.copies = get_nr_copies(vnodes); + /* Check if we can read locally */ + nr_copies = get_nr_copies(vnode_info); + oid_to_vnodes(vnode_info, oid, nr_copies, vnodes); + + for (i = 0; i < nr_copies; i++) { + v = vnodes[i]; + if (vnode_is_local(v)) { + struct siocb iocb = { 0 }; + iocb.epoch = sys_epoch(); + iocb.buf = buf; + iocb.length = data_length; + iocb.offset = 0; + ret = sd_store->read(oid, &iocb); + if (ret != SD_RES_SUCCESS) + goto pull_remote; + /* read succeed */ + read_len = iocb.length; + dprintf("[local] %08"PRIx32"\n", idx); + goto out; + } + } + +pull_remote: + /* Okay, no luck, let's read 
remotely */ + for (i = 0; i < nr_copies; i++) { + v = vnodes[i]; - read_req.data = buf; - read_req.op = get_sd_op(hdr->opcode); - read_req.vnodes = vnodes; + if (vnode_is_local(v)) + continue; + + hdr.opcode = SD_OP_READ_OBJ; + hdr.epoch = sys_epoch(); + hdr.data_length = rlen = data_length; + hdr.flags = SD_FLAG_CMD_IO_LOCAL; + hdr.obj.oid = oid; - ret = forward_read_obj_req(&read_req); + fd = get_sheep_fd(v->addr, v->port, v->node_idx, + hdr.epoch); + if (fd < 0) + continue; - if (ret == SD_RES_SUCCESS) { - dprintf("oid %"PRIx64" pulled successfully\n", oid); - ret = create_cache_object(oc, idx, buf, data_length); + ret = exec_req(fd, &hdr, buf, &wlen, &rlen); + if (ret) { /* network errors */ + del_sheep_fd(fd); + ret = SD_RES_NETWORK_ERROR; + } else + ret = rsp->result; + read_len = rlen; + + dprintf("[remote] %08"PRIx32", res:%"PRIx32"\n", idx, ret); + if (ret != SD_RES_SUCCESS) + continue; + else + break; } - free(buf); out: + if (ret == SD_RES_SUCCESS) + ret = create_cache_object(oc, idx, buf, read_len); + free(buf); return ret; } -- 1.7.10.2 |