[sheepdog] [PATCH v2] optimize repairing replica in dog vdi check command
Hitoshi Mitake
mitake.hitoshi at lab.ntt.co.jp
Tue Jul 22 08:12:07 CEST 2014
At Fri, 18 Jul 2014 15:23:47 +0800,
Ruoyu wrote:
>
> The current replica repair logic is described below.
>
> 1. Read data stream from majority node to dog process's buffer.
> 2. Write the buffer to the destination node to finish repairing.
>
> This behavior leads to two side-effects.
>
> 1. Data is probably transferred twice.
> 2. The version of the data to be repaired is out of control. Once
> recovery is in progress, copying data directly is dangerous.
Applied, thanks.
But the essential problem (2nd one of the above side effects) isn't
solved yet. It seems that solving the problem requires a new locking
mechanism which can be used by clients (QEMU and tgt) and "dog vdi
check". The new locking mechanism should be able to serialize write
requests and repair replica requests. Current sheepdog doesn't provide
it, so checking VDIs which are used by living clients is dangerous...
Thanks,
Hitoshi
>
> The solution is for the dog process to send a new command that asks for the
> repair. When the destination node receives the command, it tries reading the
> object as normal and then writes it to its local storage.
>
> Functional test case 077 passes after patching.
>
> Signed-off-by: Ruoyu <liangry at ucweb.com>
> ---
> dog/vdi.c | 59 ++++++++++++++++--------------------------------
> include/internal_proto.h | 1 +
> include/sheepdog_proto.h | 5 ++++
> sheep/ops.c | 48 +++++++++++++++++++++++++++++++++++++++
> 4 files changed, 74 insertions(+), 39 deletions(-)
>
> diff --git a/dog/vdi.c b/dog/vdi.c
> index ec78a2c..9fc1677 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -1395,41 +1395,6 @@ out:
> return ret;
> }
>
> -static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid)
> -{
> - struct sd_req hdr;
> - struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> - int ret;
> - void *buf;
> - size_t size = get_objsize(oid);
> -
> - buf = xmalloc(size);
> -
> - sd_init_req(&hdr, SD_OP_READ_PEER);
> - hdr.epoch = sd_epoch;
> - hdr.flags = 0;
> - hdr.data_length = size;
> -
> - hdr.obj.oid = oid;
> -
> - ret = dog_exec_req(&vnode->node->nid, &hdr, buf);
> - if (ret < 0)
> - exit(EXIT_SYSFAIL);
> -
> - switch (rsp->result) {
> - case SD_RES_SUCCESS:
> - break;
> - case SD_RES_NO_OBJ:
> - free(buf);
> - return NULL;
> - default:
> - sd_err("FATAL: failed to read %"PRIx64", %s", oid,
> - sd_strerror(rsp->result));
> - exit(EXIT_FAILURE);
> - }
> - return buf;
> -}
> -
> static void write_object_to(const struct sd_vnode *vnode, uint64_t oid,
> void *buf, bool create, uint8_t ec_index)
> {
> @@ -1501,11 +1466,27 @@ static void vdi_repair_work(struct work *work)
> struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
> work);
> struct vdi_check_info *info = vcw->info;
> - void *buf;
> + const struct sd_vnode *src = info->majority->vnode;
> + const struct sd_vnode *dst = vcw->vnode;
> + struct sd_req hdr;
> + int ret;
> + char n1[MAX_NODE_STR_LEN], n2[MAX_NODE_STR_LEN];
>
> - buf = read_object_from(info->majority->vnode, info->oid);
> - write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
> - free(buf);
> + sd_init_req(&hdr, SD_OP_REPAIR_REPLICA);
> + hdr.epoch = sd_epoch;
> + memcpy(hdr.forw.addr, src->node->nid.addr, sizeof(hdr.forw.addr));
> + hdr.forw.port = src->node->nid.port;
> + hdr.forw.oid = info->oid;
> +
> + ret = send_light_req(&dst->node->nid, &hdr);
> + if (ret) {
> + strcpy(n1, addr_to_str(src->node->nid.addr,
> + src->node->nid.port));
> + strcpy(n2, addr_to_str(dst->node->nid.addr,
> + dst->node->nid.port));
> + sd_err("failed to repair object %016"PRIx64
> + " from %s to %s", info->oid, n1, n2);
> + }
> }
>
> static void vdi_repair_main(struct work *work)
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index 7ec2872..2affc42 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -106,6 +106,7 @@
> #define SD_OP_DECREF_PEER 0xC2
> #define SD_OP_PREVENT_INODE_UPDATE 0xC3
> #define SD_OP_ALLOW_INODE_UPDATE 0xC4
> +#define SD_OP_REPAIR_REPLICA 0xC5
>
> /* internal flags for hdr.flags, must be above 0x80 */
> #define SD_FLAG_CMD_RECOVERY 0x0080
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 8b5834b..d6a8d35 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -176,6 +176,11 @@ struct sd_req {
> uint32_t generation;
> uint32_t count;
> } ref;
> + struct {
> + uint64_t oid;
> + uint8_t addr[16];
> + uint16_t port;
> + } forw;
>
> uint32_t __pad[8];
> };
> diff --git a/sheep/ops.c b/sheep/ops.c
> index fb26077..12f5836 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -1238,6 +1238,48 @@ static int local_allow_inode_update(const struct sd_req *req,
> return SD_RES_SUCCESS;
> }
>
> +static int local_repair_replica(struct request *req)
> +{
> + int ret;
> + struct node_id nid;
> + struct sd_req hdr;
> + struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> + struct siocb iocb = { 0 };
> + uint64_t oid = req->rq.forw.oid;
> + size_t rlen = get_store_objsize(oid);
> + void *buf = xvalloc(rlen);
> +
> + sd_init_req(&hdr, SD_OP_READ_PEER);
> + hdr.epoch = req->rq.epoch;
> + hdr.data_length = rlen;
> + hdr.obj.oid = oid;
> +
> + memcpy(nid.addr, req->rq.forw.addr, sizeof(nid.addr));
> + nid.port = req->rq.forw.port;
> + ret = sheep_exec_req(&nid, &hdr, buf);
> + if (ret == SD_RES_SUCCESS) {
> + sd_debug("read object %016"PRIx64" from %s successfully, "
> + "try saving to local", oid,
> + addr_to_str(nid.addr, nid.port));
> + iocb.epoch = req->rq.epoch;
> + iocb.length = rsp->data_length;
> + iocb.offset = rsp->obj.offset;
> + iocb.buf = buf;
> + ret = sd_store->create_and_write(oid, &iocb);
> + if (ret != SD_RES_SUCCESS)
> + sd_err("failed to write object %016"PRIx64
> + " to local", oid);
> + } else {
> + sd_err("failed to read object %016"PRIx64
> + " from %s: %s", oid,
> + addr_to_str(nid.addr, nid.port),
> + sd_strerror(ret));
> + }
> +
> + free(buf);
> + return ret;
> +}
> +
> static struct sd_op_template sd_ops[] = {
>
> /* cluster operations */
> @@ -1589,6 +1631,12 @@ static struct sd_op_template sd_ops[] = {
> .process_main = local_allow_inode_update,
> },
>
> + [SD_OP_REPAIR_REPLICA] = {
> + .name = "REPAIR_REPLICA",
> + .type = SD_OP_TYPE_LOCAL,
> + .process_work = local_repair_replica,
> + },
> +
> /* gateway I/O operations */
> [SD_OP_CREATE_AND_WRITE_OBJ] = {
> .name = "CREATE_AND_WRITE_OBJ",
> --
> 1.8.3.2
>
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
More information about the sheepdog
mailing list