[sheepdog] [PATCH v2] optimize repairing replica in dog vdi check command
Ruoyu
liangry at ucweb.com
Fri Jul 25 04:35:19 CEST 2014
On 2014年07月22日 14:12, Hitoshi Mitake wrote:
> At Fri, 18 Jul 2014 15:23:47 +0800,
> Ruoyu wrote:
>> Current repairing replica logic is described as below.
>>
>> 1. Read data stream from majority node to dog process's buffer.
>> 2. Write buffer to destination node to finish repairing.
>>
>> This behavior leads to two side-effects.
>>
>> 1. Data is probably transferred twice.
>> 2. Version of the data to be repaired is out of control. Once
>> recovery is in progress, copying data directly is dangerous.
> Applied, thanks.
>
> But the essential problem (2nd one of the above side effects) isn't
> solved yet. It seems that solving the problem requires a new locking
> mechanism which can be used by clients (QEMU and tgt) and "dog vdi
> check". The new locking mechanism should be able to serialize write
> requests and repair replica requests. Current sheepdog doesn't provide
> it, so checking VDIs which are used by living clients is dangerous...
I agree with you. So, I think a warning message when executing dog vdi
check or dog cluster check is needed.
But it might conflict with the patch "dog vdi check -e". If you
think it cannot be applied in the near future, I will submit a patch
with a warning message only, to remind users that vdi check should be run offline.
>
> Thanks,
> Hitoshi
>
>> The solution is that the dog process sends a new command to ask for repairing.
>> If the destination node receives the command, it tries reading the object
>> as normal, and then writes it to its local storage.
>>
>> Functional test case 077 is passed after patching.
>>
>> Signed-off-by: Ruoyu <liangry at ucweb.com>
>> ---
>> dog/vdi.c | 59 ++++++++++++++++--------------------------------
>> include/internal_proto.h | 1 +
>> include/sheepdog_proto.h | 5 ++++
>> sheep/ops.c | 48 +++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 74 insertions(+), 39 deletions(-)
>>
>> diff --git a/dog/vdi.c b/dog/vdi.c
>> index ec78a2c..9fc1677 100644
>> --- a/dog/vdi.c
>> +++ b/dog/vdi.c
>> @@ -1395,41 +1395,6 @@ out:
>> return ret;
>> }
>>
>> -static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid)
>> -{
>> - struct sd_req hdr;
>> - struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
>> - int ret;
>> - void *buf;
>> - size_t size = get_objsize(oid);
>> -
>> - buf = xmalloc(size);
>> -
>> - sd_init_req(&hdr, SD_OP_READ_PEER);
>> - hdr.epoch = sd_epoch;
>> - hdr.flags = 0;
>> - hdr.data_length = size;
>> -
>> - hdr.obj.oid = oid;
>> -
>> - ret = dog_exec_req(&vnode->node->nid, &hdr, buf);
>> - if (ret < 0)
>> - exit(EXIT_SYSFAIL);
>> -
>> - switch (rsp->result) {
>> - case SD_RES_SUCCESS:
>> - break;
>> - case SD_RES_NO_OBJ:
>> - free(buf);
>> - return NULL;
>> - default:
>> - sd_err("FATAL: failed to read %"PRIx64", %s", oid,
>> - sd_strerror(rsp->result));
>> - exit(EXIT_FAILURE);
>> - }
>> - return buf;
>> -}
>> -
>> static void write_object_to(const struct sd_vnode *vnode, uint64_t oid,
>> void *buf, bool create, uint8_t ec_index)
>> {
>> @@ -1501,11 +1466,27 @@ static void vdi_repair_work(struct work *work)
>> struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
>> work);
>> struct vdi_check_info *info = vcw->info;
>> - void *buf;
>> + const struct sd_vnode *src = info->majority->vnode;
>> + const struct sd_vnode *dst = vcw->vnode;
>> + struct sd_req hdr;
>> + int ret;
>> + char n1[MAX_NODE_STR_LEN], n2[MAX_NODE_STR_LEN];
>>
>> - buf = read_object_from(info->majority->vnode, info->oid);
>> - write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
>> - free(buf);
>> + sd_init_req(&hdr, SD_OP_REPAIR_REPLICA);
>> + hdr.epoch = sd_epoch;
>> + memcpy(hdr.forw.addr, src->node->nid.addr, sizeof(hdr.forw.addr));
>> + hdr.forw.port = src->node->nid.port;
>> + hdr.forw.oid = info->oid;
>> +
>> + ret = send_light_req(&dst->node->nid, &hdr);
>> + if (ret) {
>> + strcpy(n1, addr_to_str(src->node->nid.addr,
>> + src->node->nid.port));
>> + strcpy(n2, addr_to_str(dst->node->nid.addr,
>> + dst->node->nid.port));
>> + sd_err("failed to repair object %016"PRIx64
>> + " from %s to %s", info->oid, n1, n2);
>> + }
>> }
>>
>> static void vdi_repair_main(struct work *work)
>> diff --git a/include/internal_proto.h b/include/internal_proto.h
>> index 7ec2872..2affc42 100644
>> --- a/include/internal_proto.h
>> +++ b/include/internal_proto.h
>> @@ -106,6 +106,7 @@
>> #define SD_OP_DECREF_PEER 0xC2
>> #define SD_OP_PREVENT_INODE_UPDATE 0xC3
>> #define SD_OP_ALLOW_INODE_UPDATE 0xC4
>> +#define SD_OP_REPAIR_REPLICA 0xC5
>>
>> /* internal flags for hdr.flags, must be above 0x80 */
>> #define SD_FLAG_CMD_RECOVERY 0x0080
>> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
>> index 8b5834b..d6a8d35 100644
>> --- a/include/sheepdog_proto.h
>> +++ b/include/sheepdog_proto.h
>> @@ -176,6 +176,11 @@ struct sd_req {
>> uint32_t generation;
>> uint32_t count;
>> } ref;
>> + struct {
>> + uint64_t oid;
>> + uint8_t addr[16];
>> + uint16_t port;
>> + } forw;
>>
>> uint32_t __pad[8];
>> };
>> diff --git a/sheep/ops.c b/sheep/ops.c
>> index fb26077..12f5836 100644
>> --- a/sheep/ops.c
>> +++ b/sheep/ops.c
>> @@ -1238,6 +1238,48 @@ static int local_allow_inode_update(const struct sd_req *req,
>> return SD_RES_SUCCESS;
>> }
>>
>> +static int local_repair_replica(struct request *req)
>> +{
>> + int ret;
>> + struct node_id nid;
>> + struct sd_req hdr;
>> + struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
>> + struct siocb iocb = { 0 };
>> + uint64_t oid = req->rq.forw.oid;
>> + size_t rlen = get_store_objsize(oid);
>> + void *buf = xvalloc(rlen);
>> +
>> + sd_init_req(&hdr, SD_OP_READ_PEER);
>> + hdr.epoch = req->rq.epoch;
>> + hdr.data_length = rlen;
>> + hdr.obj.oid = oid;
>> +
>> + memcpy(nid.addr, req->rq.forw.addr, sizeof(nid.addr));
>> + nid.port = req->rq.forw.port;
>> + ret = sheep_exec_req(&nid, &hdr, buf);
>> + if (ret == SD_RES_SUCCESS) {
>> + sd_debug("read object %016"PRIx64" from %s successfully, "
>> + "try saving to local", oid,
>> + addr_to_str(nid.addr, nid.port));
>> + iocb.epoch = req->rq.epoch;
>> + iocb.length = rsp->data_length;
>> + iocb.offset = rsp->obj.offset;
>> + iocb.buf = buf;
>> + ret = sd_store->create_and_write(oid, &iocb);
>> + if (ret != SD_RES_SUCCESS)
>> + sd_err("failed to write object %016"PRIx64
>> + " to local", oid);
>> + } else {
>> + sd_err("failed to read object %016"PRIx64
>> + " from %s: %s", oid,
>> + addr_to_str(nid.addr, nid.port),
>> + sd_strerror(ret));
>> + }
>> +
>> + free(buf);
>> + return ret;
>> +}
>> +
>> static struct sd_op_template sd_ops[] = {
>>
>> /* cluster operations */
>> @@ -1589,6 +1631,12 @@ static struct sd_op_template sd_ops[] = {
>> .process_main = local_allow_inode_update,
>> },
>>
>> + [SD_OP_REPAIR_REPLICA] = {
>> + .name = "REPAIR_REPLICA",
>> + .type = SD_OP_TYPE_LOCAL,
>> + .process_work = local_repair_replica,
>> + },
>> +
>> /* gateway I/O operations */
>> [SD_OP_CREATE_AND_WRITE_OBJ] = {
>> .name = "CREATE_AND_WRITE_OBJ",
>> --
>> 1.8.3.2
>>
>>
>> --
>> sheepdog mailing list
>> sheepdog at lists.wpkg.org
>> http://lists.wpkg.org/mailman/listinfo/sheepdog
More information about the sheepdog
mailing list