[sheepdog] [PATCH v2] optimize repairing replica in dog vdi check command
Ruoyu
liangry at ucweb.com
Fri Jul 25 04:35:19 CEST 2014
On 2014年07月22日 14:12, Hitoshi Mitake wrote:
> At Fri, 18 Jul 2014 15:23:47 +0800,
> Ruoyu wrote:
>> Current repairing replica logic is described as below.
>>
>> 1. Read data stream from majority node to dog process's buffer.
>> 2. Write buffer to destination node to finish repairing.
>>
>> This behavior leads to two side-effects.
>>
>> 1. Data is probably transferred twice.
>> 2. Version of the data to be repaired is out of control. Once
>> recovery is in progress, copying data directly is dangerous.
> Applied, thanks.
>
> But the essential problem (2nd one of the above side effects) isn't
> solved yet. It seems that solving the problem requires a new locking
> mechanism which can be used by clients (QEMU and tgt) and "dog vdi
> check". The new locking mechanism should be able to serialize write
> requests and repair replica requests. Current sheepdog doesn't provide
> it, so checking VDIs which are used by living clients is dangerous...
I agree with you. So, I think a warning message when executing dog vdi
check or dog cluster check is needed.
But it might conflict with the patch "dog vdi check -e". If you
think it cannot be applied in the near future, I will submit a patch
with a warning message only, to remind users that vdi check should be run offline.
>
> Thanks,
> Hitoshi
>
>> The solution is that the dog process sends a new command to ask for repairing.
>> If the destination node receives the command, it tries reading the object
>> as normal, and then writes it to its local storage.
>>
>> Functional test case 077 is passed after patching.
>>
>> Signed-off-by: Ruoyu <liangry at ucweb.com>
>> ---
>> dog/vdi.c | 59 ++++++++++++++++--------------------------------
>> include/internal_proto.h | 1 +
>> include/sheepdog_proto.h | 5 ++++
>> sheep/ops.c | 48 +++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 74 insertions(+), 39 deletions(-)
>>
>> diff --git a/dog/vdi.c b/dog/vdi.c
>> index ec78a2c..9fc1677 100644
>> --- a/dog/vdi.c
>> +++ b/dog/vdi.c
>> @@ -1395,41 +1395,6 @@ out:
>> return ret;
>> }
>>
>> -static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid)
>> -{
>> - struct sd_req hdr;
>> - struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
>> - int ret;
>> - void *buf;
>> - size_t size = get_objsize(oid);
>> -
>> - buf = xmalloc(size);
>> -
>> - sd_init_req(&hdr, SD_OP_READ_PEER);
>> - hdr.epoch = sd_epoch;
>> - hdr.flags = 0;
>> - hdr.data_length = size;
>> -
>> - hdr.obj.oid = oid;
>> -
>> - ret = dog_exec_req(&vnode->node->nid, &hdr, buf);
>> - if (ret < 0)
>> - exit(EXIT_SYSFAIL);
>> -
>> - switch (rsp->result) {
>> - case SD_RES_SUCCESS:
>> - break;
>> - case SD_RES_NO_OBJ:
>> - free(buf);
>> - return NULL;
>> - default:
>> - sd_err("FATAL: failed to read %"PRIx64", %s", oid,
>> - sd_strerror(rsp->result));
>> - exit(EXIT_FAILURE);
>> - }
>> - return buf;
>> -}
>> -
>> static void write_object_to(const struct sd_vnode *vnode, uint64_t oid,
>> void *buf, bool create, uint8_t ec_index)
>> {
>> @@ -1501,11 +1466,27 @@ static void vdi_repair_work(struct work *work)
>> struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
>> work);
>> struct vdi_check_info *info = vcw->info;
>> - void *buf;
>> + const struct sd_vnode *src = info->majority->vnode;
>> + const struct sd_vnode *dst = vcw->vnode;
>> + struct sd_req hdr;
>> + int ret;
>> + char n1[MAX_NODE_STR_LEN], n2[MAX_NODE_STR_LEN];
>>
>> - buf = read_object_from(info->majority->vnode, info->oid);
>> - write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
>> - free(buf);
>> + sd_init_req(&hdr, SD_OP_REPAIR_REPLICA);
>> + hdr.epoch = sd_epoch;
>> + memcpy(hdr.forw.addr, src->node->nid.addr, sizeof(hdr.forw.addr));
>> + hdr.forw.port = src->node->nid.port;
>> + hdr.forw.oid = info->oid;
>> +
>> + ret = send_light_req(&dst->node->nid, &hdr);
>> + if (ret) {
>> + strcpy(n1, addr_to_str(src->node->nid.addr,
>> + src->node->nid.port));
>> + strcpy(n2, addr_to_str(dst->node->nid.addr,
>> + dst->node->nid.port));
>> + sd_err("failed to repair object %016"PRIx64
>> + " from %s to %s", info->oid, n1, n2);
>> + }
>> }
>>
>> static void vdi_repair_main(struct work *work)
>> diff --git a/include/internal_proto.h b/include/internal_proto.h
>> index 7ec2872..2affc42 100644
>> --- a/include/internal_proto.h
>> +++ b/include/internal_proto.h
>> @@ -106,6 +106,7 @@
>> #define SD_OP_DECREF_PEER 0xC2
>> #define SD_OP_PREVENT_INODE_UPDATE 0xC3
>> #define SD_OP_ALLOW_INODE_UPDATE 0xC4
>> +#define SD_OP_REPAIR_REPLICA 0xC5
>>
>> /* internal flags for hdr.flags, must be above 0x80 */
>> #define SD_FLAG_CMD_RECOVERY 0x0080
>> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
>> index 8b5834b..d6a8d35 100644
>> --- a/include/sheepdog_proto.h
>> +++ b/include/sheepdog_proto.h
>> @@ -176,6 +176,11 @@ struct sd_req {
>> uint32_t generation;
>> uint32_t count;
>> } ref;
>> + struct {
>> + uint64_t oid;
>> + uint8_t addr[16];
>> + uint16_t port;
>> + } forw;
>>
>> uint32_t __pad[8];
>> };
>> diff --git a/sheep/ops.c b/sheep/ops.c
>> index fb26077..12f5836 100644
>> --- a/sheep/ops.c
>> +++ b/sheep/ops.c
>> @@ -1238,6 +1238,48 @@ static int local_allow_inode_update(const struct sd_req *req,
>> return SD_RES_SUCCESS;
>> }
>>
>> +static int local_repair_replica(struct request *req)
>> +{
>> + int ret;
>> + struct node_id nid;
>> + struct sd_req hdr;
>> + struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
>> + struct siocb iocb = { 0 };
>> + uint64_t oid = req->rq.forw.oid;
>> + size_t rlen = get_store_objsize(oid);
>> + void *buf = xvalloc(rlen);
>> +
>> + sd_init_req(&hdr, SD_OP_READ_PEER);
>> + hdr.epoch = req->rq.epoch;
>> + hdr.data_length = rlen;
>> + hdr.obj.oid = oid;
>> +
>> + memcpy(nid.addr, req->rq.forw.addr, sizeof(nid.addr));
>> + nid.port = req->rq.forw.port;
>> + ret = sheep_exec_req(&nid, &hdr, buf);
>> + if (ret == SD_RES_SUCCESS) {
>> + sd_debug("read object %016"PRIx64" from %s successfully, "
>> + "try saving to local", oid,
>> + addr_to_str(nid.addr, nid.port));
>> + iocb.epoch = req->rq.epoch;
>> + iocb.length = rsp->data_length;
>> + iocb.offset = rsp->obj.offset;
>> + iocb.buf = buf;
>> + ret = sd_store->create_and_write(oid, &iocb);
>> + if (ret != SD_RES_SUCCESS)
>> + sd_err("failed to write object %016"PRIx64
>> + " to local", oid);
>> + } else {
>> + sd_err("failed to read object %016"PRIx64
>> + " from %s: %s", oid,
>> + addr_to_str(nid.addr, nid.port),
>> + sd_strerror(ret));
>> + }
>> +
>> + free(buf);
>> + return ret;
>> +}
>> +
>> static struct sd_op_template sd_ops[] = {
>>
>> /* cluster operations */
>> @@ -1589,6 +1631,12 @@ static struct sd_op_template sd_ops[] = {
>> .process_main = local_allow_inode_update,
>> },
>>
>> + [SD_OP_REPAIR_REPLICA] = {
>> + .name = "REPAIR_REPLICA",
>> + .type = SD_OP_TYPE_LOCAL,
>> + .process_work = local_repair_replica,
>> + },
>> +
>> /* gateway I/O operations */
>> [SD_OP_CREATE_AND_WRITE_OBJ] = {
>> .name = "CREATE_AND_WRITE_OBJ",
>> --
>> 1.8.3.2
>>
>>
>> --
>> sheepdog mailing list
>> sheepdog at lists.wpkg.org
>> http://lists.wpkg.org/mailman/listinfo/sheepdog
More information about the sheepdog
mailing list