[sheepdog] [PATCH v1, RFC] sheep: writeback cache semantics in backend store

Wed Aug 29 05:36:35 CEST 2012

On 08/29/2012 07:21 AM, MORITA Kazutaka wrote:
> At Wed, 29 Aug 2012 01:56:10 +0900,
> Hitoshi Mitake wrote:
>>
>> v1: differences from v0 are,
>> * check syncfs() in configure script
>> * send SD_OP_FLUSH_PEER to all sheeps
> 
> Please add a scissors line so that we can remove the change log from
> the commit log easily.
> 
>>  
>> +static int flush_all_node(struct request *req)
>> +{
>> +	int i, ret, err_ret, epfd, waiting, cnt;
>> +	struct sd_node *s;
>> +	struct node_id *node_sent[SD_MAX_NODES];
>> +	struct sockfd *sfd, *sfd_sent[SD_MAX_NODES];
>> +	struct sd_req hdr;
>> +	struct vnode_info *vinfo = req->vinfo;
>> +	struct epoll_event ev;
>> +
>> +	err_ret = SD_RES_SUCCESS;
>> +
>> +	epfd = epoll_create(SD_MAX_NODES);
>> +	if (epfd == -1) {
>> +		eprintf("failed to create epoll file descriptor");
>> +		return SD_RES_EIO;
>> +	}
>> +
>> +	sd_init_req(&hdr, SD_OP_FLUSH_PEER);
>> +
>> +	bzero(&ev, sizeof(struct epoll_event));
>> +	ev.events = EPOLLIN;
>> +
>> +	for (waiting = 0, i = 0; i < vinfo->nr_nodes; i++) {
>> +		unsigned int wlen = 0;
>> +
>> +		s = &vinfo->nodes[i];
>> +
>> +		if (node_is_local(s)) {
>> +			_peer_flush();
>> +			continue;
>> +		}
>> +
>> +		sfd = sheep_get_sockfd(&s->nid);
>> +		if (!sfd) {
>> +			err_ret = SD_RES_NETWORK_ERROR;
>> +			goto put_sockfd;
>> +		}
>> +
>> +		node_sent[waiting] = &s->nid;
>> +		sfd_sent[waiting] = sfd;
>> +
>> +		ret = send_req(sfd->fd, &hdr, NULL, &wlen);
>> +		if (ret) {
>> +			eprintf("failed at send_req()");
>> +			sheep_del_sockfd(&s->nid, sfd);
>> +			err_ret = SD_RES_NETWORK_ERROR;
>> +			goto put_sockfd;
>> +		}
>> +
>> +		ev.data.fd = sfd->fd;
>> +		if (epoll_ctl(epfd, EPOLL_CTL_ADD, sfd->fd, &ev) == -1) {
>> +			eprintf("failed at epoll_ctl(), errno: %s", strerror(errno));
>> +			err_ret = SD_RES_EIO;
>> +			goto put_sockfd;
>> +		}
>> +
>> +		waiting++;
>> +	}
>> +
>> +	cnt = waiting;
>> +	while (cnt) {
>> +		struct epoll_event ev_nodes[SD_MAX_NODES];
>> +
>> +		bzero(ev_nodes, sizeof(struct epoll_event) * cnt);
>> +
>> +		ret = epoll_wait(epfd, ev_nodes, cnt, -1);
>> +		if (ret == -1) {
>> +			eprintf("failed at epoll_wait(), errno: %s", strerror(errno));
>> +			err_ret = SD_RES_EIO;
>> +			break;
>> +		}
>> +
>> +		cnt -= ret;
>> +
>> +		for (i = 0; i < ret; i++) {
>> +			struct sd_rsp rsp;
>> +
>> +			if (do_read(ev_nodes[i].data.fd, &rsp, sizeof(struct sd_rsp))) {
>> +				eprintf("failed to receive response from node");
>> +				err_ret = SD_RES_NETWORK_ERROR;
>> +				goto put_sockfd;
>> +			}
>> +		}
>> +	}
>> +
>> +put_sockfd:
>> +	for (i = 0; i < waiting; i++)
>> +		sheep_put_sockfd(node_sent[i], sfd_sent[i]);
>> +
>> +	close(epfd);
>> +
>> +	return err_ret;
>> +}
>> +
> 
> This code looks like reimplementation of gateway_forward_request.  How
> about making SD_OP_FLUSH_VDI a gateway operation, and modifying
> gateway_forward_request so that it can forward request to all nodes?
> 

I think SD_OP_FLUSH_VDI should be local as is. We'd better add another
SD_OP_SYNC_VDI for sync vdi operation (for now, we broadcast it to all
nodes), then we can exec_local_req(SD_OP_SYNC_VDI) to queue request on
the gateway to overcome node change events.

> 
>> @@ -904,6 +1017,31 @@ out:
>>  	return ret;
>>  }
>>  
>> +int _peer_flush(void)
>> +{
>> +	int fd;
>> +
>> +	fd = open(obj_path, O_RDONLY);
>> +	if (fd < 0) {
>> +		eprintf("error at open() %s, %s\n", obj_path, strerror(errno));
>> +		return SD_RES_NO_OBJ;
>> +	}
>> +
>> +	if (syncfs(fd)) {
>> +		eprintf("error at syncfs(), %s\n", strerror(errno));
>> +		return SD_RES_EIO;
>> +	}
> 
> Add a new interface sync to a storage driver and call it here because
> how to flush data is up to the underlying strage driver.
> 
> 
>> diff --git a/sheep/sheep.c b/sheep/sheep.c
>> index 10c0501..77e8d7c 100644
>> --- a/sheep/sheep.c
>> +++ b/sheep/sheep.c
>> @@ -52,10 +52,11 @@ static struct option const long_options[] = {
>>  	{"enable-cache", required_argument, NULL, 'w'},
>>  	{"zone", required_argument, NULL, 'z'},
>>  	{"pidfile", required_argument, NULL, 'P'},
>> +	{"writeback", no_argument, NULL, 'W'},
>>  	{NULL, 0, NULL, 0},
>>  };
> 
> Adding a 'writeback' option is a bit confusing.  Should we extend a
> '-w' option so that we can specify types of caches we want to enable?
> 

Basically this is a sync operation, as we don't want to confuse users by
yet another writeback semantic, I'd suggest term it as sync operation.
Then how about -w {wrtiethrough,writeback},{nosync}? First two used for
object cache, and nosync used for backend. nosync means don't use D_SYNC
flag for backend write. For e.g
 -w writeback,nosync means use object cache and no D_SYNC flag for backend
 -w nosync means no D_SYNC flag for backend
 -w writethough means use object cache but still use D_SYNC flag for
backend write

Thanks,
Yuan