[sheepdog] [PATCH 3/3] collie: retry read for eagain

Liu Yuan namei.unix at gmail.com
Thu Jan 17 16:30:30 CET 2013


From: Liu Yuan <tailai.ly at taobao.com>

Collie should retry the read on EAGAIN to fix the false-timeout case.

This fixes the 035 failure on the collie side.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 collie/cluster.c |   10 +++++-----
 collie/collie.c  |    2 +-
 collie/collie.h  |    1 +
 collie/common.c  |   13 +++++++++----
 collie/debug.c   |    2 +-
 collie/node.c    |    2 +-
 collie/vdi.c     |   16 ++++++++--------
 include/net.h    |    1 +
 lib/net.c        |   23 ++++++++++++++++++-----
 9 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/collie/cluster.c b/collie/cluster.c
index 917e190..10ced3b 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -53,7 +53,7 @@ static int list_store(void)
 	sd_init_req(&hdr, SD_OP_GET_STORE_LIST);
 	hdr.data_length = 512;
 
-	ret = exec_req(fd, &hdr, buf);
+	ret = collie_exec_req(fd, &hdr, buf);
 	close(fd);
 
 	if (ret) {
@@ -98,7 +98,7 @@ static int cluster_format(int argc, char **argv)
 	sd_init_req((struct sd_req *)&hdr, SD_OP_READ_VDIS);
 	hdr.data_length = sizeof(vdi_inuse);
 
-	ret = exec_req(fd, (struct sd_req *)&hdr, &vdi_inuse);
+	ret = collie_exec_req(fd, (struct sd_req *)&hdr, &vdi_inuse);
 	if (ret < 0) {
 		fprintf(stderr, "Failed to read VDIs from %s:%d\n",
 			sdhost, sdport);
@@ -144,7 +144,7 @@ static int cluster_format(int argc, char **argv)
 	hdr.flags |= SD_FLAG_CMD_WRITE;
 
 	printf("using backend %s store\n", store_name);
-	ret = exec_req(fd, (struct sd_req *)&hdr, store_name);
+	ret = collie_exec_req(fd, (struct sd_req *)&hdr, store_name);
 	close(fd);
 
 	if (ret) {
@@ -194,7 +194,7 @@ again:
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
 
-	ret = exec_req(fd, &hdr, logs);
+	ret = collie_exec_req(fd, &hdr, logs);
 	close(fd);
 
 	if (ret != 0)
@@ -311,7 +311,7 @@ static int list_snap(void)
 	sd_init_req(&hdr, SD_OP_GET_SNAP_FILE);
 	hdr.data_length = SD_DATA_OBJ_SIZE;
 
-	ret = exec_req(fd, &hdr, buf);
+	ret = collie_exec_req(fd, &hdr, buf);
 	close(fd);
 
 	if (ret) {
diff --git a/collie/collie.c b/collie/collie.c
index e739662..a533141 100644
--- a/collie/collie.c
+++ b/collie/collie.c
@@ -68,7 +68,7 @@ static int update_node_list(int max_nodes, uint32_t epoch)
 
 	hdr.data_length = size;
 
-	ret = exec_req(fd, (struct sd_req *)&hdr, buf);
+	ret = collie_exec_req(fd, (struct sd_req *)&hdr, buf);
 	if (ret) {
 		ret = -1;
 		goto out;
diff --git a/collie/collie.h b/collie/collie.h
index 1b00e68..7e9b6be 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -73,6 +73,7 @@ int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data,
 		    int copies, bool create, bool direct);
 int send_light_req(struct sd_req *hdr, const char *host, int port);
 int send_light_req_get_response(struct sd_req *hdr, const char *host, int port);
+int collie_exec_req(int sockfd, struct sd_req *hdr, void *data);
 
 extern struct command vdi_command;
 extern struct command node_command;
diff --git a/collie/common.c b/collie/common.c
index 5343ffa..e709964 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -63,7 +63,7 @@ int sd_read_object(uint64_t oid, void *data, unsigned int datalen,
 	if (direct)
 		hdr.flags |= SD_FLAG_CMD_DIRECT;
 
-	ret = exec_req(fd, &hdr, data);
+	ret = collie_exec_req(fd, &hdr, data);
 	close(fd);
 
 	if (ret) {
@@ -113,7 +113,7 @@ int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data,
 	hdr.obj.cow_oid = cow_oid;
 	hdr.obj.offset = offset;
 
-	ret = exec_req(fd, &hdr, data);
+	ret = collie_exec_req(fd, &hdr, data);
 	close(fd);
 
 	if (ret) {
@@ -148,7 +148,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data)
 	sd_init_req(&req, SD_OP_READ_VDIS);
 	req.data_length = sizeof(vdi_inuse);
 
-	ret = exec_req(fd, &req, &vdi_inuse);
+	ret = collie_exec_req(fd, &req, &vdi_inuse);
 	if (ret < 0) {
 		fprintf(stderr, "Failed to read VDIs from %s:%d\n",
 			sdhost, sdport);
@@ -206,7 +206,7 @@ int send_light_req_get_response(struct sd_req *hdr, const char *host, int port)
 	if (fd < 0)
 		return -1;
 
-	ret = exec_req(fd, hdr, NULL);
+	ret = collie_exec_req(fd, hdr, NULL);
 	close(fd);
 	if (ret) {
 		fprintf(stderr, "failed to connect to  %s:%d\n",
@@ -238,3 +238,8 @@ int send_light_req(struct sd_req *hdr, const char *host, int port)
 
 	return 0;
 }
+
+int collie_exec_req(int sockfd, struct sd_req *hdr, void *data)
+{
+	return net_exec_req(sockfd, hdr, data, true);
+}
diff --git a/collie/debug.c b/collie/debug.c
index 11d5934..708ba25 100644
--- a/collie/debug.c
+++ b/collie/debug.c
@@ -94,7 +94,7 @@ read_buffer:
 	sd_init_req(&hdr, SD_OP_TRACE_READ_BUF);
 	hdr.data_length = TRACE_BUF_LEN;
 
-	ret = exec_req(fd, &hdr, buf);
+	ret = collie_exec_req(fd, &hdr, buf);
 
 	if (ret) {
 		fprintf(stderr, "Failed to connect\n");
diff --git a/collie/node.c b/collie/node.c
index f79927f..a87d095 100644
--- a/collie/node.c
+++ b/collie/node.c
@@ -161,7 +161,7 @@ static int node_cache(int argc, char **argv)
 	hdr.flags = SD_FLAG_CMD_WRITE;
 	hdr.data_length = sizeof(cache_size);
 
-	ret = exec_req(fd, &hdr, (void *)&cache_size);
+	ret = collie_exec_req(fd, &hdr, (void *)&cache_size);
 	close(fd);
 
 	if (ret) {
diff --git a/collie/vdi.c b/collie/vdi.c
index 6ed7162..2547bd7 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -319,7 +319,7 @@ static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigne
 
 		hdr.obj.oid = oid;
 
-		ret = exec_req(fd, &hdr, buf);
+		ret = collie_exec_req(fd, &hdr, buf);
 		close(fd);
 
 		sprintf(name + strlen(name), ":%d", sd_nodes[i].nid.port);
@@ -410,7 +410,7 @@ static int find_vdi_name(const char *vdiname, uint32_t snapid, const char *tag,
 	hdr.flags = SD_FLAG_CMD_WRITE;
 	hdr.vdi.snapid = snapid;
 
-	ret = exec_req(fd, &hdr, buf);
+	ret = collie_exec_req(fd, &hdr, buf);
 	if (ret) {
 		ret = -1;
 		goto out;
@@ -490,7 +490,7 @@ static int do_vdi_create(const char *vdiname, int64_t vdi_size,
 	hdr.vdi.vdi_size = roundup(vdi_size, 512);
 	hdr.vdi.copies = nr_copies;
 
-	ret = exec_req(fd, &hdr, buf);
+	ret = collie_exec_req(fd, &hdr, buf);
 
 	close(fd);
 
@@ -751,7 +751,7 @@ static int do_vdi_delete(const char *vdiname, int snap_id, const char *snap_tag)
 	if (snap_tag)
 		pstrcpy(data + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
 
-	ret = exec_req(fd, &hdr, data);
+	ret = collie_exec_req(fd, &hdr, data);
 	close(fd);
 
 	if (ret) {
@@ -884,7 +884,7 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
 
-	ret = exec_req(fd, &hdr, logs);
+	ret = collie_exec_req(fd, &hdr, logs);
 	close(fd);
 
 	if (ret != 0)
@@ -1030,7 +1030,7 @@ static int find_vdi_attr_oid(const char *vdiname, const char *tag, uint32_t snap
 	if (delete)
 		hdr.flags |= SD_FLAG_CMD_DEL;
 
-	ret = exec_req(fd, &hdr, &vattr);
+	ret = collie_exec_req(fd, &hdr, &vattr);
 	if (ret) {
 		ret = SD_RES_EIO;
 		goto out;
@@ -1391,7 +1391,7 @@ static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid)
 
 	hdr.obj.oid = oid;
 
-	ret = exec_req(fd, &hdr, buf);
+	ret = collie_exec_req(fd, &hdr, buf);
 	close(fd);
 
 	if (ret) {
@@ -1440,7 +1440,7 @@ static void write_object_to(const struct sd_vnode *vnode, uint64_t oid,
 	hdr.data_length = SD_DATA_OBJ_SIZE;
 	hdr.obj.oid = oid;
 
-	ret = exec_req(fd, &hdr, buf);
+	ret = collie_exec_req(fd, &hdr, buf);
 	close(fd);
 
 	if (ret) {
diff --git a/include/net.h b/include/net.h
index 787ee79..97e93d0 100644
--- a/include/net.h
+++ b/include/net.h
@@ -52,6 +52,7 @@ int tx(struct connection *conn, enum conn_state next_state);
 int connect_to(const char *name, int port);
 int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen);
 int exec_req(int sockfd, struct sd_req *hdr, void *data);
+int net_exec_req(int sockfd, struct sd_req *hdr, void *data, bool retry_eagain);
 int create_listen_ports(const char *bindaddr, int port,
 			int (*callback)(int fd, void *), void *data);
 int create_unix_domain_socket(const char *unix_path,
diff --git a/lib/net.c b/lib/net.c
index 365c1e1..76b6eab 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -268,7 +268,7 @@ success:
 	return fd;
 }
 
-int do_read(int sockfd, void *buf, int len)
+static int net_do_read(int sockfd, void *buf, int len, bool retry_eagain)
 {
 	int ret;
 reread:
@@ -276,7 +276,10 @@ reread:
 	if (ret < 0 || !ret) {
 		if (errno == EINTR)
 			goto reread;
-		eprintf("failed to read from socket: %d\n", ret);
+		if (retry_eagain && errno == EAGAIN)
+			goto reread;
+
+		eprintf("failed to read from socket: %d, %d(%m)\n", ret, errno);
 		return 1;
 	}
 
@@ -288,6 +291,11 @@ reread:
 	return 0;
 }
 
+int do_read(int sockfd, void *buf, int len)
+{
+	return net_do_read(sockfd, buf, len, false);
+}
+
 static void forward_iov(struct msghdr *msg, int len)
 {
 	while (msg->msg_iov->iov_len <= len) {
@@ -352,7 +360,7 @@ int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen)
 	return ret;
 }
 
-int exec_req(int sockfd, struct sd_req *hdr, void *data)
+int net_exec_req(int sockfd, struct sd_req *hdr, void *data, bool retry_eagain)
 {
 	int ret;
 	struct sd_rsp *rsp = (struct sd_rsp *)hdr;
@@ -369,7 +377,7 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data)
 	if (send_req(sockfd, hdr, data, wlen))
 		return 1;
 
-	ret = do_read(sockfd, rsp, sizeof(*rsp));
+	ret = net_do_read(sockfd, rsp, sizeof(*rsp), retry_eagain);
 	if (ret) {
 		eprintf("failed to read a response\n");
 		return 1;
@@ -379,7 +387,7 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data)
 		rlen = rsp->data_length;
 
 	if (rlen) {
-		ret = do_read(sockfd, data, rlen);
+		ret = net_do_read(sockfd, data, rlen, retry_eagain);
 		if (ret) {
 			eprintf("failed to read the response data\n");
 			return 1;
@@ -389,6 +397,11 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data)
 	return 0;
 }
 
+int exec_req(int sockfd, struct sd_req *hdr, void *data)
+{
+	return net_exec_req(sockfd, hdr, data, false);
+}
+
 char *addr_to_str(char *str, int size, const uint8_t *addr, uint16_t port)
 {
 	int  af = AF_INET6;
-- 
1.7.9.5




More information about the sheepdog mailing list