[sheepdog] [PATCH] collie: retry forever to execute requests

Liu Yuan namei.unix at gmail.com
Fri Jul 19 05:26:39 CEST 2013


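This patch adds a maximum retry count parameter ('max_count') to the exec_req()
helper and to the send_req()/do_read()/do_write() helpers it relies on. sheep and
sheepfs still pass MAX_RETRY_COUNT for their requests, but collie now passes
UINT32_MAX so that it retries forever.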

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 collie/common.c      |    8 ++++++--
 include/net.h        |    6 +++---
 lib/net.c            |   24 ++++++++++++++----------
 sheep/gateway.c      |    5 +++--
 sheep/sockfd_cache.c |    3 ++-
 sheepfs/volume.c     |    6 +++---
 6 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/collie/common.c b/collie/common.c
index 4a57d92..e8a6a54 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -208,8 +208,12 @@ int collie_exec_req(const char *host, int port, struct sd_req *hdr, void *data)
 		return -1;
 	}
 
-	/* Retry hard for collie because we can't get the newest epoch */
-	ret = exec_req(fd, hdr, data, NULL, 0);
+	/*
+	 * Retry forever for collie because
+	 * 1. We can't get the newest epoch
+	 * 2. Some operations might take an unexpectedly long time
+	 */
+	ret = exec_req(fd, hdr, data, NULL, 0, UINT32_MAX);
 	close(fd);
 
 	if (ret)
diff --git a/include/net.h b/include/net.h
index 75e6a76..238c636 100644
--- a/include/net.h
+++ b/include/net.h
@@ -49,14 +49,14 @@ int conn_rx_off(struct connection *conn);
 int conn_rx_on(struct connection *conn);
 bool is_conn_dead(const struct connection *conn);
 int do_read(int sockfd, void *buf, int len,
-	    bool (*need_retry)(uint32_t), uint32_t);
+	    bool (*need_retry)(uint32_t), uint32_t, uint32_t);
 int rx(struct connection *conn, enum conn_state next_state);
 int tx(struct connection *conn, enum conn_state next_state);
 int connect_to(const char *name, int port);
 int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
-	     bool (*need_retry)(uint32_t), uint32_t);
+	     bool (*need_retry)(uint32_t), uint32_t, uint32_t);
 int exec_req(int sockfd, struct sd_req *hdr, void *,
-	     bool (*need_retry)(uint32_t), uint32_t);
+	     bool (*need_retry)(uint32_t), uint32_t, uint32_t);
 int create_listen_ports(const char *bindaddr, int port,
 			int (*callback)(int fd, void *), void *data);
 int create_unix_domain_socket(const char *unix_path,
diff --git a/lib/net.c b/lib/net.c
index 97be3df..30d0119 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -271,9 +271,9 @@ success:
 }
 
 int do_read(int sockfd, void *buf, int len, bool (*need_retry)(uint32_t epoch),
-	    uint32_t epoch)
+	    uint32_t epoch, uint32_t max_count)
 {
-	int ret, repeat = MAX_RETRY_COUNT;
+	int ret, repeat = max_count;
 reread:
 	ret = read(sockfd, buf, len);
 	if (ret == 0) {
@@ -319,9 +319,10 @@ static void forward_iov(struct msghdr *msg, int len)
 
 
 static int do_write(int sockfd, struct msghdr *msg, int len,
-		    bool (*need_retry)(uint32_t), uint32_t epoch)
+		    bool (*need_retry)(uint32_t), uint32_t epoch,
+		    uint32_t max_count)
 {
-	int ret, repeat = MAX_RETRY_COUNT;
+	int ret, repeat = max_count;
 rewrite:
 	ret = sendmsg(sockfd, msg, 0);
 	if (ret < 0) {
@@ -351,7 +352,8 @@ rewrite:
 }
 
 int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
-	     bool (*need_retry)(uint32_t epoch), uint32_t epoch)
+	     bool (*need_retry)(uint32_t epoch), uint32_t epoch,
+	     uint32_t max_count)
 {
 	int ret;
 	struct msghdr msg;
@@ -371,7 +373,8 @@ int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
 		iov[1].iov_len = wlen;
 	}
 
-	ret = do_write(sockfd, &msg, sizeof(*hdr) + wlen, need_retry, epoch);
+	ret = do_write(sockfd, &msg, sizeof(*hdr) + wlen, need_retry, epoch,
+		       max_count);
 	if (ret) {
 		sd_eprintf("failed to send request %x, %d: %m", hdr->opcode,
 			   wlen);
@@ -382,7 +385,8 @@ int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
 }
 
 int exec_req(int sockfd, struct sd_req *hdr, void *data,
-	     bool (*need_retry)(uint32_t epoch), uint32_t epoch)
+	     bool (*need_retry)(uint32_t epoch), uint32_t epoch,
+	     uint32_t max_count)
 {
 	int ret;
 	struct sd_rsp *rsp = (struct sd_rsp *)hdr;
@@ -396,10 +400,10 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data,
 		rlen = hdr->data_length;
 	}
 
-	if (send_req(sockfd, hdr, data, wlen, need_retry, epoch))
+	if (send_req(sockfd, hdr, data, wlen, need_retry, epoch, max_count))
 		return 1;
 
-	ret = do_read(sockfd, rsp, sizeof(*rsp), need_retry, epoch);
+	ret = do_read(sockfd, rsp, sizeof(*rsp), need_retry, epoch, max_count);
 	if (ret) {
 		sd_eprintf("failed to read a response");
 		return 1;
@@ -409,7 +413,7 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data,
 		rlen = rsp->data_length;
 
 	if (rlen) {
-		ret = do_read(sockfd, data, rlen, need_retry, epoch);
+		ret = do_read(sockfd, data, rlen, need_retry, epoch, max_count);
 		if (ret) {
 			sd_eprintf("failed to read the response data");
 			return 1;
diff --git a/sheep/gateway.c b/sheep/gateway.c
index ad5850c..59d0127 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -194,7 +194,7 @@ again:
 			goto finish_write;
 		}
 		if (do_read(pi.pfds[i].fd, rsp, sizeof(*rsp), sheep_need_retry,
-			    req->rq.epoch)) {
+			    req->rq.epoch, MAX_RETRY_COUNT)) {
 			sd_eprintf("remote node might have gone away");
 			err_ret = SD_RES_NETWORK_ERROR;
 			finish_one_write_err(wi, i);
@@ -285,7 +285,8 @@ static int gateway_forward_request(struct request *req)
 		}
 
 		ret = send_req(sfd->fd, &hdr, req->data, wlen,
-			       sheep_need_retry, req->rq.epoch);
+			       sheep_need_retry, req->rq.epoch,
+			       MAX_RETRY_COUNT);
 		if (ret) {
 			sheep_del_sockfd(nid, sfd);
 			err_ret = SD_RES_NETWORK_ERROR;
diff --git a/sheep/sockfd_cache.c b/sheep/sockfd_cache.c
index 55c337c..13bb7f6 100644
--- a/sheep/sockfd_cache.c
+++ b/sheep/sockfd_cache.c
@@ -528,7 +528,8 @@ int sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf)
 	if (!sfd)
 		return SD_RES_NETWORK_ERROR;
 
-	ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch);
+	ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch,
+		       MAX_RETRY_COUNT);
 	if (ret) {
 		sd_dprintf("remote node might have gone away");
 		sheep_del_sockfd(nid, sfd);
diff --git a/sheepfs/volume.c b/sheepfs/volume.c
index f387962..efd00ce 100644
--- a/sheepfs/volume.c
+++ b/sheepfs/volume.c
@@ -193,7 +193,7 @@ static int volume_rw_object(char *buf, uint64_t oid, size_t size,
 		hdr.flags |= SD_FLAG_CMD_CACHE;
 
 	fd = get_socket_fd(vdi, &sock_idx);
-	ret = exec_req(fd, &hdr, buf, NULL, 0);
+	ret = exec_req(fd, &hdr, buf, NULL, 0, MAX_RETRY_COUNT);
 	put_socket_fd(vdi, sock_idx);
 
 	if (ret || rsp->result != SD_RES_SUCCESS) {
@@ -299,7 +299,7 @@ static int volume_do_sync(uint32_t vid)
 	hdr.obj.oid = vid_to_vdi_oid(vid);
 
 	fd = get_socket_fd(vdi, &idx);
-	ret = exec_req(fd, &hdr, NULL, NULL, 0);
+	ret = exec_req(fd, &hdr, NULL, NULL, 0, MAX_RETRY_COUNT);
 	put_socket_fd(vdi, idx);
 
 	if (ret || rsp->result != SD_RES_SUCCESS) {
@@ -486,7 +486,7 @@ static int volume_sync_and_delete(uint32_t vid)
 	hdr.obj.oid = vid_to_vdi_oid(vid);
 
 	fd = get_socket_fd(vdi, &idx);
-	ret = exec_req(fd, &hdr, NULL, NULL, 0);
+	ret = exec_req(fd, &hdr, NULL, NULL, 0, MAX_RETRY_COUNT);
 	put_socket_fd(vdi, idx);
 
 	if (ret || rsp->result != SD_RES_SUCCESS) {
-- 
1.7.9.5




More information about the sheepdog mailing list