[sheepdog] [PATCH] collie: retry forever to execute requests
Liu Yuan
namei.unix at gmail.com
Fri Jul 19 05:26:39 CEST 2013
This patch adds a 'max_retry_count' argument to the exec_req() helper. sheep and sheepfs
still retry MAX_RETRY_COUNT times for their requests, but collie retries forever (UINT32_MAX).
Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
collie/common.c | 8 ++++++--
include/net.h | 6 +++---
lib/net.c | 24 ++++++++++++++----------
sheep/gateway.c | 5 +++--
sheep/sockfd_cache.c | 3 ++-
sheepfs/volume.c | 6 +++---
6 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/collie/common.c b/collie/common.c
index 4a57d92..e8a6a54 100644
--- a/collie/common.c
+++ b/collie/common.c
@@ -208,8 +208,12 @@ int collie_exec_req(const char *host, int port, struct sd_req *hdr, void *data)
return -1;
}
- /* Retry hard for collie because we can't get the newest epoch */
- ret = exec_req(fd, hdr, data, NULL, 0);
+ /*
+ * Retry forever for collie because
+ * 1. We can't get the newest epoch
+ * 2. Some operations might take an unexpectedly long time
+ */
+ ret = exec_req(fd, hdr, data, NULL, 0, UINT32_MAX);
close(fd);
if (ret)
diff --git a/include/net.h b/include/net.h
index 75e6a76..238c636 100644
--- a/include/net.h
+++ b/include/net.h
@@ -49,14 +49,14 @@ int conn_rx_off(struct connection *conn);
int conn_rx_on(struct connection *conn);
bool is_conn_dead(const struct connection *conn);
int do_read(int sockfd, void *buf, int len,
- bool (*need_retry)(uint32_t), uint32_t);
+ bool (*need_retry)(uint32_t), uint32_t, uint32_t);
int rx(struct connection *conn, enum conn_state next_state);
int tx(struct connection *conn, enum conn_state next_state);
int connect_to(const char *name, int port);
int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
- bool (*need_retry)(uint32_t), uint32_t);
+ bool (*need_retry)(uint32_t), uint32_t, uint32_t);
int exec_req(int sockfd, struct sd_req *hdr, void *,
- bool (*need_retry)(uint32_t), uint32_t);
+ bool (*need_retry)(uint32_t), uint32_t, uint32_t);
int create_listen_ports(const char *bindaddr, int port,
int (*callback)(int fd, void *), void *data);
int create_unix_domain_socket(const char *unix_path,
diff --git a/lib/net.c b/lib/net.c
index 97be3df..30d0119 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -271,9 +271,9 @@ success:
}
int do_read(int sockfd, void *buf, int len, bool (*need_retry)(uint32_t epoch),
- uint32_t epoch)
+ uint32_t epoch, uint32_t max_count)
{
- int ret, repeat = MAX_RETRY_COUNT;
+ int ret, repeat = max_count;
reread:
ret = read(sockfd, buf, len);
if (ret == 0) {
@@ -319,9 +319,10 @@ static void forward_iov(struct msghdr *msg, int len)
static int do_write(int sockfd, struct msghdr *msg, int len,
- bool (*need_retry)(uint32_t), uint32_t epoch)
+ bool (*need_retry)(uint32_t), uint32_t epoch,
+ uint32_t max_count)
{
- int ret, repeat = MAX_RETRY_COUNT;
+ int ret, repeat = max_count;
rewrite:
ret = sendmsg(sockfd, msg, 0);
if (ret < 0) {
@@ -351,7 +352,8 @@ rewrite:
}
int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
- bool (*need_retry)(uint32_t epoch), uint32_t epoch)
+ bool (*need_retry)(uint32_t epoch), uint32_t epoch,
+ uint32_t max_count)
{
int ret;
struct msghdr msg;
@@ -371,7 +373,8 @@ int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
iov[1].iov_len = wlen;
}
- ret = do_write(sockfd, &msg, sizeof(*hdr) + wlen, need_retry, epoch);
+ ret = do_write(sockfd, &msg, sizeof(*hdr) + wlen, need_retry, epoch,
+ max_count);
if (ret) {
sd_eprintf("failed to send request %x, %d: %m", hdr->opcode,
wlen);
@@ -382,7 +385,8 @@ int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen,
}
int exec_req(int sockfd, struct sd_req *hdr, void *data,
- bool (*need_retry)(uint32_t epoch), uint32_t epoch)
+ bool (*need_retry)(uint32_t epoch), uint32_t epoch,
+ uint32_t max_count)
{
int ret;
struct sd_rsp *rsp = (struct sd_rsp *)hdr;
@@ -396,10 +400,10 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data,
rlen = hdr->data_length;
}
- if (send_req(sockfd, hdr, data, wlen, need_retry, epoch))
+ if (send_req(sockfd, hdr, data, wlen, need_retry, epoch, max_count))
return 1;
- ret = do_read(sockfd, rsp, sizeof(*rsp), need_retry, epoch);
+ ret = do_read(sockfd, rsp, sizeof(*rsp), need_retry, epoch, max_count);
if (ret) {
sd_eprintf("failed to read a response");
return 1;
@@ -409,7 +413,7 @@ int exec_req(int sockfd, struct sd_req *hdr, void *data,
rlen = rsp->data_length;
if (rlen) {
- ret = do_read(sockfd, data, rlen, need_retry, epoch);
+ ret = do_read(sockfd, data, rlen, need_retry, epoch, max_count);
if (ret) {
sd_eprintf("failed to read the response data");
return 1;
diff --git a/sheep/gateway.c b/sheep/gateway.c
index ad5850c..59d0127 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -194,7 +194,7 @@ again:
goto finish_write;
}
if (do_read(pi.pfds[i].fd, rsp, sizeof(*rsp), sheep_need_retry,
- req->rq.epoch)) {
+ req->rq.epoch, MAX_RETRY_COUNT)) {
sd_eprintf("remote node might have gone away");
err_ret = SD_RES_NETWORK_ERROR;
finish_one_write_err(wi, i);
@@ -285,7 +285,8 @@ static int gateway_forward_request(struct request *req)
}
ret = send_req(sfd->fd, &hdr, req->data, wlen,
- sheep_need_retry, req->rq.epoch);
+ sheep_need_retry, req->rq.epoch,
+ MAX_RETRY_COUNT);
if (ret) {
sheep_del_sockfd(nid, sfd);
err_ret = SD_RES_NETWORK_ERROR;
diff --git a/sheep/sockfd_cache.c b/sheep/sockfd_cache.c
index 55c337c..13bb7f6 100644
--- a/sheep/sockfd_cache.c
+++ b/sheep/sockfd_cache.c
@@ -528,7 +528,8 @@ int sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf)
if (!sfd)
return SD_RES_NETWORK_ERROR;
- ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch);
+ ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch,
+ MAX_RETRY_COUNT);
if (ret) {
sd_dprintf("remote node might have gone away");
sheep_del_sockfd(nid, sfd);
diff --git a/sheepfs/volume.c b/sheepfs/volume.c
index f387962..efd00ce 100644
--- a/sheepfs/volume.c
+++ b/sheepfs/volume.c
@@ -193,7 +193,7 @@ static int volume_rw_object(char *buf, uint64_t oid, size_t size,
hdr.flags |= SD_FLAG_CMD_CACHE;
fd = get_socket_fd(vdi, &sock_idx);
- ret = exec_req(fd, &hdr, buf, NULL, 0);
+ ret = exec_req(fd, &hdr, buf, NULL, 0, MAX_RETRY_COUNT);
put_socket_fd(vdi, sock_idx);
if (ret || rsp->result != SD_RES_SUCCESS) {
@@ -299,7 +299,7 @@ static int volume_do_sync(uint32_t vid)
hdr.obj.oid = vid_to_vdi_oid(vid);
fd = get_socket_fd(vdi, &idx);
- ret = exec_req(fd, &hdr, NULL, NULL, 0);
+ ret = exec_req(fd, &hdr, NULL, NULL, 0, MAX_RETRY_COUNT);
put_socket_fd(vdi, idx);
if (ret || rsp->result != SD_RES_SUCCESS) {
@@ -486,7 +486,7 @@ static int volume_sync_and_delete(uint32_t vid)
hdr.obj.oid = vid_to_vdi_oid(vid);
fd = get_socket_fd(vdi, &idx);
- ret = exec_req(fd, &hdr, NULL, NULL, 0);
+ ret = exec_req(fd, &hdr, NULL, NULL, 0, MAX_RETRY_COUNT);
put_socket_fd(vdi, idx);
if (ret || rsp->result != SD_RES_SUCCESS) {
--
1.7.9.5
More information about the sheepdog
mailing list