[sheepdog] [PATCH] sheep: stop special casing local objects on gateways

Tue Jun 5 14:10:23 CEST 2012

Sheepdog is designed for clusters large enough that finding objects locally
is the exception, so stop optimizing for it.  Remove all code that bypasses
the networking code when handling an object that is local to the gateway.

In the short run the only benefits are code simplification and not having
to calculate all object locations in forward_read_obj_req, but in the long
run the better split should allow for some nice optimizations, like using
the splice system call on the gateway so that data can be forwarded without
ever having to copy it to userspace.

Signed-off-by: Christoph Hellwig <hch at lst.de>

diff --git a/sheep/gateway.c b/sheep/gateway.c
index c2ec901..8b7777c 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -22,8 +22,6 @@ int forward_read_obj_req(struct request *req)
 	unsigned wlen, rlen;
 	struct sd_req fwd_hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr;
-	struct sd_vnode *v;
-	struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
 	uint64_t oid = req->rq.obj.oid;
 	int nr_copies, j;
 
@@ -35,28 +33,13 @@ int forward_read_obj_req(struct request *req)
 	else
 		nr_copies = get_nr_copies(req->vnodes);
 
-	/* TODO: we can do better; we need to check this first */
-	oid_to_vnodes(req->vnodes, oid, nr_copies, obj_vnodes);
-	for (i = 0; i < nr_copies; i++) {
-		v = obj_vnodes[i];
-		if (vnode_is_local(v)) {
-			ret = do_local_io(req, fwd_hdr.epoch);
-			if (ret != SD_RES_SUCCESS)
-				goto read_remote;
-			return ret;
-		}
-	}
-
-read_remote:
 	/* Read random copy from cluster for better load balance, useful for
 	 * reading base VM's COW objects
 	 */
 	j = random();
 	for (i = 0; i < nr_copies; i++) {
 		int idx = (i + j) % nr_copies;
-		v = obj_vnodes[idx];
-		if (vnode_is_local(v))
-			continue;
+		struct sd_vnode *v = oid_to_vnode(req->vnodes, oid, idx);
 
 		fd = get_sheep_fd(v->addr, v->port, v->node_idx, fwd_hdr.epoch);
 		if (fd < 0) {
@@ -94,7 +77,7 @@ int forward_write_obj_req(struct request *req)
 	uint64_t oid = req->rq.obj.oid;
 	int nr_copies;
 	struct pollfd pfds[SD_MAX_REDUNDANCY];
-	int nr_fds, local = 0;
+	int nr_fds;
 
 	dprintf("%"PRIx64"\n", oid);
 
@@ -115,11 +98,6 @@ int forward_write_obj_req(struct request *req)
 
 		addr_to_str(name, sizeof(name), v->addr, 0);
 
-		if (vnode_is_local(v)) {
-			local = 1;
-			continue;
-		}
-
 		fd = get_sheep_fd(v->addr, v->port, v->node_idx, fwd_hdr.epoch);
 		if (fd < 0) {
 			eprintf("failed to connect to %s:%"PRIu32"\n", name, v->port);
@@ -140,22 +118,6 @@ int forward_write_obj_req(struct request *req)
 		nr_fds++;
 	}
 
-	if (local) {
-		ret = do_local_io(req, fwd_hdr.epoch);
-		rsp->result = ret;
-
-		if (nr_fds == 0) {
-			eprintf("exit %"PRIu32"\n", ret);
-			goto out;
-		}
-
-		if (rsp->result != SD_RES_SUCCESS) {
-			eprintf("fail %"PRIu32"\n", ret);
-			ret = rsp->result;
-			goto out;
-		}
-	}
-
 	ret = SD_RES_SUCCESS;
 again:
 	pollret = poll(pfds, nr_fds, DEFAULT_SOCKET_TIMEOUT * 1000);
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 6323ee3..e98286c 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -22,23 +22,6 @@
 
 static void requeue_request(struct request *req);
 
-static int is_access_local(struct request *req, uint64_t oid)
-{
-	struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
-	int nr_copies;
-	int i;
-
-	nr_copies = get_nr_copies(req->vnodes);
-	oid_to_vnodes(req->vnodes, oid, nr_copies, obj_vnodes);
-
-	for (i = 0; i < nr_copies; i++) {
-		if (vnode_is_local(obj_vnodes[i]))
-			return 1;
-	}
-
-	return 0;
-}
-
 static int need_consistency_check(struct request *req)
 {
 	struct sd_req *hdr = &req->rq;
@@ -145,14 +128,8 @@ static void gateway_op_done(struct work *work)
 	case SD_RES_NETWORK_ERROR:
 	case SD_RES_WAIT_FOR_JOIN:
 	case SD_RES_WAIT_FOR_FORMAT:
-		goto retry;
-	case SD_RES_EIO:
-		if (is_access_local(req, hdr->obj.oid)) {
-			eprintf("leaving sheepdog cluster\n");
-			leave_cluster();
-			goto retry;
-		}
-		break;
+		requeue_request(req);
+		return;
 	case SD_RES_SUCCESS:
 		if (req->check_consistency && is_data_obj(hdr->obj.oid))
 			check_object_consistency(hdr);
@@ -160,9 +137,6 @@ static void gateway_op_done(struct work *work)
 	}
 
 	req_done(req);
-	return;
-retry:
-	requeue_request(req);
 }
 
 static void local_op_done(struct work *work)
@@ -219,7 +193,7 @@ static bool request_in_recovery(struct request *req)
 	 * oid_in_recovery() returns true because we should also try snap
 	 * cache of the Farm and return the error code back if not found.
 	 */
-	if (oid_in_recovery(req->local_oid) &&
+	if (oid_in_recovery(req->rq.obj.oid) &&
 	    !(req->rq.flags & SD_FLAG_CMD_RECOVERY)) {
 		/*
 		 * Put request on wait queues of local node
@@ -281,7 +255,7 @@ void resume_wait_recovery_requests(void)
 		if (req->rp.result != SD_RES_OBJ_RECOVERING)
 			continue;
 
-		dprintf("resume wait oid %" PRIx64 "\n", req->local_oid);
+		dprintf("resume wait oid %" PRIx64 "\n", req->rq.obj.oid);
 		list_del(&req->request_list);
 		requeue_request(req);
 	}
@@ -297,12 +271,12 @@ void resume_wait_obj_requests(uint64_t oid)
 	list_splice_init(&sys->wait_obj_queue, &pending_list);
 
 	list_for_each_entry_safe(req, t, &pending_list, request_list) {
-		if (req->local_oid != oid)
+		if (req->rq.obj.oid != oid)
 			continue;
 
 		/* the object requested by a pending request has been
 		 * recovered, notify the pending request. */
-		dprintf("retry %" PRIx64 "\n", req->local_oid);
+		dprintf("retry %" PRIx64 "\n", req->rq.obj.oid);
 		list_del(&req->request_list);
 		requeue_request(req);
 	}
@@ -324,8 +298,7 @@ void flush_wait_obj_requests(void)
 
 static void queue_io_request(struct request *req)
 {
-	req->local_oid = req->rq.obj.oid;
-	if (req->local_oid) {
+	if (req->rq.obj.oid) {
 		if (check_request_epoch(req) < 0)
 			return;
 		if (request_in_recovery(req))
@@ -339,11 +312,6 @@ static void queue_io_request(struct request *req)
 
 static void queue_gateway_request(struct request *req)
 {
-	struct sd_req *hdr = &req->rq;
-
-	if (is_access_local(req, hdr->obj.oid))
-		req->local_oid = hdr->obj.oid;
-
 	/*
 	 * If we go for a cached object, we don't care if it is being recovered
 	 */
@@ -352,10 +320,6 @@ static void queue_gateway_request(struct request *req)
 	    object_is_cached(req->rq.obj.oid))
 		goto queue_work;
 
-	if (req->local_oid)
-		if (request_in_recovery(req))
-			return;
-
 	if (need_consistency_check(req))
 		set_consistency_check(req);
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 60432c7..beae3e5 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -74,8 +74,6 @@ struct request {
 	struct list_head request_list;
 	struct list_head pending_list;
 
-	uint64_t local_oid;
-
 	struct vnode_info *vnodes;
 	int check_consistency;
 
@@ -321,7 +319,6 @@ int has_process_main(struct sd_op_template *op);
 int do_process_work(struct request *req);
 int do_process_main(struct sd_op_template *op, const struct sd_req *req,
 		    struct sd_rsp *rsp, void *data);
-int do_local_io(struct request *req, uint32_t epoch);
 
 /* Journal */
 struct jrnl_descriptor *jrnl_begin(const void *buf, size_t count, off_t offset,
diff --git a/sheep/store.c b/sheep/store.c
index ad9883e..b4f734b 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -46,15 +46,6 @@ mode_t def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 struct store_driver *sd_store;
 LIST_HEAD(store_drivers);
 
-int do_local_io(struct request *req, uint32_t epoch)
-{
-	dprintf("%x, %" PRIx64" , %u\n",
-		req->rq.opcode, req->rq.obj.oid, epoch);
-
-	req->rq.epoch = epoch;
-	return do_process_work(req);
-}
-
 int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
 {
 	int fd, ret, len;
@@ -93,22 +84,18 @@ err_open:
 void do_io_request(struct work *work)
 {
 	struct request *req = container_of(work, struct request, work);
-	uint32_t epoch;
 	int ret;
 
 	if (req->rq.flags & SD_FLAG_CMD_RECOVERY)
-		epoch = req->rq.obj.tgt_epoch;
-	else
-		epoch = req->rq.epoch;
+		req->rq.epoch = req->rq.obj.tgt_epoch;
 
 	dprintf("%x, %" PRIx64" , %u\n",
-		req->rq.opcode, req->rq.obj.oid, epoch);
-
-	ret = do_local_io(req, epoch);
+		req->rq.opcode, req->rq.obj.oid, req->rq.epoch);
 
+	ret = do_process_work(req);
 	if (ret != SD_RES_SUCCESS)
 		dprintf("failed: %x, %" PRIx64" , %u, %"PRIx32"\n",
-			req->rq.opcode, req->rq.obj.oid, epoch, ret);
+			req->rq.opcode, req->rq.obj.oid, req->rq.epoch, ret);
 	req->rp.result = ret;
 }