There are two problems with calling do_cluster_request from a work queue: 1) sys->pending_list is expected to only be used from the main thread and does not have any locking 2) the ->notify cluster driver metho is expected to be called from the main thread Simplify call do_cluster_request directly from process_request_queue instead of offloading it to a workqueue to fix this, and document the assumptions in the code. Based on an earlier patch from Yunkai Zhang <qiushu.zyk at taobao.com>. Signed-off-by: Christoph Hellwig <hch at lst.de> --- sheep/group.c | 10 ++++++---- sheep/sdnet.c | 8 -------- sheep/sheep_priv.h | 1 - 3 files changed, 6 insertions(+), 13 deletions(-) Index: sheepdog/sheep/group.c =================================================================== --- sheepdog.orig/sheep/group.c 2012-05-16 07:36:07.555929098 +0200 +++ sheepdog/sheep/group.c 2012-05-16 07:36:40.459928777 +0200 @@ -209,6 +209,12 @@ int get_nr_copies(struct vnode_info *vno return min(vnode_info->nr_zones, sys->nr_copies); } +/* + * Perform a blocked cluster operation. + * + * Must run in the main thread as it access unlocked state like + * sys->pending_list. + */ static void do_cluster_op(void *arg) { struct vdi_op_message *msg = arg; @@ -228,10 +234,16 @@ static void do_cluster_op(void *arg) msg->rsp.result = ret; } -void do_cluster_request(struct work *work) +/* + * Execute a cluster operation by letting the cluster driver send it to all + * nodes in the cluster. + * + * Must run in the main thread as it access unlocked state like + * sys->pending_list. + */ +static void do_cluster_request(struct request *req) { - struct request *req = container_of(work, struct request, work); - struct sd_req *hdr = (struct sd_req *)&req->rq; + struct sd_req *hdr = &req->rq; struct vdi_op_message *msg; size_t size; @@ -641,6 +653,12 @@ static void __sd_notify_done(struct even req_done(req); } +/* + * Pass on a notification message from the cluster driver. + * + * Must run in the main thread as it access unlocked state like + * sys->pending_list. + */ void sd_notify_handler(struct sd_node *sender, void *msg, size_t msg_len) { struct event_struct *cevent; @@ -1051,8 +1069,16 @@ static void process_request_queue(void) queue_work(sys->io_wqueue, &req->work); else queue_work(sys->gateway_wqueue, &req->work); - } else /* (is_cluster_op(req->op) || is_local_op(req->op)) */ + } else if (is_cluster_op(req->op)) { + /* + * Cluster requests are handed off to the cluster driver + * directly from the main thread. It's the cluster + * drivers job to ensure we avoid blocking on I/O here. + */ + do_cluster_request(req); + } else { /* is_local_op(req->op) */ queue_work(sys->io_wqueue, &req->work); + } } } Index: sheepdog/sheep/sdnet.c =================================================================== --- sheepdog.orig/sheep/sdnet.c 2012-05-15 11:02:08.399956846 +0200 +++ sheepdog/sheep/sdnet.c 2012-05-16 07:36:25.263928929 +0200 @@ -156,11 +156,6 @@ static void local_op_done(struct work *w req_done(req); } -static void cluster_op_done(struct work *work) -{ - /* request is forwarded to cpg group */ -} - static void do_local_request(struct work *work) { struct request *req = container_of(work, struct request, work); @@ -323,8 +318,7 @@ static void queue_request(struct request req->work.fn = do_local_request; req->work.done = local_op_done; } else if (is_cluster_op(req->op)) { - req->work.fn = do_cluster_request; - req->work.done = cluster_op_done; + /* directly executed in the main thread */; } else { eprintf("unknown operation %d\n", hdr->opcode); rsp->result = SD_RES_SYSTEM_ERROR; Index: sheepdog/sheep/sheep_priv.h =================================================================== --- sheepdog.orig/sheep/sheep_priv.h 2012-05-15 16:06:50.415851224 +0200 +++ sheepdog/sheep/sheep_priv.h 2012-05-16 07:36:25.263928929 +0200 @@ -259,7 +259,6 @@ int forward_write_obj_req(struct request int read_epoch(uint32_t *epoch, uint64_t *ctime, struct sd_node *entries, int *nr_entries); -void do_cluster_request(struct work *work); int update_epoch_store(uint32_t epoch); int update_epoch_log(uint32_t epoch); |