[sheepdog] [PATCH v2 3/4] sheep: add support for manual recovery
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Wed Sep 12 09:05:06 CEST 2012
This implements disabling object recovery. When recovery is disabled,
sheep will suspend the recovery process after it recovers objects in
the prio_oids queue. The suspended recovery is resumed after new
objects are pushed into the prio_oids queue. This means that
unaccessed objects are not recovered at all.
Note that sheep increments epoch even when recovery is disabled. If
sheep receives a write request, it will place the updated object based
on the current node membership.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
collie/cluster.c | 3 +-
include/internal_proto.h | 4 ++-
sheep/group.c | 2 +
sheep/ops.c | 1 +
sheep/recovery.c | 63 ++++++++++++++++++++++++++++++++++++++++------
sheep/sheep_priv.h | 1 +
6 files changed, 63 insertions(+), 11 deletions(-)
diff --git a/collie/cluster.c b/collie/cluster.c
index 7ebbb83..9774c7f 100644
--- a/collie/cluster.c
+++ b/collie/cluster.c
@@ -386,8 +386,7 @@ static int cluster_disable_recover(int argc, char **argv)
if (ret)
return EXIT_FAILURE;
- printf("*Note*: Only disable the recovery caused by JOIN envets\n"
- "Cluster recovery: disable\n");
+ printf("Cluster recovery: disable\n");
return EXIT_SUCCESS;
}
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 2fb4a0a..b4199ca 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -197,11 +197,13 @@ struct join_message {
int16_t nr_nodes;
uint16_t nr_failed_nodes;
uint16_t nr_delayed_nodes;
- uint16_t cluster_flags;
uint32_t cluster_status;
uint32_t epoch;
uint64_t ctime;
uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
+ uint8_t disable_recovery;
+ uint16_t cluster_flags;
+ uint32_t __pad;
uint8_t store[STORE_LEN];
/*
diff --git a/sheep/group.c b/sheep/group.c
index 0a3fe08..94466f3 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -836,6 +836,7 @@ static void update_cluster_info(struct join_message *msg,
set_cluster_ctime(msg->ctime);
/*FALLTHROUGH*/
case SD_STATUS_WAIT_FOR_JOIN:
+ sys->disable_recovery = msg->disable_recovery;
get_vdis(nodes, nr_nodes);
break;
default:
@@ -994,6 +995,7 @@ enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void *opaque)
jm->epoch = sys->epoch;
jm->ctime = get_cluster_ctime();
jm->nr_failed_nodes = 0;
+ jm->disable_recovery = sys->disable_recovery;
if (sd_store)
strcpy((char *)jm->store, sd_store->name);
diff --git a/sheep/ops.c b/sheep/ops.c
index a96fe41..4542335 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -318,6 +318,7 @@ static int cluster_enable_recover(const struct sd_req *req,
struct sd_rsp *rsp, void *data)
{
sys->disable_recovery = 0;
+ resume_suspended_recovery();
return SD_RES_SUCCESS;
}
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 4df5b66..eb70296 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -30,11 +30,14 @@ struct recovery_work {
int stop;
struct work work;
+ bool suspended; /* true when automatic recovery is disabled
+ * and recovery process is suspended */
int count;
uint64_t *oids;
uint64_t *prio_oids;
int nr_prio_oids;
+ int nr_scheduled_prio_oids;
struct vnode_info *old_vinfo;
struct vnode_info *cur_vinfo;
@@ -269,13 +272,15 @@ static inline void prepare_schedule_oid(uint64_t oid)
oid);
return;
}
- /* The oid is currently being recovered */
- if (rw->oids[rw->done] == oid)
+ /* When auto recovery is enabled, the oid is currently being
+ * recovered */
+ if (!sys->disable_recovery && rw->oids[rw->done] == oid)
return;
rw->nr_prio_oids++;
rw->prio_oids = xrealloc(rw->prio_oids,
rw->nr_prio_oids * sizeof(uint64_t));
rw->prio_oids[rw->nr_prio_oids - 1] = oid;
+ resume_suspended_recovery();
dprintf("%"PRIx64" nr_prio_oids %d\n", oid, rw->nr_prio_oids);
}
@@ -431,9 +436,51 @@ static inline void finish_schedule_oids(struct recovery_work *rw)
done:
free(rw->prio_oids);
rw->prio_oids = NULL;
+ rw->nr_scheduled_prio_oids += rw->nr_prio_oids;
rw->nr_prio_oids = 0;
}
+/*
+ * With automatic object recovery disabled, recovery behaves like
+ * 'lazy recovery': sheep still recovers objects being accessed by
+ * clients, for availability. Returns true while such scheduled
+ * objects remain in the recovery queue.
+ */
+static bool has_scheduled_objects(struct recovery_work *rw)
+{
+	return rw->done < rw->nr_scheduled_prio_oids;
+}
+
+/* Call run_next_rw() if next_rw is set; else suspend or requeue rw. */
+static void recover_next_object(struct recovery_work *rw)
+{
+	if (next_rw) {
+		run_next_rw(rw);
+		return;
+	}
+
+	if (rw->nr_prio_oids)
+		finish_schedule_oids(rw);
+
+	if (sys->disable_recovery && !has_scheduled_objects(rw)) {
+		dprintf("suspended\n");
+		rw->suspended = true;
+		/* suspend until resume_suspended_recovery() is called */
+		return;
+	}
+
+	/* Try recover next object */
+	queue_work(sys->recovery_wqueue, &rw->work);
+}
+
+void resume_suspended_recovery(void)
+{
+	if (!recovering_work || !recovering_work->suspended)
+		return; /* no recovery work, or it is not suspended */
+	recovering_work->suspended = false;
+	recover_next_object(recovering_work);
+}
+
static void recover_object_main(struct work *work)
{
struct recovery_work *rw = container_of(work, struct recovery_work,
@@ -457,11 +504,7 @@ static void recover_object_main(struct work *work)
resume_wait_obj_requests(rw->oids[rw->done++]);
if (rw->done < rw->count) {
- if (rw->nr_prio_oids)
- finish_schedule_oids(rw);
-
- /* Try recover next object */
- queue_work(sys->recovery_wqueue, &rw->work);
+ recover_next_object(rw);
return;
}
@@ -490,7 +533,7 @@ static void finish_object_list(struct work *work)
resume_wait_recovery_requests();
rw->work.fn = recover_object_work;
rw->work.done = recover_object_main;
- queue_work(sys->recovery_wqueue, &rw->work);
+ recover_next_object(rw);
return;
}
@@ -673,6 +716,10 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo)
free_recovery_work(next_rw);
dprintf("recovery skipped\n");
next_rw = rw;
+
+ /* This is necessary to invoke run_next_rw when
+ * recovery work is suspended. */
+ resume_suspended_recovery();
} else {
recovering_work = rw;
queue_work(sys->recovery_wqueue, &rw->work);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index bb7f5fe..bfc5b9d 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -255,6 +255,7 @@ void resume_wait_epoch_requests(void);
void resume_wait_obj_requests(uint64_t oid);
void resume_wait_recovery_requests(void);
void flush_wait_obj_requests(void);
+void resume_suspended_recovery(void);
int create_cluster(int port, int64_t zone, int nr_vnodes,
bool explicit_addr);
--
1.7.2.5
More information about the sheepdog
mailing list