[sheepdog] [PATCH RFC 1/2] sheep: enhance STAT_RECOVERY for prividing information of recovery progress
Liu Yuan
namei.unix at gmail.com
Wed Jul 31 05:10:44 CEST 2013
On Wed, Jul 31, 2013 at 11:09:52AM +0900, Hitoshi Mitake wrote:
> At Mon, 29 Jul 2013 16:02:30 +0800,
> Liu Yuan wrote:
> >
> > On Mon, Jul 29, 2013 at 04:39:26PM +0900, Hitoshi Mitake wrote:
> > > Current SD_OP_STAT_RECOVERY only provides an information that
> > > indicates the node is doing recovery or not. For providing more useful
> > > information about progress of recovery, this patch enhances this
> > > request.
> > >
> > > The enhanced SD_OP_STAT_RECOVERY returns the information with struct
> > > recovery_progress. The struct contains these information:
> > > 1. state of recovery. With this information, collie can know which
> > > phase of progress sheep is in.
> > > 2. Number of copied object during a recovery process.
> > > 3. Number of entire objects which should be copied during a recovery
> > > process.
> > >
> > > Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> > > ---
> > > include/internal_proto.h | 12 ++++++++++++
> > > sheep/ops.c | 11 ++++++++---
> > > sheep/recovery.c | 33 ++++++++++++++++++++++++---------
> > > sheep/sheep_priv.h | 1 +
> > > 4 files changed, 45 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/include/internal_proto.h b/include/internal_proto.h
> > > index 0463eae..6d7a0e9 100644
> > > --- a/include/internal_proto.h
> > > +++ b/include/internal_proto.h
> > > @@ -184,6 +184,18 @@ struct sd_md_info {
> > > int nr;
> > > };
> > >
> > > +enum rw_state {
> > > + RW_PREPARE_LIST, /* the recovery thread is preparing object list */
> > > + RW_RECOVER_OBJ, /* the thread is recoering objects */
> > > + RW_NOTIFY_COMPLETION, /* the thread is notifying recovery completion */
> > > +};
> > > +
> > > +struct recovery_progress {
> > > + uint32_t state; /* actual type is enum rw_state */
> >
> > enum rw_state state; then you don't need comment.
>
> I agree.
>
> >
> > > + uint32_t nr_recovered_objects;
> > > + uint32_t nr_entire_checked_objects;
> > > +};
> >
> > it's recovery information, so better name it as recovery_info. And
> >
> > nr_recovered_objects --> finished_count
> > nr_entire_checked_objects --> total
>
> The names seems to be good. I agree.
>
> >
> > > static inline __attribute__((used)) void __sd_epoch_format_build_bug_ons(void)
> > > {
> > > /* never called, only for checking BUILD_BUG_ON()s */
> > > diff --git a/sheep/ops.c b/sheep/ops.c
> > > index 5d7686c..1436664 100644
> > > --- a/sheep/ops.c
> > > +++ b/sheep/ops.c
> > > @@ -390,10 +390,15 @@ static int local_stat_sheep(struct request *req)
> > > static int local_stat_recovery(const struct sd_req *req, struct sd_rsp *rsp,
> > > void *data)
> > > {
> > > - if (node_in_recovery())
> > > - return SD_RES_NODE_IN_RECOVERY;
> > > + if (!node_in_recovery())
> > > + return SD_RES_SUCCESS;
> > >
> > > - return SD_RES_SUCCESS;
> > > + if (req->data_length == sizeof(struct recovery_progress)) {
> > > + fill_recovery_progress(data);
> > > + rsp->data_length = sizeof(struct recovery_progress);
> > > + }
> > > +
> > > + return SD_RES_NODE_IN_RECOVERY;
> > > }
> > >
> > > static int local_stat_cluster(struct request *req)
> > > diff --git a/sheep/recovery.c b/sheep/recovery.c
> > > index ea3101a..69b07cd 100644
> > > --- a/sheep/recovery.c
> > > +++ b/sheep/recovery.c
> > > @@ -11,12 +11,6 @@
> > >
> > > #include "sheep_priv.h"
> > >
> > > -enum rw_state {
> > > - RW_PREPARE_LIST, /* the recovery thread is preparing object list */
> > > - RW_RECOVER_OBJ, /* the thread is recoering objects */
> > > - RW_NOTIFY_COMPLETION, /* the thread is notifying recovery completion */
> > > -};
> > > -
> > > /* base structure for the recovery thread */
> > > struct recovery_work {
> > > uint32_t epoch;
> > > @@ -86,6 +80,9 @@ static void queue_recovery_work(struct recovery_info *rinfo);
> > > #define DEFAULT_LIST_BUFFER_SIZE (UINT64_C(1) << 22)
> > > static size_t list_buffer_size = DEFAULT_LIST_BUFFER_SIZE;
> > >
> > > +static enum rw_state state;
> > > +static uint32_t nr_recovered_objects, nr_entire_checked_objects;
> > > +
> > > static int obj_cmp(const uint64_t *oid1, const uint64_t *oid2)
> > > {
> > > const uint64_t hval1 = fnv_64a_buf(oid1, sizeof(*oid1), FNV1A_64_INIT);
> > > @@ -316,7 +313,7 @@ static void recover_object_work(struct work *work)
> > >
> > > if (sd_store->exist(oid)) {
> > > sd_dprintf("the object is already recovered");
> > > - return;
> > > + goto end;
> > > }
> > >
> > > /* find object in the stale directory */
> > > @@ -332,6 +329,9 @@ static void recover_object_work(struct work *work)
> > > ret = do_recover_object(row);
> > > if (ret < 0)
> > > sd_eprintf("failed to recover object %"PRIx64, oid);
> > > +
> > > +end:
> >
> > It is better use 'out' lable as other code.
>
> OK.
>
> >
> > > + uatomic_inc(&nr_recovered_objects);
> > > }
> > >
> > > bool node_in_recovery(void)
> > > @@ -499,12 +499,12 @@ static void notify_recovery_completion_main(struct work *work)
> > > static inline void finish_recovery(struct recovery_info *rinfo)
> > > {
> > > uint32_t recovered_epoch = rinfo->epoch;
> > > - main_thread_set(current_rinfo, NULL);
> > >
> > > + main_thread_set(current_rinfo, NULL);
> > > wakeup_all_requests();
> > >
> > > if (rinfo->notify_complete) {
> > > - rinfo->state = RW_NOTIFY_COMPLETION;
> > > + state = rinfo->state = RW_NOTIFY_COMPLETION;
> > > queue_recovery_work(rinfo);
> > > }
> > >
> > > @@ -668,6 +668,9 @@ static void finish_object_list(struct work *work)
> > > return;
> > > }
> > >
> > > + state = RW_RECOVER_OBJ;
> > > + uatomic_set(&nr_entire_checked_objects, rinfo->count);
> > > +
> > > recover_next_object(rinfo);
> > > return;
> > > }
> > > @@ -853,6 +856,10 @@ static void queue_recovery_work(struct recovery_info *rinfo)
> > > rlw = xzalloc(sizeof(*rlw));
> > > rlw->oids = xmalloc(list_buffer_size);
> > >
> > > + state = RW_PREPARE_LIST;
> > > + uatomic_set(&nr_recovered_objects, 0);
> > > + uatomic_set(&nr_entire_checked_objects, 0);
> > > +
> > > rw = &rlw->base;
> > > rw->work.fn = prepare_object_list;
> > > rw->work.done = finish_object_list;
> > > @@ -882,3 +889,11 @@ static void queue_recovery_work(struct recovery_info *rinfo)
> > >
> > > queue_work(sys->recovery_wqueue, &rw->work);
> > > }
> > > +
> > > +void fill_recovery_progress(struct recovery_progress *prog)
> > > +{
> > > + prog->state = state;
> > > + prog->nr_recovered_objects = uatomic_read(&nr_recovered_objects);
> > > + prog->nr_entire_checked_objects =
> > > + uatomic_read(&nr_entire_checked_objects);
> > > +}
> >
> > just ->nr_recovered_objects = rinfo->done isn't enough?
> > the same goes for nr_entire_checked_objects.
> >
> > global variables aren't necessary.
> >
> > Better name the function as get_recovery_info()
>
> If we make a constraint that fill_recovery_progress() (or
> get_recovery_info()) is only callable from main thread, the global
> variable isn't needed. I'll do so in the next version.
Since the filling recovery info is very light, so I think we can make this
constraint.
Thanks
Yuan
More information about the sheepdog
mailing list