[sheepdog] [PATCH V2 00/11] INTRODUCE

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Mon Aug 20 20:03:39 CEST 2012


At Mon, 20 Aug 2012 23:34:10 +0800,
Yunkai Zhang wrote:
> 
> In fact, I have thought this method, but we should face nearly the same problem:
> 
> After sheep joined back, it should known which objects is dirty, and
> should do the clear work(because there are old version object stay in
> it's working directory). This method seems not save the steps, but
> will do extra recovery works.

Can you give me a concrete example?

I created a really naive patch to disable object recovery with my
idea:

==
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 5164aa7..8bf032f 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -35,6 +35,7 @@ struct recovery_work {
 	uint64_t *oids;
 	uint64_t *prio_oids;
 	int nr_prio_oids;
+	int nr_scheduled_oids;
 
 	struct vnode_info *old_vinfo;
 	struct vnode_info *cur_vinfo;
@@ -269,9 +270,6 @@ static inline void prepare_schedule_oid(uint64_t oid)
 				oid);
 			return;
 		}
-	/* The oid is currently being recovered */
-	if (rw->oids[rw->done] == oid)
-		return;
 	rw->nr_prio_oids++;
 	rw->prio_oids = xrealloc(rw->prio_oids,
 				 rw->nr_prio_oids * sizeof(uint64_t));
@@ -399,9 +397,31 @@ static inline void finish_schedule_oids(struct recovery_work *rw)
 done:
 	free(rw->prio_oids);
 	rw->prio_oids = NULL;
+	rw->nr_scheduled_oids += rw->nr_prio_oids;
 	rw->nr_prio_oids = 0;
 }
 
+static struct timer recovery_timer;
+
+static void recover_next_object(void *arg)
+{
+	struct recovery_work *rw = arg;
+
+	if (rw->nr_prio_oids)
+		finish_schedule_oids(rw);
+
+	if (rw->done < rw->nr_scheduled_oids) {
+		/* Try recover next object */
+		queue_work(sys->recovery_wqueue, &rw->work);
+		return;
+	}
+
+	/* There are no objects to be recovered.  Try again later */
+	recovery_timer.callback = recover_next_object;
+	recovery_timer.data = rw;
+	add_timer(&recovery_timer, 1); /* FIXME */
+}
+
 static void recover_object_main(struct work *work)
 {
 	struct recovery_work *rw = container_of(work, struct recovery_work,
@@ -425,11 +445,7 @@ static void recover_object_main(struct work *work)
 	resume_wait_obj_requests(rw->oids[rw->done++]);
 
 	if (rw->done < rw->count) {
-		if (rw->nr_prio_oids)
-			finish_schedule_oids(rw);
-
-		/* Try recover next object */
-		queue_work(sys->recovery_wqueue, &rw->work);
+		recover_next_object(rw);
 		return;
 	}
 
@@ -458,7 +474,7 @@ static void finish_object_list(struct work *work)
 	resume_wait_recovery_requests();
 	rw->work.fn = recover_object_work;
 	rw->work.done = recover_object_main;
-	queue_work(sys->recovery_wqueue, &rw->work);
+	recover_next_object(rw);
 	return;
 }
 
==

I ran the following test, and object recovery was disabled correctly
in both the join and the leave cases.

==
#!/bin/bash
# Scenario: check that an object is not recovered until it is touched,
# first after a node leaves the cluster and then after it joins again.
# The md5sum calls show which store directories hold the object file and
# whether the copies agree.

# Start four sheep daemons, one per store directory /store/0 .. /store/3,
# each with its own -z value and -p 700<i>.
for i in 0 1 2 3; do
    ./sheep/sheep /store/$i -z $i -p 700$i -c local
done

# Presumably gives the daemons time to come up before formatting.
sleep 1
./collie/collie cluster format

./collie/collie vdi create test 4G

# NOTE(review): [0,1,2,3] is a glob bracket expression, so the commas are
# literal members of the set as well; it matches the same paths as [0-3]
# unless a directory named "," exists under /store.
echo " * objects will be created on node[0-2] *"
md5sum /store/[0,1,2,3]/obj/807c2b2500000000

# Leave case: kill the sheep that serves /store/1, then wait
# (presumably for the remaining nodes to notice the membership change).
pkill -f "./sheep/sheep /store/1"
sleep 3

# Store 1 is excluded from the glob because its daemon is gone.
echo " * recovery doesn't start until the object is touched *"
md5sum /store/[0,2,3]/obj/807c2b2500000000

./collie/collie vdi snapshot test  # invoke recovery of the vdi object
echo " * the object is recovered *"
md5sum /store/[0,2,3]/obj/807c2b2500000000

# Join case: restart the sheep for /store/1 with the same options as before.
./sheep/sheep /store/1 -z 1 -p 7001 -c local
sleep 3

echo " * recovery doesn't start until the object is touched *"
md5sum /store/[0,1,2,3]/obj/807c2b2500000000

# The listing is requested from port 7001, i.e. the node that just rejoined.
./collie/collie vdi list -p 7001  # invoke recovery of the vdi object
echo " * the object is recovered *"
md5sum /store/[0,1,2,3]/obj/807c2b2500000000
==

[Output]

using backend farm store
 * objects will be created on node[0-2] *
701e77eab6002c9a48f7ba72c8d9bfe9  /store/0/obj/807c2b2500000000
701e77eab6002c9a48f7ba72c8d9bfe9  /store/1/obj/807c2b2500000000
701e77eab6002c9a48f7ba72c8d9bfe9  /store/2/obj/807c2b2500000000
 * recovery doesn't start until the object is touched *
701e77eab6002c9a48f7ba72c8d9bfe9  /store/0/obj/807c2b2500000000
701e77eab6002c9a48f7ba72c8d9bfe9  /store/2/obj/807c2b2500000000
 * the object is recovered *
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/0/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/2/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/3/obj/807c2b2500000000
 * recovery doesn't start until the object is touched *
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/0/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/2/obj/807c2b2500000000
  Name        Id    Size    Used  Shared    Creation time   VDI id  Tag
s test         1  4.0 GB  0.0 MB  0.0 MB 2012-08-21 02:49   7c2b25  
  test         2  4.0 GB  0.0 MB  0.0 MB 2012-08-21 02:49   7c2b26  
 * the object is recovered *
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/0/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/1/obj/807c2b2500000000
3c3bf0d865363fd0d1f1d5c7aa044dcd  /store/2/obj/807c2b2500000000


At no point in this test was I able to read an old version of the object.

Thanks,

Kazutaka



More information about the sheepdog mailing list