[sheepdog] [PATCH v2] sheep: sheep aborted during startup, still joined in cluster

Ruoyu liangry at ucweb.com
Tue May 6 15:33:01 CEST 2014


Currently, create_cluster() is called before any thread is created.
Once any one of the following startup steps fails,
sheep calls exit() or abort() directly, so leave_cluster() is not called.
Other nodes would still consider that node to be alive.
This will cause many problems.

Below is a reproducible case using the journal file. A hot fix is also
submitted. But should we re-arrange the startup steps? Should we avoid
panic() because it is dangerous?

Steps:

1. Start three sheeps. Cluster manager is zookeeper.
for i in `seq 0 2`; do
    sheep/sheep /tmp/sd$i -y 127.0.0.1 -c zookeeper:127.0.0.1:2181 -z $i \
        -p 700$i -j size=64M -n
done

2. Format the cluster and create a vdi. Data object file 007c2b2500000000
is always written into sd1 according to sheepdog hash algorithm.
$ dog cluster format -c 1
$ dog vdi create test 4M -P

3. Write the vdi continuously.
for i in `seq 0 4194303`; do
    echo $i
    echo "a" | dog vdi write test $i 1
done

4. Kill all sheeps in another terminal during writing vdi.
$ killall sheep

5. Sometimes journal files are not cleaned up when sheeps exit.
If they are not found, try step 3 and step 4 again.
$ ls /tmp/sd*/journal*
/tmp/sd0/journal_file0  /tmp/sd0/journal_file1
/tmp/sd1/journal_file0  /tmp/sd1/journal_file1
/tmp/sd2/journal_file0  /tmp/sd2/journal_file1

6. Remove data object file to simulate WAL is finished
but data object file is not created.
$ rm /tmp/sd1/obj/007c2b2500000000

7. Start the three sheeps again. We find that sd0 and sd2 are up, but sd1 is down.

8. From the program log (sheep.log), we can see that the sheep process of sd1
has already aborted.

 INFO [main] md_add_disk(337) /tmp/sd1/obj, vdisk nr 261, total disk 1
 INFO [main] send_join_request(787) IPv4 ip:127.0.0.1 port:7001
 INFO [main] replay_journal_entry(159) /tmp/sd1/obj/007c2b2500000000, ...
ERROR [main] replay_journal_entry(166) open No such file or directory
EMERG [main] check_recover_journal_file(262)
    PANIC: recoverying from journal file (new) failed
EMERG [main] crash_handler(268) sheep exits unexpectedly (Aborted).
EMERG [main] sd_backtrace(833) sheep() [0x406157]
...

9. However, dog command shows the node is still in the cluster!

$ dog cluster info
Cluster status: running, auto-recovery enabled

Cluster created at Mon May  5 10:33:26 2014

Epoch Time           Version
2014-05-05 10:33:26      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]

$ dog node list
  Id   Host:Port         V-Nodes       Zone
   0   127.0.0.1:7000      	128          0
   1   127.0.0.1:7001      	128          1
   2   127.0.0.1:7002      	128          2

----------

Complement:

If we want to cleanup resources gracefully, more changes are needed.

1. Assign an exit code that stands for an exception; normally it is 1, not 0.
2. Adjust the cleanup steps; for example, the pid file is created later than
   the other initialization steps, so I think it should be cleaned up first.
3. Adjust some of the initialization steps.

Signed-off-by: Ruoyu <liangry at ucweb.com>
---
 sheep/journal.c | 29 ++++++++++++------
 sheep/sheep.c   | 95 ++++++++++++++++++++++++++++++++-------------------------
 2 files changed, 72 insertions(+), 52 deletions(-)

diff --git a/sheep/journal.c b/sheep/journal.c
index 57502b6..3c70c13 100644
--- a/sheep/journal.c
+++ b/sheep/journal.c
@@ -151,9 +151,11 @@ static int replay_journal_entry(struct journal_descriptor *jd)
 		return 0;
 	}
 
-	if (jd->flag != JF_STORE)
-		panic("flag is not JF_STORE, the journaling file is broken."
+	if (jd->flag != JF_STORE) {
+		sd_emerg("flag is not JF_STORE, the journaling file is broken."
 		      " please remove the journaling file and restart sheep daemon");
+		return -1;
+	}
 
 	sd_info("%s, size %" PRIu64 ", off %" PRIu64 ", %d", path, jd->size,
 		jd->offset, jd->create);
@@ -245,21 +247,27 @@ skip:
  * we actually only recover one jfile, the other would be empty. This process
  * is fast with buffered IO that only take several secends at most.
  */
-static void check_recover_journal_file(const char *p)
+static int check_recover_journal_file(const char *p)
 {
 	int old = 0, new = 0;
 
 	if (get_old_new_jfile(p, &old, &new) < 0)
-		return;
+		return -1;
 
 	/* No journal file found */
 	if (old == 0)
-		return;
+		return 0;
 
-	if (do_recover(old) < 0)
-		panic("recoverying from journal file (old) failed");
-	if (do_recover(new) < 0)
-		panic("recoverying from journal file (new) failed");
+	if (do_recover(old) < 0) {
+		sd_emerg("recoverying from journal file (old) failed");
+		return -1;
+	}
+	if (do_recover(new) < 0) {
+		sd_emerg("recoverying from journal file (new) failed");
+		return -1;
+	}
+
+	return 0;
 }
 
 int journal_file_init(const char *path, size_t size, bool skip)
@@ -267,7 +275,8 @@ int journal_file_init(const char *path, size_t size, bool skip)
 	int fd;
 
 	if (!skip)
-		check_recover_journal_file(path);
+		if (check_recover_journal_file(path) != 0)
+			return -1;
 
 	jfile_size = size / 2;
 
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 74d1aaf..405d0fe 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -611,7 +611,7 @@ static void sighup_handler(int signum)
 int main(int argc, char **argv)
 {
 	int ch, longindex, ret, port = SD_LISTEN_PORT, io_port = SD_LISTEN_PORT;
-	int nr_vnodes = SD_DEFAULT_VNODES;
+	int nr_vnodes = SD_DEFAULT_VNODES, rc = 1;
 	const char *dirp = DEFAULT_OBJECT_DIR, *short_options;
 	char *dir, *p, *pid_file = NULL, *bindaddr = NULL, log_path[PATH_MAX],
 	     *argp = NULL;
@@ -770,16 +770,6 @@ int main(int argc, char **argv)
 	if (ret)
 		exit(1);
 
-	ret = init_base_path(dirp);
-	if (ret)
-		exit(1);
-
-	dir = realpath(dirp, NULL);
-	if (!dir) {
-		sd_err("%m");
-		exit(1);
-	}
-
 	if (!strcmp(log_dst, "default"))
 		log_dst_type = LOG_DST_DEFAULT;
 	else if (!strcmp(log_dst, "stdout"))
@@ -813,6 +803,16 @@ int main(int argc, char **argv)
 		}
 	}
 
+	ret = init_base_path(dirp);
+	if (ret)
+		exit(1);
+
+	dir = realpath(dirp, NULL);
+	if (!dir) {
+		sd_err("%m");
+		exit(1);
+	}
+
 	snprintf(log_path, sizeof(log_path), "%s/" LOG_FILE_NAME,
 		 logdir ?: dir);
 
@@ -820,52 +820,56 @@ int main(int argc, char **argv)
 
 	srandom(port);
 
-	if (lock_and_daemon(log_dst_type != LOG_DST_STDOUT, dir))
-		exit(1);
+	if (lock_and_daemon(log_dst_type != LOG_DST_STDOUT, dir)) {
+		free(argp);
+		goto cleanup_dir;
+	}
 
 	ret = log_init(program_name, log_dst_type, log_level, log_path);
-	if (ret)
-		exit(1);
-
-	ret = init_event(EPOLL_SIZE);
-	if (ret)
-		exit(1);
+	if (ret) {
+		free(argp);
+		goto cleanup_dir;
+	}
 
 	ret = init_global_pathnames(dir, argp);
 	free(argp);
 	if (ret)
-		exit(1);
+		goto cleanup_log;
+
+	ret = init_event(EPOLL_SIZE);
+	if (ret)
+		goto cleanup_log;
 
 	ret = init_config_file();
 	if (ret)
-		exit(1);
+		goto cleanup_log;
 
 	ret = create_listen_port(bindaddr, port);
 	if (ret)
-		exit(1);
+		goto cleanup_log;
 
 	if (io_addr && create_listen_port(io_addr, io_port))
-		exit(1);
+		goto cleanup_log;
 
 	ret = init_unix_domain_socket(dir);
 	if (ret)
-		exit(1);
+		goto cleanup_log;
 
 	local_request_init();
 
 	ret = init_signal();
 	if (ret)
-		exit(1);
+		goto cleanup_log;
 
 	/* This function must be called before create_cluster() */
 	ret = init_disk_space(dir);
 	if (ret)
-		exit(1);
+		goto cleanup_log;
 
 	ret = create_cluster(port, zone, nr_vnodes, explicit_addr);
 	if (ret) {
 		sd_err("failed to create sheepdog cluster");
-		exit(1);
+		goto cleanup_log;
 	}
 
 	/* We should init journal file before backend init */
@@ -876,7 +880,7 @@ int main(int argc, char **argv)
 		sd_debug("%s, %"PRIu64", %d", jpath, jsize, jskip);
 		ret = journal_file_init(jpath, jsize, jskip);
 		if (ret)
-			exit(1);
+			goto cleanup_cluster;
 	}
 
 	init_fec();
@@ -890,15 +894,15 @@ int main(int argc, char **argv)
 	 */
 	ret = create_work_queues();
 	if (ret)
-		exit(1);
+		goto cleanup_journal;
 
 	ret = sockfd_init();
 	if (ret)
-		exit(1);
+		goto cleanup_journal;
 
 	ret = init_store_driver(sys->gateway_only);
 	if (ret)
-		exit(1);
+		goto cleanup_journal;
 
 	if (sys->enable_object_cache) {
 		if (!strlen(ocpath))
@@ -906,31 +910,30 @@ int main(int argc, char **argv)
 			memcpy(ocpath, dir, strlen(dir));
 		ret = object_cache_init(ocpath);
 		if (ret)
-			exit(1);
+			goto cleanup_journal;
 	}
 
 	ret = trace_init();
 	if (ret)
-		exit(1);
+		goto cleanup_journal;
 
 	if (http_options && http_init(http_options) != 0)
-		exit(1);
+		goto cleanup_journal;
 
 	ret = nfs_init(NULL);
 	if (ret)
-		exit(1);
+		goto cleanup_journal;
 
 	if (pid_file && (create_pidfile(pid_file) != 0)) {
 		sd_err("failed to pid file '%s' - %m", pid_file);
-		exit(1);
+		goto cleanup_journal;
 	}
 
 	if (chdir(dir) < 0) {
 		sd_err("failed to chdir to %s: %m", dir);
-		exit(1);
+		goto cleanup_pid_file;
 	}
 
-	free(dir);
 	check_host_env();
 	sd_info("sheepdog daemon (version %s) started", PACKAGE_VERSION);
 
@@ -939,19 +942,27 @@ int main(int argc, char **argv)
 		sys->cinfo.status != SD_STATUS_SHUTDOWN))
 		event_loop(-1);
 
+	rc = 0;
 	sd_info("shutdown");
 
-	leave_cluster();
+cleanup_pid_file:
+	if (pid_file)
+		unlink(pid_file);
 
+cleanup_journal:
 	if (uatomic_is_true(&sys->use_journal)) {
 		sd_info("cleaning journal file");
 		clean_journal_file(jpath);
 	}
 
+cleanup_cluster:
+	leave_cluster();
+
+cleanup_log:
 	log_close();
 
-	if (pid_file)
-		unlink(pid_file);
+cleanup_dir:
+	free(dir);
 
-	return 0;
+	return rc;
 }
-- 
1.8.3.2





More information about the sheepdog mailing list