[sheepdog] [PATCH v2] sheep: sheep aborted during startup, still joined in cluster

Fri May 16 05:49:17 CEST 2014

On Tue, May 06, 2014 at 09:33:01PM +0800, Ruoyu wrote:
> Currently, create_cluster() is called before any thread is created.
> If any of the subsequent startup steps fails, sheep calls exit() or
> abort() directly, so leave_cluster() is never called.
> The other nodes then consider that node to be still alive.
> This causes many problems.
> 
> This is a reproducible case that uses the journal file. A hot fix is also
> submitted. But should we re-arrange the startup steps? Should we avoid
> panic() because it is dangerous?
> 
> Steps:
> 
> 1. Start three sheeps. Cluster manager is zookeeper.
> for i in `seq 0 2`; do
>     sheep/sheep /tmp/sd$i -y 127.0.0.1 -c zookeeper:127.0.0.1:2181 -z $i \
>         -p 700$i -j size=64M -n
> done
> 
> 2. Format the cluster and create a vdi. Data object file 007c2b2500000000
> is always written into sd1 according to sheepdog hash algorithm.
> $ dog cluster format -c 1
> $ dog vdi create test 4M -P
> 
> 3. Write the vdi continuously.
> for i in `seq 0 4194303`; do
>     echo $i
>     echo "a" | dog vdi write test $i 1
> done
> 
> 4. Kill all sheeps from another terminal while the vdi is being written.
> $ killall sheep
> 
> 5. Sometimes journal files are not cleaned up when sheeps exit.
> If they are not found, try step 3 and step 4 again.
> $ ls /tmp/sd*/journal*
> /tmp/sd0/journal_file0  /tmp/sd0/journal_file1
> /tmp/sd1/journal_file0  /tmp/sd1/journal_file1
> /tmp/sd2/journal_file0  /tmp/sd2/journal_file1
> 
> 6. Remove the data object file to simulate the case where the WAL has
> finished but the data object file has not been created.
> $ rm /tmp/sd1/obj/007c2b2500000000
> 
> 7. Start the three sheeps again. We find that sd0 and sd2 are up, but sd1 is down.
> 
> 8. From the program log (sheep.log), we can see that the sheep process of sd1
> has already aborted.
> 
>  INFO [main] md_add_disk(337) /tmp/sd1/obj, vdisk nr 261, total disk 1
>  INFO [main] send_join_request(787) IPv4 ip:127.0.0.1 port:7001
>  INFO [main] replay_journal_entry(159) /tmp/sd1/obj/007c2b2500000000, ...
> ERROR [main] replay_journal_entry(166) open No such file or directory
> EMERG [main] check_recover_journal_file(262)
>     PANIC: recoverying from journal file (new) failed
> EMERG [main] crash_handler(268) sheep exits unexpectedly (Aborted).
> EMERG [main] sd_backtrace(833) sheep() [0x406157]
> ...
> 
> 9. However, the dog command shows that the node is still in the cluster!
> 
> $ dog cluster info
> Cluster status: running, auto-recovery enabled
> 
> Cluster created at Mon May  5 10:33:26 2014
> 
> Epoch Time           Version
> 2014-05-05 10:33:26      1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
> 
> $ dog node list
>   Id   Host:Port         V-Nodes       Zone
>    0   127.0.0.1:7000      	128          0
>    1   127.0.0.1:7001      	128          1
>    2   127.0.0.1:7002      	128          2
> 
> ----------
> 
> Complement:
> 
> If we want to clean up resources gracefully, more changes are needed.
> 
> 1. Assign an exit code that indicates failure; normally it is 1, not 0.
> 2. Adjust the order of the cleanup steps. For example, the pid file is created
>    after the other initialization steps, so I think it should be cleaned up first.
> 3. Adjust some of the initialization steps.
> 
> Signed-off-by: Ruoyu <liangry at ucweb.com>
> ---
>  sheep/journal.c | 29 ++++++++++++------
>  sheep/sheep.c   | 95 ++++++++++++++++++++++++++++++++-------------------------
>  2 files changed, 72 insertions(+), 52 deletions(-)
> 
> diff --git a/sheep/journal.c b/sheep/journal.c
> index 57502b6..3c70c13 100644
> --- a/sheep/journal.c
> +++ b/sheep/journal.c
> @@ -151,9 +151,11 @@ static int replay_journal_entry(struct journal_descriptor *jd)
>  		return 0;
>  	}
>  
> -	if (jd->flag != JF_STORE)
> -		panic("flag is not JF_STORE, the journaling file is broken."
> +	if (jd->flag != JF_STORE) {
> +		sd_emerg("flag is not JF_STORE, the journaling file is broken."
>  		      " please remove the journaling file and restart sheep daemon");
> +		return -1;
> +	}
>  
>  	sd_info("%s, size %" PRIu64 ", off %" PRIu64 ", %d", path, jd->size,
>  		jd->offset, jd->create);
> @@ -245,21 +247,27 @@ skip:
>   * we actually only recover one jfile, the other would be empty. This process
>   * is fast with buffered IO that only take several secends at most.
>   */
> -static void check_recover_journal_file(const char *p)
> +static int check_recover_journal_file(const char *p)
>  {
>  	int old = 0, new = 0;
>  
>  	if (get_old_new_jfile(p, &old, &new) < 0)
> -		return;
> +		return -1;
>  
>  	/* No journal file found */
>  	if (old == 0)
> -		return;
> +		return 0;
>  
> -	if (do_recover(old) < 0)
> -		panic("recoverying from journal file (old) failed");
> -	if (do_recover(new) < 0)
> -		panic("recoverying from journal file (new) failed");
> +	if (do_recover(old) < 0) {
> +		sd_emerg("recoverying from journal file (old) failed");
> +		return -1;
> +	}
> +	if (do_recover(new) < 0) {
> +		sd_emerg("recoverying from journal file (new) failed");
> +		return -1;
> +	}
> +
> +	return 0;
>  }
>  
>  int journal_file_init(const char *path, size_t size, bool skip)
> @@ -267,7 +275,8 @@ int journal_file_init(const char *path, size_t size, bool skip)
>  	int fd;
>  
>  	if (!skip)
> -		check_recover_journal_file(path);
> +		if (check_recover_journal_file(path) != 0)
> +			return -1;
>  
>  	jfile_size = size / 2;
>  
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index 74d1aaf..405d0fe 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -611,7 +611,7 @@ static void sighup_handler(int signum)
>  int main(int argc, char **argv)
>  {
>  	int ch, longindex, ret, port = SD_LISTEN_PORT, io_port = SD_LISTEN_PORT;
> -	int nr_vnodes = SD_DEFAULT_VNODES;
> +	int nr_vnodes = SD_DEFAULT_VNODES, rc = 1;
>  	const char *dirp = DEFAULT_OBJECT_DIR, *short_options;
>  	char *dir, *p, *pid_file = NULL, *bindaddr = NULL, log_path[PATH_MAX],
>  	     *argp = NULL;
> @@ -770,16 +770,6 @@ int main(int argc, char **argv)
>  	if (ret)
>  		exit(1);
>  
> -	ret = init_base_path(dirp);
> -	if (ret)
> -		exit(1);
> -
> -	dir = realpath(dirp, NULL);
> -	if (!dir) {
> -		sd_err("%m");
> -		exit(1);
> -	}
> -
>  	if (!strcmp(log_dst, "default"))
>  		log_dst_type = LOG_DST_DEFAULT;
>  	else if (!strcmp(log_dst, "stdout"))
> @@ -813,6 +803,16 @@ int main(int argc, char **argv)
>  		}
>  	}
>  
> +	ret = init_base_path(dirp);
> +	if (ret)
> +		exit(1);
> +
> +	dir = realpath(dirp, NULL);
> +	if (!dir) {
> +		sd_err("%m");
> +		exit(1);
> +	}
> +
>  	snprintf(log_path, sizeof(log_path), "%s/" LOG_FILE_NAME,
>  		 logdir ?: dir);
>  
> @@ -820,52 +820,56 @@ int main(int argc, char **argv)
>  
>  	srandom(port);
>  
> -	if (lock_and_daemon(log_dst_type != LOG_DST_STDOUT, dir))
> -		exit(1);
> +	if (lock_and_daemon(log_dst_type != LOG_DST_STDOUT, dir)) {
> +		free(argp);
> +		goto cleanup_dir;
> +	}
>  
>  	ret = log_init(program_name, log_dst_type, log_level, log_path);
> -	if (ret)
> -		exit(1);
> -
> -	ret = init_event(EPOLL_SIZE);
> -	if (ret)
> -		exit(1);
> +	if (ret) {
> +		free(argp);
> +		goto cleanup_dir;
> +	}
>  
>  	ret = init_global_pathnames(dir, argp);
>  	free(argp);
>  	if (ret)
> -		exit(1);
> +		goto cleanup_log;
> +
> +	ret = init_event(EPOLL_SIZE);
> +	if (ret)
> +		goto cleanup_log;
>  
>  	ret = init_config_file();
>  	if (ret)
> -		exit(1);
> +		goto cleanup_log;
>  
>  	ret = create_listen_port(bindaddr, port);
>  	if (ret)
> -		exit(1);
> +		goto cleanup_log;
>  
>  	if (io_addr && create_listen_port(io_addr, io_port))
> -		exit(1);
> +		goto cleanup_log;
>  
>  	ret = init_unix_domain_socket(dir);
>  	if (ret)
> -		exit(1);
> +		goto cleanup_log;
>  
>  	local_request_init();
>  
>  	ret = init_signal();
>  	if (ret)
> -		exit(1);
> +		goto cleanup_log;
>  
>  	/* This function must be called before create_cluster() */
>  	ret = init_disk_space(dir);
>  	if (ret)
> -		exit(1);
> +		goto cleanup_log;
>  
>  	ret = create_cluster(port, zone, nr_vnodes, explicit_addr);
>  	if (ret) {
>  		sd_err("failed to create sheepdog cluster");
> -		exit(1);
> +		goto cleanup_log;
>  	}
>  
>  	/* We should init journal file before backend init */
> @@ -876,7 +880,7 @@ int main(int argc, char **argv)
>  		sd_debug("%s, %"PRIu64", %d", jpath, jsize, jskip);
>  		ret = journal_file_init(jpath, jsize, jskip);
>  		if (ret)
> -			exit(1);
> +			goto cleanup_cluster;
>  	}
>  
>  	init_fec();
> @@ -890,15 +894,15 @@ int main(int argc, char **argv)
>  	 */
>  	ret = create_work_queues();
>  	if (ret)
> -		exit(1);
> +		goto cleanup_journal;
>  
>  	ret = sockfd_init();
>  	if (ret)
> -		exit(1);
> +		goto cleanup_journal;
>  
>  	ret = init_store_driver(sys->gateway_only);
>  	if (ret)
> -		exit(1);
> +		goto cleanup_journal;
>  
>  	if (sys->enable_object_cache) {
>  		if (!strlen(ocpath))
> @@ -906,31 +910,30 @@ int main(int argc, char **argv)
>  			memcpy(ocpath, dir, strlen(dir));
>  		ret = object_cache_init(ocpath);
>  		if (ret)
> -			exit(1);
> +			goto cleanup_journal;
>  	}
>  
>  	ret = trace_init();
>  	if (ret)
> -		exit(1);
> +		goto cleanup_journal;
>  
>  	if (http_options && http_init(http_options) != 0)
> -		exit(1);
> +		goto cleanup_journal;
>  
>  	ret = nfs_init(NULL);
>  	if (ret)
> -		exit(1);
> +		goto cleanup_journal;
>  
>  	if (pid_file && (create_pidfile(pid_file) != 0)) {
>  		sd_err("failed to pid file '%s' - %m", pid_file);
> -		exit(1);
> +		goto cleanup_journal;
>  	}
>  
>  	if (chdir(dir) < 0) {
>  		sd_err("failed to chdir to %s: %m", dir);
> -		exit(1);
> +		goto cleanup_pid_file;
>  	}
>  
> -	free(dir);
>  	check_host_env();
>  	sd_info("sheepdog daemon (version %s) started", PACKAGE_VERSION);
>  
> @@ -939,19 +942,27 @@ int main(int argc, char **argv)
>  		sys->cinfo.status != SD_STATUS_SHUTDOWN))
>  		event_loop(-1);
>  
> +	rc = 0;
>  	sd_info("shutdown");
>  
> -	leave_cluster();
> +cleanup_pid_file:
> +	if (pid_file)
> +		unlink(pid_file);
>  
> +cleanup_journal:
>  	if (uatomic_is_true(&sys->use_journal)) {
>  		sd_info("cleaning journal file");
>  		clean_journal_file(jpath);
>  	}
>  
> +cleanup_cluster:
> +	leave_cluster();
> +
> +cleanup_log:
>  	log_close();
>  
> -	if (pid_file)
> -		unlink(pid_file);
> +cleanup_dir:
> +	free(dir);
>  
> -	return 0;
> +	return rc;
>  }
> -- 
> 1.8.3.2
> 
> 
> -- 
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog

Applied, thanks.

Yuan