[sheepdog] [PATCH v2] sheep: sheep aborted during startup, still joined in cluster
Liu Yuan
namei.unix at gmail.com
Fri May 16 05:49:17 CEST 2014
On Tue, May 06, 2014 at 09:33:01PM +0800, Ruoyu wrote:
> Currently, create_cluster() is called before any thread is created.
> If any of the subsequent startup steps fails,
> sheep calls exit() or abort() directly, so leave_cluster() is never called.
> Other nodes then still consider that node to be alive.
> This will cause many problems.
>
> This is a reproducible case using a journal file. A hot fix is also submitted.
> But should we rearrange the startup steps? Should we avoid panic()
> because it is dangerous?
>
> Steps:
>
> 1. Start three sheeps. Cluster manager is zookeeper.
> for i in `seq 0 2`; do
> sheep/sheep /tmp/sd$i -y 127.0.0.1 -c zookeeper:127.0.0.1:2181 -z $i \
> -p 700$i -j size=64M -n
> done
>
> 2. Format the cluster and create a vdi. The data object file 007c2b2500000000
> is always written into sd1 according to the sheepdog hash algorithm.
> $ dog cluster format -c 1
> $ dog vdi create test 4M -P
>
> 3. Write the vdi continuously.
> for i in `seq 0 4194303`; do
> echo $i
> echo "a" | dog vdi write test $i 1
> done
>
> 4. Kill all sheeps in another terminal during writing vdi.
> $ killall sheep
>
> 5. Sometimes journal files are not cleaned up when sheeps exit.
> If they are not found, try step 3 and step 4 again.
> $ ls /tmp/sd*/journal*
> /tmp/sd0/journal_file0 /tmp/sd0/journal_file1
> /tmp/sd1/journal_file0 /tmp/sd1/journal_file1
> /tmp/sd2/journal_file0 /tmp/sd2/journal_file1
>
> 6. Remove the data object file to simulate the case where the WAL is finished
> but the data object file has not been created.
> $ rm /tmp/sd1/obj/007c2b2500000000
>
> 7. Start the three sheeps again. We find that sd0 and sd2 are up, but sd1 is down.
>
> 8. From the program log (sheep.log), we can see that the sheep process of sd1
> has already aborted.
>
> INFO [main] md_add_disk(337) /tmp/sd1/obj, vdisk nr 261, total disk 1
> INFO [main] send_join_request(787) IPv4 ip:127.0.0.1 port:7001
> INFO [main] replay_journal_entry(159) /tmp/sd1/obj/007c2b2500000000, ...
> ERROR [main] replay_journal_entry(166) open No such file or directory
> EMERG [main] check_recover_journal_file(262)
> PANIC: recoverying from journal file (new) failed
> EMERG [main] crash_handler(268) sheep exits unexpectedly (Aborted).
> EMERG [main] sd_backtrace(833) sheep() [0x406157]
> ...
>
> 9. However, the dog command shows the node is still in the cluster!
>
> $ dog cluster info
> Cluster status: running, auto-recovery enabled
>
> Cluster created at Mon May 5 10:33:26 2014
>
> Epoch Time Version
> 2014-05-05 10:33:26 1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002]
>
> $ dog node list
> Id Host:Port V-Nodes Zone
> 0 127.0.0.1:7000 128 0
> 1 127.0.0.1:7001 128 1
> 2 127.0.0.1:7002 128 2
>
> ----------
>
> Complement:
>
> If we want to clean up resources gracefully, more changes are needed.
>
> 1. Assign an exit code that indicates failure; normally it is 1, not 0.
> 2. Adjust the cleanup steps. For example, the pid file is created later than
> the other initial steps, so I think it should be cleaned up first.
> 3. Adjust some initial steps.
>
> Signed-off-by: Ruoyu <liangry at ucweb.com>
> ---
> sheep/journal.c | 29 ++++++++++++------
> sheep/sheep.c | 95 ++++++++++++++++++++++++++++++++-------------------------
> 2 files changed, 72 insertions(+), 52 deletions(-)
>
> diff --git a/sheep/journal.c b/sheep/journal.c
> index 57502b6..3c70c13 100644
> --- a/sheep/journal.c
> +++ b/sheep/journal.c
> @@ -151,9 +151,11 @@ static int replay_journal_entry(struct journal_descriptor *jd)
> return 0;
> }
>
> - if (jd->flag != JF_STORE)
> - panic("flag is not JF_STORE, the journaling file is broken."
> + if (jd->flag != JF_STORE) {
> + sd_emerg("flag is not JF_STORE, the journaling file is broken."
> " please remove the journaling file and restart sheep daemon");
> + return -1;
> + }
>
> sd_info("%s, size %" PRIu64 ", off %" PRIu64 ", %d", path, jd->size,
> jd->offset, jd->create);
> @@ -245,21 +247,27 @@ skip:
> * we actually only recover one jfile, the other would be empty. This process
> * is fast with buffered IO that only take several secends at most.
> */
> -static void check_recover_journal_file(const char *p)
> +static int check_recover_journal_file(const char *p)
> {
> int old = 0, new = 0;
>
> if (get_old_new_jfile(p, &old, &new) < 0)
> - return;
> + return -1;
>
> /* No journal file found */
> if (old == 0)
> - return;
> + return 0;
>
> - if (do_recover(old) < 0)
> - panic("recoverying from journal file (old) failed");
> - if (do_recover(new) < 0)
> - panic("recoverying from journal file (new) failed");
> + if (do_recover(old) < 0) {
> + sd_emerg("recoverying from journal file (old) failed");
> + return -1;
> + }
> + if (do_recover(new) < 0) {
> + sd_emerg("recoverying from journal file (new) failed");
> + return -1;
> + }
> +
> + return 0;
> }
>
> int journal_file_init(const char *path, size_t size, bool skip)
> @@ -267,7 +275,8 @@ int journal_file_init(const char *path, size_t size, bool skip)
> int fd;
>
> if (!skip)
> - check_recover_journal_file(path);
> + if (check_recover_journal_file(path) != 0)
> + return -1;
>
> jfile_size = size / 2;
>
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index 74d1aaf..405d0fe 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -611,7 +611,7 @@ static void sighup_handler(int signum)
> int main(int argc, char **argv)
> {
> int ch, longindex, ret, port = SD_LISTEN_PORT, io_port = SD_LISTEN_PORT;
> - int nr_vnodes = SD_DEFAULT_VNODES;
> + int nr_vnodes = SD_DEFAULT_VNODES, rc = 1;
> const char *dirp = DEFAULT_OBJECT_DIR, *short_options;
> char *dir, *p, *pid_file = NULL, *bindaddr = NULL, log_path[PATH_MAX],
> *argp = NULL;
> @@ -770,16 +770,6 @@ int main(int argc, char **argv)
> if (ret)
> exit(1);
>
> - ret = init_base_path(dirp);
> - if (ret)
> - exit(1);
> -
> - dir = realpath(dirp, NULL);
> - if (!dir) {
> - sd_err("%m");
> - exit(1);
> - }
> -
> if (!strcmp(log_dst, "default"))
> log_dst_type = LOG_DST_DEFAULT;
> else if (!strcmp(log_dst, "stdout"))
> @@ -813,6 +803,16 @@ int main(int argc, char **argv)
> }
> }
>
> + ret = init_base_path(dirp);
> + if (ret)
> + exit(1);
> +
> + dir = realpath(dirp, NULL);
> + if (!dir) {
> + sd_err("%m");
> + exit(1);
> + }
> +
> snprintf(log_path, sizeof(log_path), "%s/" LOG_FILE_NAME,
> logdir ?: dir);
>
> @@ -820,52 +820,56 @@ int main(int argc, char **argv)
>
> srandom(port);
>
> - if (lock_and_daemon(log_dst_type != LOG_DST_STDOUT, dir))
> - exit(1);
> + if (lock_and_daemon(log_dst_type != LOG_DST_STDOUT, dir)) {
> + free(argp);
> + goto cleanup_dir;
> + }
>
> ret = log_init(program_name, log_dst_type, log_level, log_path);
> - if (ret)
> - exit(1);
> -
> - ret = init_event(EPOLL_SIZE);
> - if (ret)
> - exit(1);
> + if (ret) {
> + free(argp);
> + goto cleanup_dir;
> + }
>
> ret = init_global_pathnames(dir, argp);
> free(argp);
> if (ret)
> - exit(1);
> + goto cleanup_log;
> +
> + ret = init_event(EPOLL_SIZE);
> + if (ret)
> + goto cleanup_log;
>
> ret = init_config_file();
> if (ret)
> - exit(1);
> + goto cleanup_log;
>
> ret = create_listen_port(bindaddr, port);
> if (ret)
> - exit(1);
> + goto cleanup_log;
>
> if (io_addr && create_listen_port(io_addr, io_port))
> - exit(1);
> + goto cleanup_log;
>
> ret = init_unix_domain_socket(dir);
> if (ret)
> - exit(1);
> + goto cleanup_log;
>
> local_request_init();
>
> ret = init_signal();
> if (ret)
> - exit(1);
> + goto cleanup_log;
>
> /* This function must be called before create_cluster() */
> ret = init_disk_space(dir);
> if (ret)
> - exit(1);
> + goto cleanup_log;
>
> ret = create_cluster(port, zone, nr_vnodes, explicit_addr);
> if (ret) {
> sd_err("failed to create sheepdog cluster");
> - exit(1);
> + goto cleanup_log;
> }
>
> /* We should init journal file before backend init */
> @@ -876,7 +880,7 @@ int main(int argc, char **argv)
> sd_debug("%s, %"PRIu64", %d", jpath, jsize, jskip);
> ret = journal_file_init(jpath, jsize, jskip);
> if (ret)
> - exit(1);
> + goto cleanup_cluster;
> }
>
> init_fec();
> @@ -890,15 +894,15 @@ int main(int argc, char **argv)
> */
> ret = create_work_queues();
> if (ret)
> - exit(1);
> + goto cleanup_journal;
>
> ret = sockfd_init();
> if (ret)
> - exit(1);
> + goto cleanup_journal;
>
> ret = init_store_driver(sys->gateway_only);
> if (ret)
> - exit(1);
> + goto cleanup_journal;
>
> if (sys->enable_object_cache) {
> if (!strlen(ocpath))
> @@ -906,31 +910,30 @@ int main(int argc, char **argv)
> memcpy(ocpath, dir, strlen(dir));
> ret = object_cache_init(ocpath);
> if (ret)
> - exit(1);
> + goto cleanup_journal;
> }
>
> ret = trace_init();
> if (ret)
> - exit(1);
> + goto cleanup_journal;
>
> if (http_options && http_init(http_options) != 0)
> - exit(1);
> + goto cleanup_journal;
>
> ret = nfs_init(NULL);
> if (ret)
> - exit(1);
> + goto cleanup_journal;
>
> if (pid_file && (create_pidfile(pid_file) != 0)) {
> sd_err("failed to pid file '%s' - %m", pid_file);
> - exit(1);
> + goto cleanup_journal;
> }
>
> if (chdir(dir) < 0) {
> sd_err("failed to chdir to %s: %m", dir);
> - exit(1);
> + goto cleanup_pid_file;
> }
>
> - free(dir);
> check_host_env();
> sd_info("sheepdog daemon (version %s) started", PACKAGE_VERSION);
>
> @@ -939,19 +942,27 @@ int main(int argc, char **argv)
> sys->cinfo.status != SD_STATUS_SHUTDOWN))
> event_loop(-1);
>
> + rc = 0;
> sd_info("shutdown");
>
> - leave_cluster();
> +cleanup_pid_file:
> + if (pid_file)
> + unlink(pid_file);
>
> +cleanup_journal:
> if (uatomic_is_true(&sys->use_journal)) {
> sd_info("cleaning journal file");
> clean_journal_file(jpath);
> }
>
> +cleanup_cluster:
> + leave_cluster();
> +
> +cleanup_log:
> log_close();
>
> - if (pid_file)
> - unlink(pid_file);
> +cleanup_dir:
> + free(dir);
>
> - return 0;
> + return rc;
> }
> --
> 1.8.3.2
>
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
Applied, thanks.
Yuan
More information about the sheepdog
mailing list