[sheepdog] [PATCH 1/3] sheep: split cluster_info into system_info and cluster_info
MORITA Kazutaka
morita.kazutaka at gmail.com
Wed Jul 3 08:49:21 CEST 2013
From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
The current cluster_info contains two different things:
- The global variables which are specific to the local sheep.
- The cluster-wide variables which must be kept in the same state
among existing sheep nodes (e.g. epoch).
This patch moves most of the fields of cluster_info into system_info.
After this patch, cluster_info only contains the cluster-wide variables.
The join process will change a bit. The joining sheep has to send its
cluster_info in the join_message and will receive the latest cluster_info
from the master node. If the joining node sends an invalid
cluster_info, the node will fail to join.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
include/internal_proto.h | 34 ++++++----
sheep/group.c | 158 ++++++++++++++++++++++------------------------
sheep/ops.c | 40 ++++++------
sheep/recovery.c | 7 +-
sheep/request.c | 24 +++----
sheep/sheep.c | 4 +-
sheep/sheep_priv.h | 11 ++--
sheep/store.c | 4 +-
sheep/vdi.c | 2 +-
9 files changed, 140 insertions(+), 144 deletions(-)
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 24d5519..f0c55c0 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -129,6 +129,20 @@ struct sd_node {
uint64_t space;
};
+struct cluster_info {
+ uint8_t nr_copies;
+ uint8_t disable_recovery;
+ int16_t nr_nodes;
+ uint32_t epoch;
+ uint64_t ctime;
+ uint16_t flags;
+ uint16_t __pad[3];
+ uint8_t store[STORE_LEN];
+
+ /* node list at cluster_info->epoch */
+ struct sd_node nodes[SD_MAX_NODES];
+};
+
struct epoch_log {
uint64_t ctime;
uint64_t time; /* treated as time_t */
@@ -141,27 +155,19 @@ struct epoch_log {
struct join_message {
uint8_t proto_ver;
- uint8_t nr_copies;
- int16_t nr_nodes;
+ uint8_t __pad1[3];
uint16_t nr_failed_nodes;
uint16_t nr_delayed_nodes;
uint32_t cluster_status;
- uint32_t epoch;
- uint64_t ctime;
uint8_t inc_epoch; /* set non-zero when we increment epoch of all nodes */
- uint8_t disable_recovery;
- uint16_t cluster_flags;
- uint32_t __pad;
- uint8_t store[STORE_LEN];
+ uint8_t __pad2[3];
/*
- * A joining sheep puts the local node list here, which is nr_nodes
- * entries long. After the master replies it will contain the list of
- * nodes that attempted to join but failed the join process. The
- * number of entries in that case is nr_failed_nodes, which by
- * defintion must be smaller than nr_nodes.
+ * A joining sheep puts the local cluster info here. After the master
+ * replies it will contain the latest cluster info which is shared among
+ * the existing nodes.
*/
- struct sd_node nodes[];
+ struct cluster_info cinfo;
};
struct vdi_op_message {
diff --git a/sheep/group.c b/sheep/group.c
index 1b1cea5..4c0cc42 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -62,15 +62,6 @@ static main_thread(struct list_head *) failed_nodes;
*/
static main_thread(struct list_head *) delayed_nodes;
-static size_t get_join_message_size(struct join_message *jm)
-{
- /*
- * jm->nr_nodes is guaranteed to be larger than jm->nr_failed_nodes,
- * so it is safe to unconditionally use jm->nr_nodes here.
- */
- return sizeof(*jm) + jm->nr_nodes * sizeof(jm->nodes[0]);
-}
-
static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes)
{
int nr_zones = 0, i, j;
@@ -104,7 +95,7 @@ bool have_enough_zones(void)
int max_copies;
struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info);
- if (sys->flags & SD_FLAG_NOHALT)
+ if (sys->cinfo.flags & SD_FLAG_NOHALT)
return true;
if (!cur_vinfo)
@@ -113,12 +104,12 @@ bool have_enough_zones(void)
max_copies = get_max_copy_number();
sd_dprintf("flags %d, nr_zones %d, min copies %d",
- sys->flags, cur_vinfo->nr_zones, max_copies);
+ sys->cinfo.flags, cur_vinfo->nr_zones, max_copies);
if (!cur_vinfo->nr_zones)
return false;
- if (sys->flags & SD_FLAG_QUORUM) {
+ if (sys->cinfo.flags & SD_FLAG_QUORUM) {
if (cur_vinfo->nr_zones > (max_copies/2))
return true;
} else {
@@ -433,9 +424,9 @@ static void update_exceptional_node_list(uint32_t epoch,
int i;
for (i = 0; i < jm->nr_failed_nodes; i++)
- add_failed_node(epoch, &jm->nodes[i]);
+ add_failed_node(epoch, &jm->cinfo.nodes[i]);
for ( ; i < jm->nr_failed_nodes + jm->nr_delayed_nodes; i++)
- add_delayed_node(epoch, &jm->nodes[i]);
+ add_delayed_node(epoch, &jm->cinfo.nodes[i]);
}
/* Format the lists of failed or delayed nodes into the join message. */
@@ -444,9 +435,10 @@ static void format_exceptional_node_list(struct join_message *jm)
struct node *n;
list_for_each_entry(n, main_thread_get(failed_nodes), list)
- jm->nodes[jm->nr_failed_nodes++] = n->ent;
+ jm->cinfo.nodes[jm->nr_failed_nodes++] = n->ent;
list_for_each_entry(n, main_thread_get(delayed_nodes), list)
- jm->nodes[jm->nr_failed_nodes + jm->nr_delayed_nodes++] = n->ent;
+ jm->cinfo.nodes[jm->nr_failed_nodes +
+ jm->nr_delayed_nodes++] = n->ent;
}
static void clear_exceptional_node_lists(void)
@@ -513,27 +505,27 @@ static int cluster_sanity_check(struct join_message *jm)
return CJ_RES_FAIL;
}
- if (jm->ctime != local_ctime) {
+ if (jm->cinfo.ctime != local_ctime) {
sd_eprintf("joining node ctime doesn't match: %"
- PRIu64 " vs %" PRIu64, jm->ctime, local_ctime);
+ PRIu64 " vs %" PRIu64, jm->cinfo.ctime, local_ctime);
return CJ_RES_FAIL;
}
- if (jm->epoch > local_epoch) {
+ if (jm->cinfo.epoch > local_epoch) {
sd_eprintf("joining node epoch too large: %"
- PRIu32 " vs %" PRIu32, jm->epoch, local_epoch);
+ PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
return CJ_RES_FAIL;
}
- if (jm->nr_copies != local_nr_copies) {
+ if (jm->cinfo.nr_copies != local_nr_copies) {
sd_eprintf("joining node nr_copies doesn't match: %u vs %u",
- jm->nr_copies, local_nr_copies);
+ jm->cinfo.nr_copies, local_nr_copies);
return CJ_RES_FAIL;
}
- if (jm->cluster_flags != sys->flags) {
+ if (jm->cinfo.flags != sys->cinfo.flags) {
sd_eprintf("joining node cluster_flags don't match: %u vs %u",
- jm->cluster_flags, sys->flags);
+ jm->cinfo.flags, sys->cinfo.flags);
return CJ_RES_FAIL;
}
@@ -549,42 +541,42 @@ static int cluster_wait_for_join_check(const struct sd_node *joined,
int ret;
struct vnode_info *cur_vinfo;
- if (jm->nr_nodes == 0)
+ if (jm->cinfo.nr_nodes == 0)
return CJ_RES_JOIN_LATER;
ret = cluster_sanity_check(jm);
if (ret != CJ_RES_SUCCESS) {
- if (jm->epoch > sys->epoch) {
- sd_eprintf("transfer mastership (%d, %d)", jm->epoch,
- sys->epoch);
+ if (jm->cinfo.epoch > sys->cinfo.epoch) {
+ sd_eprintf("transfer mastership (%d, %d)", jm->cinfo.epoch,
+ sys->cinfo.epoch);
return CJ_RES_MASTER_TRANSFER;
}
return ret;
}
- nr_local_entries = epoch_log_read(jm->epoch, local_entries,
+ nr_local_entries = epoch_log_read(jm->cinfo.epoch, local_entries,
sizeof(local_entries));
if (nr_local_entries == -1)
return CJ_RES_FAIL;
- if (jm->epoch < local_epoch) {
+ if (jm->cinfo.epoch < local_epoch) {
sd_eprintf("joining node epoch too small: %"
- PRIu32 " vs %" PRIu32, jm->epoch, local_epoch);
+ PRIu32 " vs %" PRIu32, jm->cinfo.epoch, local_epoch);
if (xbsearch(joined, local_entries, nr_local_entries, node_cmp))
return CJ_RES_FAIL;
return CJ_RES_JOIN_LATER;
}
- if (jm->nr_nodes != nr_local_entries) {
+ if (jm->cinfo.nr_nodes != nr_local_entries) {
sd_eprintf("epoch log entries do not match: %d vs %d",
- jm->nr_nodes, nr_local_entries);
+ jm->cinfo.nr_nodes, nr_local_entries);
return CJ_RES_FAIL;
}
- if (memcmp(jm->nodes, local_entries,
- sizeof(jm->nodes[0]) * jm->nr_nodes) != 0) {
+ if (memcmp(jm->cinfo.nodes, local_entries,
+ sizeof(jm->cinfo.nodes[0]) * jm->cinfo.nr_nodes) != 0) {
sd_eprintf("epoch log entries does not match");
return CJ_RES_FAIL;
}
@@ -633,7 +625,7 @@ static int cluster_running_check(struct join_message *jm)
* When the joining node is newly created and we are not waiting for
* join we do not need to check anything.
*/
- if (jm->nr_nodes != 0) {
+ if (jm->cinfo.nr_nodes != 0) {
ret = cluster_sanity_check(jm);
if (ret != CJ_RES_SUCCESS)
return ret;
@@ -726,8 +718,13 @@ int log_current_epoch(void)
struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info);
if (!cur_vinfo)
- return update_epoch_log(sys->epoch, NULL, 0);
- return update_epoch_log(sys->epoch, cur_vinfo->nodes,
+ return update_epoch_log(sys->cinfo.epoch, NULL, 0);
+
+ /* update cluster info to the latest state */
+ sys->cinfo.nr_nodes = cur_vinfo->nr_nodes;
+ memcpy(sys->cinfo.nodes, cur_vinfo->nodes,
+ sizeof(cur_vinfo->nodes[0]) * cur_vinfo->nr_nodes);
+ return update_epoch_log(sys->cinfo.epoch, cur_vinfo->nodes,
cur_vinfo->nr_nodes);
}
@@ -780,23 +777,23 @@ static void finish_join(const struct join_message *msg,
int ret;
sys->join_finished = true;
- sys->epoch = msg->epoch;
+ sys->cinfo.epoch = msg->cinfo.epoch;
if (msg->cluster_status != SD_STATUS_OK)
update_exceptional_node_list(get_latest_epoch(), msg);
- if (msg->store[0]) {
+ if (msg->cinfo.store[0]) {
/*
* We don't need backend for gateway-only node, but need to save
* store name. Otherwise, the node cannot notify the store name
* when it become master
*/
if (sys->gateway_only) {
- ret = set_cluster_store((char *)msg->store);
+ ret = set_cluster_store((char *)msg->cinfo.store);
if (ret != SD_RES_SUCCESS)
panic("failed to store into config file");
} else
- setup_backend_store((char *)msg->store,
+ setup_backend_store((char *)msg->cinfo.store,
!!msg->inc_epoch);
}
@@ -868,7 +865,7 @@ static void update_cluster_info(const struct join_message *msg,
struct vnode_info *old_vnode_info;
sd_dprintf("status = %d, epoch = %d, finished: %d",
- msg->cluster_status, msg->epoch, sys->join_finished);
+ msg->cluster_status, msg->cinfo.epoch, sys->join_finished);
if (!sys->join_finished)
finish_join(msg, joined, nodes, nr_nodes);
@@ -882,15 +879,16 @@ static void update_cluster_info(const struct join_message *msg,
case SD_STATUS_HALT:
switch (sys->status) {
case SD_STATUS_WAIT_FOR_FORMAT:
- sys->nr_copies = msg->nr_copies;
- sys->flags = msg->cluster_flags;
+ sys->cinfo.nr_copies = msg->cinfo.nr_copies;
+ sys->cinfo.flags = msg->cinfo.flags;
- set_cluster_copies(sys->nr_copies);
- set_cluster_flags(sys->flags);
- set_cluster_ctime(msg->ctime);
+ set_cluster_copies(sys->cinfo.nr_copies);
+ set_cluster_flags(sys->cinfo.flags);
+ set_cluster_ctime(msg->cinfo.ctime);
/*FALLTHROUGH*/
case SD_STATUS_WAIT_FOR_JOIN:
- sys->disable_recovery = msg->disable_recovery;
+ sys->cinfo.disable_recovery =
+ msg->cinfo.disable_recovery;
break;
default:
break;
@@ -901,7 +899,7 @@ static void update_cluster_info(const struct join_message *msg,
sys->status = msg->cluster_status;
if (msg->inc_epoch) {
- uatomic_inc(&sys->epoch);
+ uatomic_inc(&sys->cinfo.epoch);
log_current_epoch();
clear_exceptional_node_lists();
@@ -1015,8 +1013,8 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
if (nr_entries == -1)
return CJ_RES_FAIL;
- sys->epoch = epoch;
- jm->ctime = get_cluster_ctime();
+ sys->cinfo.epoch = epoch;
+ jm->cinfo.ctime = get_cluster_ctime();
if (nr_entries == 1)
jm->cluster_status = SD_STATUS_OK;
@@ -1031,7 +1029,7 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
ret = CJ_RES_FAIL;
break;
case SD_STATUS_WAIT_FOR_FORMAT:
- if (jm->nr_nodes != 0) {
+ if (jm->cinfo.nr_nodes != 0) {
ret = CJ_RES_FAIL;
break;
}
@@ -1054,15 +1052,9 @@ enum cluster_join_result sd_check_join_cb(const struct sd_node *joining,
joining->nid.port),
ret, jm->cluster_status);
- jm->nr_copies = sys->nr_copies;
- jm->cluster_flags = sys->flags;
- jm->epoch = sys->epoch;
- jm->ctime = get_cluster_ctime();
jm->nr_failed_nodes = 0;
- jm->disable_recovery = sys->disable_recovery;
- if (sd_store)
- pstrcpy((char *)jm->store, sizeof(jm->store), sd_store->name);
+ jm->cinfo = sys->cinfo;
if (jm->cluster_status != SD_STATUS_OK &&
(ret == CJ_RES_SUCCESS || ret == CJ_RES_JOIN_LATER))
@@ -1075,22 +1067,11 @@ static int send_join_request(struct sd_node *ent)
struct join_message *msg;
int ret;
- msg = xzalloc(sizeof(*msg) + SD_MAX_NODES * sizeof(msg->nodes[0]));
+ msg = xzalloc(sizeof(*msg));
msg->proto_ver = SD_SHEEP_PROTO_VER;
- msg->nr_copies = sys->nr_copies;
- msg->cluster_flags = sys->flags;
- msg->epoch = sys->epoch;
- msg->ctime = get_cluster_ctime();
-
- if (msg->epoch) {
- msg->nr_nodes = epoch_log_read(msg->epoch, msg->nodes,
- sizeof(struct sd_node) *
- SD_MAX_NODES);
- if (msg->nr_nodes == -1)
- return SD_RES_EIO;
- }
+ msg->cinfo = sys->cinfo;
- ret = sys->cdrv->join(ent, msg, get_join_message_size(msg));
+ ret = sys->cdrv->join(ent, msg, sizeof(*msg));
sd_printf(SDOG_INFO, "%s", node_to_str(&sys->this_node));
@@ -1121,6 +1102,8 @@ void sd_join_handler(const struct sd_node *joined,
const struct join_message *jm = opaque;
uint32_t le = get_latest_epoch();
+ sys->cinfo = jm->cinfo;
+
if (node_is_local(joined)) {
if (result == CJ_RES_FAIL) {
sd_eprintf("Failed to join, exiting.");
@@ -1154,7 +1137,7 @@ void sd_join_handler(const struct sd_node *joined,
if (!add_failed_node(le, joined))
break;
- nr_local = get_nodes_nr_epoch(sys->epoch);
+ nr_local = get_nodes_nr_epoch(sys->cinfo.epoch);
nr = nr_members;
nr_failed = get_nodes_nr_from(main_thread_get(failed_nodes));
nr_delayed = get_nodes_nr_from(main_thread_get(delayed_nodes));
@@ -1174,14 +1157,14 @@ void sd_join_handler(const struct sd_node *joined,
*/
if (!sys->join_finished) {
sys->join_finished = true;
- sys->epoch = get_latest_epoch();
+ sys->cinfo.epoch = get_latest_epoch();
put_vnode_info(main_thread_get(current_vnode_info));
main_thread_set(current_vnode_info,
alloc_vnode_info(&sys->this_node, 1));
}
- nr_local = get_nodes_nr_epoch(sys->epoch);
+ nr_local = get_nodes_nr_epoch(sys->cinfo.epoch);
nr = nr_members;
nr_failed = get_nodes_nr_from(main_thread_get(failed_nodes));
nr_delayed = get_nodes_nr_from(main_thread_get(delayed_nodes));
@@ -1226,7 +1209,7 @@ void sd_leave_handler(const struct sd_node *left, const struct sd_node *members,
switch (sys->status) {
case SD_STATUS_HALT:
case SD_STATUS_OK:
- uatomic_inc(&sys->epoch);
+ uatomic_inc(&sys->cinfo.epoch);
log_current_epoch();
start_recovery(main_thread_get(current_vnode_info),
old_vnode_info, true);
@@ -1257,7 +1240,7 @@ void kick_node_recover(void)
main_thread_set(current_vnode_info,
alloc_vnode_info(old->nodes, old->nr_nodes));
- uatomic_inc(&sys->epoch);
+ uatomic_inc(&sys->cinfo.epoch);
log_current_epoch();
start_recovery(main_thread_get(current_vnode_info), old, true);
put_vnode_info(old);
@@ -1302,12 +1285,19 @@ int create_cluster(int port, int64_t zone, int nr_vnodes,
sys->this_node.space = sys->disk_space;
- sys->epoch = get_latest_epoch();
- if (sys->epoch) {
+ sys->cinfo.epoch = get_latest_epoch();
+ if (sys->cinfo.epoch) {
sys->status = SD_STATUS_WAIT_FOR_JOIN;
- get_cluster_copies(&sys->nr_copies);
- get_cluster_flags(&sys->flags);
-
+ get_cluster_copies(&sys->cinfo.nr_copies);
+ get_cluster_flags(&sys->cinfo.flags);
+ sys->cinfo.ctime = get_cluster_ctime();
+ get_cluster_store((char *)sys->cinfo.store);
+
+ sys->cinfo.nr_nodes = epoch_log_read(sys->cinfo.epoch,
+ sys->cinfo.nodes,
+ sizeof(sys->cinfo.nodes));
+ if (sys->cinfo.nr_nodes == -1)
+ return -1;
} else {
sys->status = SD_STATUS_WAIT_FOR_FORMAT;
}
diff --git a/sheep/ops.c b/sheep/ops.c
index 5641e81..86df149 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -88,7 +88,8 @@ static int cluster_new_vdi(struct request *req)
.size = hdr->vdi.vdi_size,
.base_vid = hdr->vdi.base_vdi_id,
.create_snapshot = !!hdr->vdi.snapid,
- .nr_copies = hdr->vdi.copies ? hdr->vdi.copies : sys->nr_copies,
+ .nr_copies = hdr->vdi.copies ? hdr->vdi.copies :
+ sys->cinfo.nr_copies,
};
if (hdr->data_length != SD_MAX_VDI_LEN)
@@ -239,7 +240,6 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
{
int i, ret;
uint32_t latest_epoch;
- uint64_t created_time;
struct store_driver *driver;
char *store_name = data;
@@ -247,6 +247,8 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
if (!driver)
return SD_RES_NO_STORE;
+ pstrcpy((char *)sys->cinfo.store, sizeof(sys->cinfo.store),
+ store_name);
sd_store = driver;
latest_epoch = get_latest_epoch();
@@ -260,15 +262,15 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
if (ret != SD_RES_SUCCESS)
return ret;
- sys->nr_copies = req->cluster.copies;
- sys->flags = req->flags;
- if (!sys->nr_copies)
- sys->nr_copies = SD_DEFAULT_COPIES;
+ sys->cinfo.nr_copies = req->cluster.copies;
+ sys->cinfo.flags = req->flags;
+ if (!sys->cinfo.nr_copies)
+ sys->cinfo.nr_copies = SD_DEFAULT_COPIES;
+ sys->cinfo.ctime = req->cluster.ctime;
- created_time = req->cluster.ctime;
- set_cluster_ctime(created_time);
- set_cluster_copies(sys->nr_copies);
- set_cluster_flags(sys->flags);
+ set_cluster_ctime(sys->cinfo.ctime);
+ set_cluster_copies(sys->cinfo.nr_copies);
+ set_cluster_flags(sys->cinfo.flags);
for (i = 1; i <= latest_epoch; i++)
remove_epoch(i);
@@ -276,7 +278,7 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
memset(sys->vdi_inuse, 0, sizeof(sys->vdi_inuse));
clean_vdi_state();
- sys->epoch = 1;
+ sys->cinfo.epoch = 1;
ret = log_current_epoch();
if (ret)
@@ -300,7 +302,7 @@ static int cluster_shutdown(const struct sd_req *req, struct sd_rsp *rsp,
static int cluster_enable_recover(const struct sd_req *req,
struct sd_rsp *rsp, void *data)
{
- sys->disable_recovery = false;
+ sys->cinfo.disable_recovery = false;
resume_suspended_recovery();
return SD_RES_SUCCESS;
}
@@ -308,7 +310,7 @@ static int cluster_enable_recover(const struct sd_req *req,
static int cluster_disable_recover(const struct sd_req *req,
struct sd_rsp *rsp, void *data)
{
- sys->disable_recovery = true;
+ sys->cinfo.disable_recovery = true;
return SD_RES_SUCCESS;
}
@@ -450,7 +452,7 @@ static int local_stat_cluster(struct request *req)
assert(nr_nodes <= SD_MAX_NODES);
log->nr_nodes = nr_nodes;
- log->disable_recovery = sys->disable_recovery;
+ log->disable_recovery = sys->cinfo.disable_recovery;
rsp->data_length += sizeof(*log);
epoch--;
@@ -548,7 +550,7 @@ static int cluster_force_recover_main(const struct sd_req *req,
struct sd_node *nodes = data;
size_t nr_nodes = rsp->data_length / sizeof(*nodes);
- if (rsp->epoch != sys->epoch) {
+ if (rsp->epoch != sys->cinfo.epoch) {
sd_eprintf("epoch was incremented while cluster_force_recover");
return SD_RES_FORCE_RECOVER;
}
@@ -564,10 +566,10 @@ static int cluster_force_recover_main(const struct sd_req *req,
goto err;
}
- sys->nr_copies = c;
- sys->flags = f;
+ sys->cinfo.nr_copies = c;
+ sys->cinfo.flags = f;
- sys->epoch++; /* some nodes are left, so we get a new epoch */
+ sys->cinfo.epoch++; /* some nodes are left, so we get a new epoch */
ret = log_current_epoch();
if (ret) {
sd_printf(SDOG_EMERG, "cannot update epoch log");
@@ -673,7 +675,7 @@ static int cluster_recovery_completion(const struct sd_req *req,
for (i = 0; i < nr_recovereds; i++)
sd_dprintf("[%x] %s", i, node_to_str(recovereds + i));
- if (sys->epoch != latest_epoch)
+ if (sys->cinfo.epoch != latest_epoch)
return SD_RES_SUCCESS;
vnode_info = get_vnode_info();
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 63877b1..f751bfd 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -575,7 +575,7 @@ static void recover_next_object(struct recovery_info *rinfo)
if (rinfo->nr_prio_oids)
finish_schedule_oids(rinfo);
- if (sys->disable_recovery && !has_scheduled_objects(rinfo)) {
+ if (sys->cinfo.disable_recovery && !has_scheduled_objects(rinfo)) {
sd_dprintf("suspended");
rinfo->suspended = true;
/* suspend until resume_suspended_recovery() is called */
@@ -796,8 +796,9 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
rinfo = xzalloc(sizeof(struct recovery_info));
rinfo->state = RW_PREPARE_LIST;
- rinfo->epoch = sys->epoch;
- rinfo->tgt_epoch = epoch_lifted ? sys->epoch - 1 : sys->epoch;
+ rinfo->epoch = sys->cinfo.epoch;
+ rinfo->tgt_epoch = epoch_lifted ? sys->cinfo.epoch - 1 :
+ sys->cinfo.epoch;
rinfo->count = 0;
if (epoch_lifted)
rinfo->notify_complete = true; /* Reweight or node recovery */
diff --git a/sheep/request.c b/sheep/request.c
index 76d97e5..a6000d3 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -106,7 +106,7 @@ static void gateway_op_done(struct work *work)
switch (req->rp.result) {
case SD_RES_OLD_NODE_VER:
- if (req->rp.epoch > sys->epoch) {
+ if (req->rp.epoch > sys->cinfo.epoch) {
/*
* Gateway of this node is expected to process this
* request later when epoch is lifted.
@@ -122,7 +122,7 @@ static void gateway_op_done(struct work *work)
case SD_RES_KILLED:
sd_dprintf("retrying failed I/O request op %s result %x epoch %"
PRIu32", sys epoch %"PRIu32, op_name(req->op),
- req->rp.result, req->rq.epoch, sys->epoch);
+ req->rp.result, req->rq.epoch, sys->cinfo.epoch);
goto retry;
case SD_RES_EIO:
if (is_access_local(req, hdr->obj.oid)) {
@@ -158,17 +158,17 @@ static void local_op_done(struct work *work)
static int check_request_epoch(struct request *req)
{
- if (before(req->rq.epoch, sys->epoch)) {
+ if (before(req->rq.epoch, sys->cinfo.epoch)) {
sd_eprintf("old node version %u, %u (%s)",
- sys->epoch, req->rq.epoch, op_name(req->op));
+ sys->cinfo.epoch, req->rq.epoch, op_name(req->op));
/* Ask for sleeping req on requester's wait queue */
req->rp.result = SD_RES_OLD_NODE_VER;
- req->rp.epoch = sys->epoch;
+ req->rp.epoch = sys->cinfo.epoch;
put_request(req);
return -1;
- } else if (after(req->rq.epoch, sys->epoch)) {
+ } else if (after(req->rq.epoch, sys->cinfo.epoch)) {
sd_eprintf("new node version %u, %u (%s)",
- sys->epoch, req->rq.epoch, op_name(req->op));
+ sys->cinfo.epoch, req->rq.epoch, op_name(req->op));
/* Wait for local epoch to be lifted */
req->rp.result = SD_RES_NEW_NODE_VER;
sleep_on_wait_queue(req);
@@ -221,7 +221,7 @@ void wakeup_requests_on_epoch(void)
*/
assert(is_gateway_op(req->op));
sd_dprintf("gateway %"PRIx64, req->rq.obj.oid);
- req->rq.epoch = sys->epoch;
+ req->rq.epoch = sys->cinfo.epoch;
del_requeue_request(req);
break;
case SD_RES_NEW_NODE_VER:
@@ -390,13 +390,13 @@ static void queue_request(struct request *req)
if (is_peer_op(req->op)) {
queue_peer_request(req);
} else if (is_gateway_op(req->op)) {
- hdr->epoch = sys->epoch;
+ hdr->epoch = sys->cinfo.epoch;
queue_gateway_request(req);
} else if (is_local_op(req->op)) {
- hdr->epoch = sys->epoch;
+ hdr->epoch = sys->cinfo.epoch;
queue_local_request(req);
} else if (is_cluster_op(req->op)) {
- hdr->epoch = sys->epoch;
+ hdr->epoch = sys->cinfo.epoch;
queue_cluster_request(req);
} else {
sd_eprintf("unknown operation %d", hdr->opcode);
@@ -645,7 +645,7 @@ static void init_tx_hdr(struct client_info *ci)
/* use cpu_to_le */
memcpy(rsp, &req->rp, sizeof(*rsp));
- rsp->epoch = sys->epoch;
+ rsp->epoch = sys->cinfo.epoch;
rsp->opcode = req->rq.opcode;
rsp->id = req->rq.id;
}
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 84bd269..3955c73 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -186,8 +186,8 @@ static void crash_handler(int signo)
reraise_crash_signal(signo, 1);
}
-static struct cluster_info __sys;
-struct cluster_info *sys = &__sys;
+static struct system_info __sys;
+struct system_info *sys = &__sys;
static void parse_arg(char *arg, const char *delim, void (*fn)(char *))
{
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index cdf8b7a..804aa1c 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -62,7 +62,7 @@ struct request {
struct work work;
};
-struct cluster_info {
+struct system_info {
struct cluster_driver *cdrv;
const char *cdrv_option;
@@ -70,15 +70,13 @@ struct cluster_info {
bool join_finished;
struct sd_node this_node;
- uint32_t epoch;
+ struct cluster_info cinfo;
uint32_t status;
- uint16_t flags;
uint64_t disk_space;
DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
- uint8_t nr_copies;
int local_req_efd;
pthread_mutex_t local_req_lock;
@@ -87,7 +85,6 @@ struct cluster_info {
int nr_outstanding_reqs;
bool gateway_only;
- bool disable_recovery;
bool nosync;
struct work_queue *gateway_wqueue;
@@ -200,7 +197,7 @@ static inline struct store_driver *find_store_driver(const char *name)
return NULL;
}
-extern struct cluster_info *sys;
+extern struct system_info *sys;
extern struct store_driver *sd_store;
extern char *obj_path;
extern char *epoch_path;
@@ -208,7 +205,7 @@ extern char *epoch_path;
/* One should call this function to get sys->epoch outside main thread */
static inline uint32_t sys_epoch(void)
{
- return uatomic_read(&sys->epoch);
+ return uatomic_read(&sys->cinfo.epoch);
}
static inline bool is_aligned_to_pagesize(void *p)
diff --git a/sheep/store.c b/sheep/store.c
index a804d0d..f233f35 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -270,10 +270,10 @@ int init_store_driver(bool is_gateway)
char driver_name[STORE_LEN], *p;
int ret;
- memset(driver_name, '\0', sizeof(driver_name));
- ret = get_cluster_store(driver_name);
+ ret = get_cluster_store((char *)sys->cinfo.store);
if (ret != SD_RES_SUCCESS)
return ret;
+ pstrcpy(driver_name, sizeof(driver_name), (char *)sys->cinfo.store);
p = memchr(driver_name, '\0', STORE_LEN);
if (!p) {
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 096244a..41e451e 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -134,7 +134,7 @@ int get_max_copy_number(void)
int nr_copies = uatomic_read(&max_copies);
if (nr_copies == 0)
- nr_copies = sys->nr_copies;
+ nr_copies = sys->cinfo.nr_copies;
return nr_copies;
}
--
1.7.9.5
More information about the sheepdog
mailing list