[Sheepdog] PATCH S006: Defensive programming in join_msg

Liu Yuan namei.unix at gmail.com
Fri May 11 04:57:38 CEST 2012


On 05/10/2012 07:17 AM, Shevek wrote:

> Better defensive programming against invalid join messages.
> 
> We have a cluster which got into a state where the protocol mismatched,
> and the malloc metadata in the heap got corrupted. This patch would
> detect the potential heap corruption and make a version mismatch
> more easily detectable. It is also good defensive programming
> against invalid join messages.
> 
> Signed-off-by: Shevek <shevek at anarres.org>
> 
> ---
>  sheep/cluster.h           |    2 +-
>  sheep/cluster/accord.c    |    2 +-
>  sheep/cluster/corosync.c  |    2 +-
>  sheep/cluster/local.c     |    2 +-
>  sheep/cluster/zookeeper.c |    2 +-
>  sheep/group.c             |   24 +++++++++++++++++-------
>  6 files changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/sheep/cluster.h b/sheep/cluster.h
> index d543e99..759d0eb 100644
> --- a/sheep/cluster.h
> +++ b/sheep/cluster.h
> @@ -190,6 +190,6 @@ void sd_leave_handler(struct sd_node *left, struct
> sd_node *members,
>  		size_t nr_members);
>  void sd_notify_handler(struct sd_node *sender, void *msg, size_t
> msg_len);
>  enum cluster_join_result sd_check_join_cb(struct sd_node *joining,
> -		void *opaque);
> +		void *opaque, size_t opaque_len);
>  
>  #endif
> diff --git a/sheep/cluster/accord.c b/sheep/cluster/accord.c
> index 1fdca91..dad1c2f 100644
> --- a/sheep/cluster/accord.c
> +++ b/sheep/cluster/accord.c
> @@ -576,7 +576,7 @@ static int accord_dispatch(void)
>  	case EVENT_JOIN:
>  		if (ev.blocked) {
>  			if (node_cmp(&ev.nodes[0], &this_node) == 0) {
> -				res = sd_check_join_cb(&ev.sender, ev.buf);
> +				res = sd_check_join_cb(&ev.sender, ev.buf, ev.buf_len);
>  				ev.join_result = res;
>  				ev.blocked = 0;
>  
> diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
> index 4a588e9..217d060 100644
> --- a/sheep/cluster/corosync.c
> +++ b/sheep/cluster/corosync.c
> @@ -296,7 +296,7 @@ static int __corosync_dispatch_one(struct
> corosync_event *cevent)
>  				return 0;
>  
>  			res = sd_check_join_cb(&cevent->sender.ent,
> -						     cevent->msg);
> +						     cevent->msg, cevent->msg_len);
>  			if (res == CJ_RES_MASTER_TRANSFER)
>  				nr_cpg_nodes = 0;
>  
> diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
> index fd84615..3d75e68 100644
> --- a/sheep/cluster/local.c
> +++ b/sheep/cluster/local.c
> @@ -404,7 +404,7 @@ static int local_dispatch(void)
>  	case EVENT_JOIN:
>  		if (ev->blocked) {
>  			if (node_cmp(&ev->nodes[0], &this_node) == 0) {
> -				res = sd_check_join_cb(&ev->sender, ev->buf);
> +				res = sd_check_join_cb(&ev->sender, ev->buf, ev->buf_len);
>  				ev->join_result = res;
>  				ev->blocked = 0;
>  				msync(ev, sizeof(*ev), MS_SYNC);
> diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
> index 491056a..335c261 100644
> --- a/sheep/cluster/zookeeper.c
> +++ b/sheep/cluster/zookeeper.c
> @@ -777,7 +777,7 @@ static int zk_dispatch(void)
>  			dprintf("one sheep joined[up], nr_nodes:%ld, sender:%s, joined:%d
> \n",
>  					nr_zk_nodes, node_to_str(&ev.sender.node), ev.sender.joined);
>  			if (is_master(zhandle, &this_node)) {
> -				res = sd_check_join_cb(&ev.sender.node, ev.buf);
> +				res = sd_check_join_cb(&ev.sender.node, ev.buf, ev.buf_len);
>  				ev.join_result = res;
>  				ev.blocked = 0;
>  				ev.sender.joined = 1;
> diff --git a/sheep/group.c b/sheep/group.c
> index c7fd387..1017437 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -449,12 +449,6 @@ out:
>  
>  static void join(struct sd_node *joining, struct join_message *msg)
>  {
> -	if (msg->proto_ver != SD_SHEEP_PROTO_VER) {
> -		eprintf("joining node sent a message with the wrong protocol version
> \n");
> -		msg->result = SD_RES_VER_MISMATCH;
> -		return;
> -	}
> -
>  	msg->result = get_cluster_status(joining, msg->nodes, msg->nr_nodes,
>  					 msg->ctime, msg->epoch,
>  					 &msg->cluster_status, &msg->inc_epoch);
> @@ -735,11 +729,27 @@ static void __sd_leave(struct event_struct
> *cevent)
>  	}
>  }
>  
> -enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void
> *opaque)
> +enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void
> *opaque, size_t opaque_len)
>  {
>  	struct join_message *jm = opaque;
>  	struct node *node;
>  
> +	if (opaque_len < sizeof(*jm)) {
> +		eprintf("joining node sent a message which is too short\n");
> +		// msg->result = SD_RES_VER_MISMATCH;	// This may segfault anyway if
> msg_len == 0
> +		return CJ_RES_FAIL;
> +	}
> +	if (opaque_len != get_join_message_size(jm)) {
> +		eprintf("joining node sent a message size %zu, expected %zu\n",
> opaque_len, get_join_message_size(jm));
> +		jm->result = SD_RES_VER_MISMATCH;
> +		return CJ_RES_FAIL;
> +	}
> +	if (jm->proto_ver != SD_SHEEP_PROTO_VER) {
> +		eprintf("joining node sent a message with the wrong protocol version
> \n");
> +		jm->result = SD_RES_VER_MISMATCH;
> +		return CJ_RES_FAIL;
> +	}
> +
>  	if (node_cmp(joining, &sys->this_node) == 0) {
>  		struct sd_node entries[SD_MAX_NODES];
>  		int nr_entries;


Style problems too :)

Thanks,
Yuan



More information about the sheepdog mailing list