[sheepdog] [PATCH] lib: remove a fd leak bug in the sockfd cache

Hitoshi Mitake mitake.hitoshi at gmail.com
Fri Feb 7 06:47:07 CET 2014


At Tue,  4 Feb 2014 16:25:57 +0900,
Hitoshi Mitake wrote:
> 
> When a node leaves from sheepdog cluster, other nodes try to remove a
> sockfd cache entry of the node (calling
> sockfd_cache_del_node()). Current sockfd caching mechanism would have
> a possibility of fd leak in this sequence.
> 
> Assume there is a worker thread which is using a cached sockfd
> connected to the leaving node during the above sequence (main
> thread is calling sockfd_cache_del_node()). In such a case,
> sockfd_cache_destroy() fails to destroy the sockfd cache entry (the
> debug message "Some victim still holds it" can be seen in a log).
> 
> Sockfd caching mechanism assumes that the victims can destroy the
> entry after their requests fail. But it depends on timing. Other
> threads which use cached fds don't face fail if their requests are
> already executed correctly. And the fd leak problem would happen.
> 
> This patch solves this fd leak problem by adding a new member
> need_to_destroy to struct sockfd_cache_entry. Threads turn on this
> variable true when destroy should be deferred. If the variable is
> true, other threads must try to destroy an entry when they put
> sockfds.
> 
> Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> ---
>  lib/sockfd_cache.c |   40 +++++++++++++++++++++++++++++++---------
>  1 file changed, 31 insertions(+), 9 deletions(-)

I found a bug in this version. Please ignore this one. I'll send v2
later.

Thanks,
Hitoshi

> 
> diff --git a/lib/sockfd_cache.c b/lib/sockfd_cache.c
> index ec06091..3a25bbd 100644
> --- a/lib/sockfd_cache.c
> +++ b/lib/sockfd_cache.c
> @@ -70,6 +70,8 @@ struct sockfd_cache_entry {
>  	struct rb_node rb;
>  	struct node_id nid;
>  	struct sockfd_cache_fd *fds;
> +
> +	uatomic_bool need_to_destroy;
>  };
>  
>  static int sockfd_cache_cmp(const struct sockfd_cache_entry *a,
> @@ -159,32 +161,41 @@ static void free_cache_entry(struct sockfd_cache_entry *entry)
>   * the victim node will finally find itself talking to a dead node and call
>   * sockfd_cache_del() to delete this node from the cache.
>   */
> -static bool sockfd_cache_destroy(const struct node_id *nid)
> +static bool sockfd_cache_destroy_nolock(const struct node_id *nid)
>  {
>  	struct sockfd_cache_entry *entry;
>  
> -	sd_write_lock(&sockfd_cache.lock);
>  	entry = sockfd_cache_search(nid);
>  	if (!entry) {
>  		sd_debug("It is already destroyed");
> -		goto false_out;
> +		return false;
>  	}
>  
>  	if (!slots_all_free(entry)) {
>  		sd_debug("Some victim still holds it");
> -		goto false_out;
> +		/* defer destroy */
> +		uatomic_set_true(&entry->need_to_destroy);
> +
> +		return false;
>  	}
>  
>  	rb_erase(&entry->rb, &sockfd_cache.root);
> -	sd_rw_unlock(&sockfd_cache.lock);
>  
>  	destroy_all_slots(entry);
>  	free_cache_entry(entry);
>  
>  	return true;
> -false_out:
> +}
> +
> +static bool sockfd_cache_destroy(const struct node_id *nid)
> +{
> +	bool result;
> +
> +	sd_write_lock(&sockfd_cache.lock);
> +	result = sockfd_cache_destroy_nolock(nid);
>  	sd_rw_unlock(&sockfd_cache.lock);
> -	return false;
> +
> +	return result;
>  }
>  
>  static void sockfd_cache_add_nolock(const struct node_id *nid)
> @@ -371,10 +382,16 @@ static void sockfd_cache_put_long(const struct node_id *nid, int idx)
>  
>  	sd_debug("%s idx %d", addr_to_str(addr, port), idx);
>  
> -	sd_read_lock(&sockfd_cache.lock);
> +	sd_write_lock(&sockfd_cache.lock);
>  	entry = sockfd_cache_search(nid);
> -	if (entry)
> +	if (entry) {
>  		uatomic_set_false(&entry->fds[idx].in_use);
> +
> +		if (uatomic_is_true(&entry->need_to_destroy)) {
> +			sd_debug("deferred destroy of cache fd is required");
> +			sockfd_cache_destroy_nolock(nid);
> +		}
> +	}
>  	sd_rw_unlock(&sockfd_cache.lock);
>  }
>  
> @@ -393,6 +410,11 @@ static void sockfd_cache_close(const struct node_id *nid, int idx)
>  		close(entry->fds[idx].fd);
>  		entry->fds[idx].fd = -1;
>  		uatomic_set_false(&entry->fds[idx].in_use);
> +
> +		if (uatomic_is_true(&entry->need_to_destroy)) {
> +			sd_debug("deferred destroy of cache fd is required");
> +			sockfd_cache_destroy_nolock(nid);
> +		}
>  	}
>  	sd_rw_unlock(&sockfd_cache.lock);
>  }
> -- 
> 1.7.10.4
> 



More information about the sheepdog mailing list