[sheepdog] [PATCH RFC 1/2] sheepdev: linux kernel module of block device driver for sheepdog

Fri Jan 4 09:12:59 CET 2013

On 12/27/2012 10:00 PM, levin li wrote:
> From: levin li <xingke.lwp at taobao.com>
> 
> This module lets users take sheepdog VDIs as block
> devices in Linux. Users can register a VDI in kernel space, and it is just
> as if a new hard disk were added to the computer: users can create
> partitions on the disk, format it, or mount it. This gives
> users an efficient way to use sheepdog as a distributed storage system.
> 
> The usage is easy: after installing the module sheepdev.ko, it creates a proc
> entry '/proc/sheep'; you can write to this proc entry file to control
> the driver.
> 
> Add a new block device from an existing sheepdog VDI:
> 
> # echo "add 127.0.0.1:7070 a5d05d" > /proc/sheep
> 

It would be better to use 'add ip:port vdi_name' to add a block device.

> It would create a block device /dev/sheepa, you can format/mount this device:
> 
> # mkfs.ext4 /dev/sheepa
> # mount -t ext4 /dev/sheepa test
> 
> Remove a block device from the kernel:
> 
> # echo "del sheepa" > /proc/sheep
> 
> Signed-off-by: levin li <xingke.lwp at taobao.com>
> ---
>  sheepdev/connect.c        | 178 ++++++++++++
>  sheepdev/module.c         | 726 ++++++++++++++++++++++++++++++++++++++++++++++
>  sheepdev/sheep.c          | 136 +++++++++

I think module.c can be merged into sheep.c, and we'd better split the /proc
management code out into its own file.

>  sheepdev/sheep.h          |  88 ++++++
>  sheepdev/sheepdog_proto.h | 290 ++++++++++++++++++

We shouldn't keep duplicate copies of headers in the source tree.

>  5 files changed, 1418 insertions(+)
>  create mode 100644 sheepdev/connect.c
>  create mode 100644 sheepdev/module.c
>  create mode 100644 sheepdev/sheep.c
>  create mode 100644 sheepdev/sheep.h
>  create mode 100644 sheepdev/sheepdog_proto.h
> 
> diff --git a/sheepdev/connect.c b/sheepdev/connect.c
> new file mode 100644
> index 0000000..009a7b9
> --- /dev/null
> +++ b/sheepdev/connect.c
> @@ -0,0 +1,178 @@
> +/*
> + * Copyright (C) 2012 Taobao Inc.
> + *
> + * Levin Li <xingke.lwp at taobao.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "sheep.h"
> +#include "sheepdog_proto.h"
> +
> +int connect_to(struct socket **sock, const char *ip_addr, int port)
> +{
> +	int ret;
> +	struct sockaddr_in addr;
> +
> +	ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, sock);
> +	if (ret) {
> +		DBPRT("fail to create socket\n");
> +		return ret;
> +	}
> +
> +	memset(&addr, 0, sizeof(addr));
> +	addr.sin_family = AF_INET;
> +	addr.sin_port = htons(port);
> +	addr.sin_addr.s_addr = in_aton(ip_addr);
> +
> +	ret = (*sock)->ops->connect(*sock, (struct sockaddr *)&addr,
> +				 sizeof(addr), 0);
> +
> +	if (!ret)
> +		DBPRT("connected to %s:%d\n", ip_addr, port);
> +
> +	return ret;
> +}
> +
> +int do_read(struct socket *sock, char *buf, const size_t length)
> +{
> +	struct msghdr msg;
> +	struct iovec iov;
> +	int ret = 0, received = 0, left = length;
> +	mm_segment_t oldmm;
> +
> +	memset(&msg, 0, sizeof(msg));
> +	msg.msg_iov = &iov;
> +	msg.msg_iovlen = 1;
> +
> +	while (left > 0) {
> +		oldmm = get_fs();
> +		set_fs(KERNEL_DS);
> +		msg.msg_iov->iov_base = buf + received;
> +		msg.msg_iov->iov_len = left;
> +		ret = sock_recvmsg(sock, &msg, left, MSG_WAITALL);
> +		set_fs(oldmm);
> +		if (ret <= 0)
> +			break;
> +		left -= ret;
> +		received += ret;
> +	}
> +
> +	return ret;
> +}
> +
> +static void forward_iov(struct msghdr *msg, int len)
> +{
> +	while (msg->msg_iov->iov_len <= len) {
> +		len -= msg->msg_iov->iov_len;
> +		msg->msg_iov++;
> +		msg->msg_iovlen--;
> +	}
> +
> +	msg->msg_iov->iov_base = (char *) msg->msg_iov->iov_base + len;
> +	msg->msg_iov->iov_len -= len;
> +}
> +
> +
> +static int do_write(struct socket *sock, struct msghdr *msg, int len)
> +{
> +	int ret;
> +	mm_segment_t oldmm;
> +
> +rewrite:
> +	oldmm = get_fs();
> +	set_fs(KERNEL_DS);
> +	ret = sock_sendmsg(sock, msg, len);
> +	set_fs(oldmm);
> +
> +	if (ret < 0) {
> +		if (ret == -EINTR)
> +			goto rewrite;
> +		if (ret == -EBUSY) {
> +			DBPRT("busy\n");
> +			goto rewrite;
> +		}
> +		DBPRT("failed to write to socket: %d\n", ret);
> +		return -EFAULT;
> +	}
> +
> +	len -= ret;
> +	if (len) {
> +		forward_iov(msg, ret);
> +		goto rewrite;
> +	}
> +
> +	return 0;
> +}
> +
> +int send_req(struct socket *sock, struct sd_req *hdr, void *data,
> +	     unsigned int wlen)
> +{
> +	int ret;
> +	struct msghdr msg;
> +	struct iovec iov[2];
> +
> +	memset(&msg, 0, sizeof(msg));
> +
> +	msg.msg_iov = iov;
> +
> +	msg.msg_iovlen = 1;
> +	iov[0].iov_base = hdr;
> +	iov[0].iov_len = sizeof(*hdr);
> +
> +	if (wlen) {
> +		msg.msg_iovlen++;
> +		iov[1].iov_base = data;
> +		iov[1].iov_len = wlen;
> +	}
> +
> +	ret = do_write(sock, &msg, sizeof(*hdr) + wlen);
> +	if (ret) {
> +		DBPRT("failed to send request %x, %d\n", hdr->opcode, wlen);
> +		ret = -EFAULT;
> +	}
> +
> +	return ret;
> +}
> +
> +int exec_req(struct socket *sock, struct sd_req *hdr, void *data)
> +{
> +	int ret;
> +	struct sd_rsp *rsp = (struct sd_rsp *)hdr;
> +	unsigned int wlen, rlen;
> +
> +	if (hdr->flags & SD_FLAG_CMD_WRITE) {
> +		wlen = hdr->data_length;
> +		rlen = 0;
> +	} else {
> +		wlen = 0;
> +		rlen = hdr->data_length;
> +	}
> +
> +	if (send_req(sock, hdr, data, wlen))
> +		return -EFAULT;
> +
> +	ret = do_read(sock, (char *)rsp, sizeof(*rsp));
> +	if (ret < 0) {
> +		DBPRT("failed to read a response\n");
> +		return -EFAULT;
> +	}
> +
> +	if (rlen > rsp->data_length)
> +		rlen = rsp->data_length;
> +
> +	if (rlen) {
> +		ret = do_read(sock, data, rlen);
> +		if (ret < 0) {
> +			DBPRT("failed to read the response data\n");
> +			return -EFAULT;
> +		}
> +	}
> +
> +	return 0;
> +}
> diff --git a/sheepdev/module.c b/sheepdev/module.c
> new file mode 100644
> index 0000000..bde57d3
> --- /dev/null
> +++ b/sheepdev/module.c
> @@ -0,0 +1,726 @@
> +/*
> + * Copyright (C) 2012 Taobao Inc.
> + *
> + * Levin Li <xingke.lwp at taobao.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/wait.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/slab.h>
> +#include <linux/fs.h>
> +#include <linux/genhd.h>
> +#include <linux/blkdev.h>
> +#include <linux/hdreg.h>
> +#include <linux/proc_fs.h>
> +#include <linux/kthread.h>
> +#include "sheep.h"
> +
> +static int sheepdev_major;
> +spinlock_t devices_lock;
> +struct list_head dev_list;
> +static unsigned long *device_bitmap;
> +static struct proc_dir_entry *sheep_proc_entry;
> +
> +static void sheepdev_get(struct sheepdev *dev)
> +{
> +	atomic_inc(&dev->struct_refcnt);
> +}
> +
> +static void sheepdev_put(struct sheepdev *dev)
> +{
> +	if (atomic_dec_and_test(&dev->struct_refcnt))
> +		kfree(dev);
> +}
> +
> +static int add_request(struct sheepdev *dev, struct request *req, uint64_t oid,
> +		       int idx)
> +{
> +	struct sheep_request *s_req = kmalloc(sizeof(*s_req), GFP_KERNEL);
> +	if (!s_req)
> +		return -EIO;
> +
> +	s_req->req_id = dev->req_id;
> +	s_req->req = req;
> +	s_req->oid = oid;
> +	s_req->idx = idx;
> +	INIT_LIST_HEAD(&s_req->list);
> +
> +	spin_lock_irq(&dev->fin_lock);
> +	list_add_tail(&s_req->list, &dev->finish_list);
> +	spin_unlock_irq(&dev->fin_lock);
> +
> +	if (dev->req_id > UINT_MAX)
> +		dev->req_id = 1;
> +	else
> +		dev->req_id++;
> +
> +	return 0;
> +}
> +
> +static void sheep_end_request(struct request *req, int ret)
> +{
> +	struct request_queue *q = req->q;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(q->queue_lock, flags);
> +	__blk_end_request_all(req, ret);
> +	spin_unlock_irqrestore(q->queue_lock, flags);
> +}
> +
> +static int sheep_handle_request(struct request *req)
> +{
> +	struct req_iterator iter;
> +	struct bio_vec *bvec;
> +	struct gendisk *disk = req->rq_disk;
> +	struct sheepdev *dev = disk->private_data;
> +	unsigned long sector = blk_rq_pos(req);
> +	unsigned long offset = sector * KERNEL_SECTOR_SIZE;
> +	unsigned long nbytes = blk_rq_bytes(req);
> +	int idx = offset / SHEEP_OBJECT_SIZE + 1;
> +	uint64_t oid = vid_to_data_oid(dev->vid, idx);
> +	uint64_t off = offset % SHEEP_OBJECT_SIZE;
> +	int ret = 0, len = 0, create = 0;
> +	int write = rq_data_dir(req);
> +	void *sheep_buf = NULL;
> +
> +	if (!write && dev->inode->data_vdi_id[idx] != dev->vid) {
> +		rq_for_each_segment(bvec, req, iter) {
> +			void *addr = kmap(bvec->bv_page);
> +			memset(addr + bvec->bv_offset, 0, bvec->bv_len);
> +			kunmap(bvec->bv_page);
> +		}
> +		sheep_end_request(req, 0);
> +		return 0;
> +	} else if (!write) {
> +		ret = send_read_req(dev, oid, nbytes, off);
> +		if (ret)
> +			return -EIO;
> +
> +		ret = add_request(dev, req, oid, idx);
> +		if (ret)
> +			return -EIO;
> +
> +		return 0;
> +	}
> +
> +	/* For write requests */
> +	sheep_buf = kmalloc(nbytes, GFP_KERNEL);
> +	if (!sheep_buf)
> +		return -EIO;
> +
> +	spin_lock(&dev->creating_lock);
> +	if (!dev->inode->data_vdi_id[idx]) {
> +		dev->inode->data_vdi_id[idx] = 1;
> +		create = 1;
> +		spin_unlock(&dev->creating_lock);
> +	} else if (dev->inode->data_vdi_id[idx] != dev->vid){
> +
> +		spin_unlock(&dev->creating_lock);
> +		wait_event_interruptible(dev->creating_wait,
> +				dev->inode->data_vdi_id[idx] == dev->vid);
> +	} else
> +		spin_unlock(&dev->creating_lock);
> +
> +	rq_for_each_segment(bvec, req, iter) {
> +		void *addr = kmap(bvec->bv_page);
> +
> +		memcpy(sheep_buf + len, addr + bvec->bv_offset, bvec->bv_len);
> +		len += bvec->bv_len;
> +
> +		if (rq_iter_last(req, iter)) {
> +			ret = send_write_req(dev, oid, sheep_buf, len, off,
> +					     create);
> +			if (ret != SD_RES_SUCCESS) {
> +				kunmap(bvec->bv_page);
> +				ret = -EIO;
> +				goto out;
> +			}
> +
> +			ret = add_request(dev, req, oid, idx);
> +			if (ret) {
> +				kunmap(bvec->bv_page);
> +				ret = -EIO;
> +				goto out;
> +			}
> +
> +			if (!create)
> +				goto done;
> +
> +			/* For create operations we need to update inode data */
> +			oid = vid_to_vdi_oid(dev->vid);
> +			off = offsetof(struct sheepdog_inode, data_vdi_id);
> +			off += sizeof(uint32_t) * idx;
> +			ret = send_write_req(dev, oid, (char *)&dev->vid,
> +					     sizeof(dev->vid), off, 0);
> +			if (ret != SD_RES_SUCCESS) {
> +				kunmap(bvec->bv_page);
> +				ret = -EIO;
> +				goto out;
> +			}
> +
> +			ret = add_request(dev, req, oid, idx);
> +			if (ret) {
> +				kunmap(bvec->bv_page);
> +				ret = -EIO;
> +				goto out;
> +			}
> +done:;
> +		}
> +
> +		kunmap(bvec->bv_page);
> +	}
> +
> +out:
> +	kfree(sheep_buf);
> +	return ret;
> +}
> +
> +static void sheep_request(struct request_queue *rq)
> +{

Better to find another name for this function; we already have a struct named
'sheep_request'.

Before merging this patch set, I'd expect it to support reading and writing
snapshot volumes too.

Thanks,
Yuan