[sheepdog] [PATCH RFC 1/2] sheepdev: linux kernel module of block device driver for sheepdog
Liu Yuan
namei.unix at gmail.com
Fri Jan 4 09:12:59 CET 2013
On 12/27/2012 10:00 PM, levin li wrote:
> From: levin li <xingke.lwp at taobao.com>
>
> This module lets users use sheepdog VDIs as block devices in Linux. Users
> can register a VDI in kernel space, after which it behaves just as if a
> new hard disk had been added to the computer: users can create partitions
> on the disk, format it, or mount it. This gives users an efficient way to
> use sheepdog as a distributed storage system.
>
> Usage is easy: after the module sheepdev.ko is installed, it creates a proc
> entry '/proc/sheep', and you can write to that proc entry file to control
> the driver.
>
> Add a new block device from an existing sheepdog VDI:
>
> # echo "add 127.0.0.1:7070 a5d05d" > /proc/sheep
>
It would be better to use 'add ip:port vdi_name' to add a block device.
> It would create a block device /dev/sheepa, you can format/mount this device:
>
> # mkfs.ext4 /dev/sheepa
> # mount -t ext4 /dev/sheepa test
>
> Remove a block device from the kernel:
>
> # echo "del sheepa" > /proc/sheep
>
> Signed-off-by: levin li <xingke.lwp at taobao.com>
> ---
> sheepdev/connect.c | 178 ++++++++++++
> sheepdev/module.c | 726 ++++++++++++++++++++++++++++++++++++++++++++++
> sheepdev/sheep.c | 136 +++++++++
I think module.c can be merged into sheep.c, and we should split the /proc
management code out of module.c.
> sheepdev/sheep.h | 88 ++++++
> sheepdev/sheepdog_proto.h | 290 ++++++++++++++++++
We shouldn't use duplicate headers in the source.
> 5 files changed, 1418 insertions(+)
> create mode 100644 sheepdev/connect.c
> create mode 100644 sheepdev/module.c
> create mode 100644 sheepdev/sheep.c
> create mode 100644 sheepdev/sheep.h
> create mode 100644 sheepdev/sheepdog_proto.h
>
> diff --git a/sheepdev/connect.c b/sheepdev/connect.c
> new file mode 100644
> index 0000000..009a7b9
> --- /dev/null
> +++ b/sheepdev/connect.c
> @@ -0,0 +1,178 @@
> +/*
> + * Copyright (C) 2012 Taobao Inc.
> + *
> + * Levin Li <xingke.lwp at taobao.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "sheep.h"
> +#include "sheepdog_proto.h"
> +
> +int connect_to(struct socket **sock, const char *ip_addr, int port)
> +{
> + int ret;
> + struct sockaddr_in addr;
> +
> + ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, sock);
> + if (ret) {
> + DBPRT("fail to create socket\n");
> + return ret;
> + }
> +
> + memset(&addr, 0, sizeof(addr));
> + addr.sin_family = AF_INET;
> + addr.sin_port = htons(port);
> + addr.sin_addr.s_addr = in_aton(ip_addr);
> +
> + ret = (*sock)->ops->connect(*sock, (struct sockaddr *)&addr,
> + sizeof(addr), 0);
> +
> + if (!ret)
> + DBPRT("connected to %s:%d\n", ip_addr, port);
> +
> + return ret;
> +}
> +
> +int do_read(struct socket *sock, char *buf, const size_t length)
> +{
> + struct msghdr msg;
> + struct iovec iov;
> + int ret = 0, received = 0, left = length;
> + mm_segment_t oldmm;
> +
> + memset(&msg, 0, sizeof(msg));
> + msg.msg_iov = &iov;
> + msg.msg_iovlen = 1;
> +
> + while (left > 0) {
> + oldmm = get_fs();
> + set_fs(KERNEL_DS);
> + msg.msg_iov->iov_base = buf + received;
> + msg.msg_iov->iov_len = left;
> + ret = sock_recvmsg(sock, &msg, left, MSG_WAITALL);
> + set_fs(oldmm);
> + if (ret <= 0)
> + break;
> + left -= ret;
> + received += ret;
> + }
> +
> + return ret;
> +}
> +
> +static void forward_iov(struct msghdr *msg, int len)
> +{
> + while (msg->msg_iov->iov_len <= len) {
> + len -= msg->msg_iov->iov_len;
> + msg->msg_iov++;
> + msg->msg_iovlen--;
> + }
> +
> + msg->msg_iov->iov_base = (char *) msg->msg_iov->iov_base + len;
> + msg->msg_iov->iov_len -= len;
> +}
> +
> +
> +static int do_write(struct socket *sock, struct msghdr *msg, int len)
> +{
> + int ret;
> + mm_segment_t oldmm;
> +
> +rewrite:
> + oldmm = get_fs();
> + set_fs(KERNEL_DS);
> + ret = sock_sendmsg(sock, msg, len);
> + set_fs(oldmm);
> +
> + if (ret < 0) {
> + if (ret == -EINTR)
> + goto rewrite;
> + if (ret == -EBUSY) {
> + DBPRT("busy\n");
> + goto rewrite;
> + }
> + DBPRT("failed to write to socket: %d\n", ret);
> + return -EFAULT;
> + }
> +
> + len -= ret;
> + if (len) {
> + forward_iov(msg, ret);
> + goto rewrite;
> + }
> +
> + return 0;
> +}
> +
> +int send_req(struct socket *sock, struct sd_req *hdr, void *data,
> + unsigned int wlen)
> +{
> + int ret;
> + struct msghdr msg;
> + struct iovec iov[2];
> +
> + memset(&msg, 0, sizeof(msg));
> +
> + msg.msg_iov = iov;
> +
> + msg.msg_iovlen = 1;
> + iov[0].iov_base = hdr;
> + iov[0].iov_len = sizeof(*hdr);
> +
> + if (wlen) {
> + msg.msg_iovlen++;
> + iov[1].iov_base = data;
> + iov[1].iov_len = wlen;
> + }
> +
> + ret = do_write(sock, &msg, sizeof(*hdr) + wlen);
> + if (ret) {
> + DBPRT("failed to send request %x, %d\n", hdr->opcode, wlen);
> + ret = -EFAULT;
> + }
> +
> + return ret;
> +}
> +
> +int exec_req(struct socket *sock, struct sd_req *hdr, void *data)
> +{
> + int ret;
> + struct sd_rsp *rsp = (struct sd_rsp *)hdr;
> + unsigned int wlen, rlen;
> +
> + if (hdr->flags & SD_FLAG_CMD_WRITE) {
> + wlen = hdr->data_length;
> + rlen = 0;
> + } else {
> + wlen = 0;
> + rlen = hdr->data_length;
> + }
> +
> + if (send_req(sock, hdr, data, wlen))
> + return -EFAULT;
> +
> + ret = do_read(sock, (char *)rsp, sizeof(*rsp));
> + if (ret < 0) {
> + DBPRT("failed to read a response\n");
> + return -EFAULT;
> + }
> +
> + if (rlen > rsp->data_length)
> + rlen = rsp->data_length;
> +
> + if (rlen) {
> + ret = do_read(sock, data, rlen);
> + if (ret < 0) {
> + DBPRT("failed to read the response data\n");
> + return -EFAULT;
> + }
> + }
> +
> + return 0;
> +}
> diff --git a/sheepdev/module.c b/sheepdev/module.c
> new file mode 100644
> index 0000000..bde57d3
> --- /dev/null
> +++ b/sheepdev/module.c
> @@ -0,0 +1,726 @@
> +/*
> + * Copyright (C) 2012 Taobao Inc.
> + *
> + * Levin Li <xingke.lwp at taobao.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/wait.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/slab.h>
> +#include <linux/fs.h>
> +#include <linux/genhd.h>
> +#include <linux/blkdev.h>
> +#include <linux/hdreg.h>
> +#include <linux/proc_fs.h>
> +#include <linux/kthread.h>
> +#include "sheep.h"
> +
> +static int sheepdev_major;
> +spinlock_t devices_lock;
> +struct list_head dev_list;
> +static unsigned long *device_bitmap;
> +static struct proc_dir_entry *sheep_proc_entry;
> +
> +static void sheepdev_get(struct sheepdev *dev)
> +{
> + atomic_inc(&dev->struct_refcnt);
> +}
> +
> +static void sheepdev_put(struct sheepdev *dev)
> +{
> + if (atomic_dec_and_test(&dev->struct_refcnt))
> + kfree(dev);
> +}
> +
> +static int add_request(struct sheepdev *dev, struct request *req, uint64_t oid,
> + int idx)
> +{
> + struct sheep_request *s_req = kmalloc(sizeof(*s_req), GFP_KERNEL);
> + if (!s_req)
> + return -EIO;
> +
> + s_req->req_id = dev->req_id;
> + s_req->req = req;
> + s_req->oid = oid;
> + s_req->idx = idx;
> + INIT_LIST_HEAD(&s_req->list);
> +
> + spin_lock_irq(&dev->fin_lock);
> + list_add_tail(&s_req->list, &dev->finish_list);
> + spin_unlock_irq(&dev->fin_lock);
> +
> + if (dev->req_id > UINT_MAX)
> + dev->req_id = 1;
> + else
> + dev->req_id++;
> +
> + return 0;
> +}
> +
> +static void sheep_end_request(struct request *req, int ret)
> +{
> + struct request_queue *q = req->q;
> + unsigned long flags;
> +
> + spin_lock_irqsave(q->queue_lock, flags);
> + __blk_end_request_all(req, ret);
> + spin_unlock_irqrestore(q->queue_lock, flags);
> +}
> +
> +static int sheep_handle_request(struct request *req)
> +{
> + struct req_iterator iter;
> + struct bio_vec *bvec;
> + struct gendisk *disk = req->rq_disk;
> + struct sheepdev *dev = disk->private_data;
> + unsigned long sector = blk_rq_pos(req);
> + unsigned long offset = sector * KERNEL_SECTOR_SIZE;
> + unsigned long nbytes = blk_rq_bytes(req);
> + int idx = offset / SHEEP_OBJECT_SIZE + 1;
> + uint64_t oid = vid_to_data_oid(dev->vid, idx);
> + uint64_t off = offset % SHEEP_OBJECT_SIZE;
> + int ret = 0, len = 0, create = 0;
> + int write = rq_data_dir(req);
> + void *sheep_buf = NULL;
> +
> + if (!write && dev->inode->data_vdi_id[idx] != dev->vid) {
> + rq_for_each_segment(bvec, req, iter) {
> + void *addr = kmap(bvec->bv_page);
> + memset(addr + bvec->bv_offset, 0, bvec->bv_len);
> + kunmap(bvec->bv_page);
> + }
> + sheep_end_request(req, 0);
> + return 0;
> + } else if (!write) {
> + ret = send_read_req(dev, oid, nbytes, off);
> + if (ret)
> + return -EIO;
> +
> + ret = add_request(dev, req, oid, idx);
> + if (ret)
> + return -EIO;
> +
> + return 0;
> + }
> +
> + /* For write requests */
> + sheep_buf = kmalloc(nbytes, GFP_KERNEL);
> + if (!sheep_buf)
> + return -EIO;
> +
> + spin_lock(&dev->creating_lock);
> + if (!dev->inode->data_vdi_id[idx]) {
> + dev->inode->data_vdi_id[idx] = 1;
> + create = 1;
> + spin_unlock(&dev->creating_lock);
> + } else if (dev->inode->data_vdi_id[idx] != dev->vid){
> +
> + spin_unlock(&dev->creating_lock);
> + wait_event_interruptible(dev->creating_wait,
> + dev->inode->data_vdi_id[idx] == dev->vid);
> + } else
> + spin_unlock(&dev->creating_lock);
> +
> + rq_for_each_segment(bvec, req, iter) {
> + void *addr = kmap(bvec->bv_page);
> +
> + memcpy(sheep_buf + len, addr + bvec->bv_offset, bvec->bv_len);
> + len += bvec->bv_len;
> +
> + if (rq_iter_last(req, iter)) {
> + ret = send_write_req(dev, oid, sheep_buf, len, off,
> + create);
> + if (ret != SD_RES_SUCCESS) {
> + kunmap(bvec->bv_page);
> + ret = -EIO;
> + goto out;
> + }
> +
> + ret = add_request(dev, req, oid, idx);
> + if (ret) {
> + kunmap(bvec->bv_page);
> + ret = -EIO;
> + goto out;
> + }
> +
> + if (!create)
> + goto done;
> +
> + /* For create operations we need to update inode data */
> + oid = vid_to_vdi_oid(dev->vid);
> + off = offsetof(struct sheepdog_inode, data_vdi_id);
> + off += sizeof(uint32_t) * idx;
> + ret = send_write_req(dev, oid, (char *)&dev->vid,
> + sizeof(dev->vid), off, 0);
> + if (ret != SD_RES_SUCCESS) {
> + kunmap(bvec->bv_page);
> + ret = -EIO;
> + goto out;
> + }
> +
> + ret = add_request(dev, req, oid, idx);
> + if (ret) {
> + kunmap(bvec->bv_page);
> + ret = -EIO;
> + goto out;
> + }
> +done:;
> + }
> +
> + kunmap(bvec->bv_page);
> + }
> +
> +out:
> + kfree(sheep_buf);
> + return ret;
> +}
> +
> +static void sheep_request(struct request_queue *rq)
> +{
It would be better to find another name; we already have a struct named
'sheep_request'.
Before merging this patch set, I'd expect us to support reading and writing
snapshot volumes too.
Thanks,
Yuan
More information about the sheepdog mailing list