[sheepdog] [PATCH RFC 1/2] sheepdev: linux kernel module of block device driver for sheepdog
levin li
levin108 at gmail.com
Thu Dec 27 15:00:41 CET 2012
From: levin li <xingke.lwp at taobao.com>
This module lets users attach sheepdog VDIs as block devices in Linux.
Once a VDI has been registered with the kernel it behaves just like a
newly added hard disk: users can create partitions on it, format it or
mount it. This gives users an efficient way to use sheepdog as a
distributed storage system.
Usage is simple: after the module sheepdev.ko is installed, it creates the
proc entry '/proc/sheep'. Write commands into that proc entry to control
the driver.
Add a new block device from an existing sheepdog VDI:
# echo "add 127.0.0.1:7070 a5d05d" > /proc/sheep
This creates the block device /dev/sheepa, which you can then format and mount:
# mkfs.ext4 /dev/sheepa
# mount -t ext4 /dev/sheepa test
Remove a block device from the kernel:
# echo "del sheepa" > /proc/sheep
Signed-off-by: levin li <xingke.lwp at taobao.com>
---
sheepdev/connect.c | 178 ++++++++++++
sheepdev/module.c | 726 ++++++++++++++++++++++++++++++++++++++++++++++
sheepdev/sheep.c | 136 +++++++++
sheepdev/sheep.h | 88 ++++++
sheepdev/sheepdog_proto.h | 290 ++++++++++++++++++
5 files changed, 1418 insertions(+)
create mode 100644 sheepdev/connect.c
create mode 100644 sheepdev/module.c
create mode 100644 sheepdev/sheep.c
create mode 100644 sheepdev/sheep.h
create mode 100644 sheepdev/sheepdog_proto.h
diff --git a/sheepdev/connect.c b/sheepdev/connect.c
new file mode 100644
index 0000000..009a7b9
--- /dev/null
+++ b/sheepdev/connect.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2012 Taobao Inc.
+ *
+ * Levin Li <xingke.lwp at taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sheep.h"
+#include "sheepdog_proto.h"
+
+/*
+ * Create a TCP socket and connect it to ip_addr:port.
+ *
+ * @sock:    receives the socket created by sock_create(); reset to NULL
+ *           when the connection attempt fails
+ * @ip_addr: IPv4 address string, converted with in_aton()
+ * @port:    TCP port in host byte order
+ *
+ * Returns 0 on success, otherwise the error reported by sock_create()
+ * or by the socket's connect operation.
+ */
+int connect_to(struct socket **sock, const char *ip_addr, int port)
+{
+	int ret;
+	struct sockaddr_in addr;
+
+	ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, sock);
+	if (ret) {
+		DBPRT("fail to create socket\n");
+		return ret;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sin_family = AF_INET;
+	addr.sin_port = htons(port);
+	addr.sin_addr.s_addr = in_aton(ip_addr);
+
+	ret = (*sock)->ops->connect(*sock, (struct sockaddr *)&addr,
+				    sizeof(addr), 0);
+	if (ret) {
+		/* nobody else knows about the socket yet: do not leak it */
+		sock_release(*sock);
+		*sock = NULL;
+		return ret;
+	}
+
+	DBPRT("connected to %s:%d\n", ip_addr, port);
+
+	return 0;
+}
+
+/*
+ * Read exactly @length bytes from @sock into the kernel buffer @buf.
+ *
+ * Returns the number of bytes received (== @length) on success, the
+ * negative error from sock_recvmsg() on failure, or -EIO when the peer
+ * closes the connection before @length bytes have arrived.
+ */
+int do_read(struct socket *sock, char *buf, const size_t length)
+{
+	struct msghdr msg;
+	struct iovec iov;
+	int ret = 0, received = 0, left = length;
+	mm_segment_t oldmm;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	while (left > 0) {
+		/* buf is a kernel address, so lift the user-space limit */
+		oldmm = get_fs();
+		set_fs(KERNEL_DS);
+		msg.msg_iov->iov_base = buf + received;
+		msg.msg_iov->iov_len = left;
+		ret = sock_recvmsg(sock, &msg, left, MSG_WAITALL);
+		set_fs(oldmm);
+
+		if (ret < 0)
+			return ret;
+		/* 0 means end of stream: report it instead of "success" */
+		if (ret == 0)
+			return -EIO;
+
+		left -= ret;
+		received += ret;
+	}
+
+	/* total byte count, not just the size of the last chunk */
+	return received;
+}
+
+/*
+ * Advance the iovec cursor of @msg by @len bytes: fully consumed
+ * entries are skipped, the first partially consumed entry is trimmed.
+ */
+static void forward_iov(struct msghdr *msg, int len)
+{
+	for (;;) {
+		struct iovec *vec = msg->msg_iov;
+
+		if (vec->iov_len > len)
+			break;
+
+		/* this entry was sent completely, move on to the next one */
+		len -= vec->iov_len;
+		msg->msg_iov = vec + 1;
+		msg->msg_iovlen -= 1;
+	}
+
+	msg->msg_iov->iov_len -= len;
+	msg->msg_iov->iov_base = (char *) msg->msg_iov->iov_base + len;
+}
+
+
+/*
+ * Send all @len bytes described by @msg over @sock, retrying after
+ * -EINTR/-EBUSY and resuming after short writes.
+ *
+ * Returns 0 once everything has been sent, -EFAULT on any other error.
+ */
+static int do_write(struct socket *sock, struct msghdr *msg, int len)
+{
+	for (;;) {
+		mm_segment_t saved_fs = get_fs();
+		int sent;
+
+		set_fs(KERNEL_DS);
+		sent = sock_sendmsg(sock, msg, len);
+		set_fs(saved_fs);
+
+		if (sent == -EINTR)
+			continue;
+
+		if (sent == -EBUSY) {
+			DBPRT("busy\n");
+			continue;
+		}
+
+		if (sent < 0) {
+			DBPRT("failed to write to socket: %d\n", sent);
+			return -EFAULT;
+		}
+
+		len -= sent;
+		if (!len)
+			return 0;
+
+		/* short write: skip what already went out and try again */
+		forward_iov(msg, sent);
+	}
+}
+
+/*
+ * Send the request header @hdr, followed by @wlen bytes of @data when
+ * @wlen is non-zero.
+ *
+ * Returns 0 on success, -EFAULT when the write fails.
+ */
+int send_req(struct socket *sock, struct sd_req *hdr, void *data,
+	     unsigned int wlen)
+{
+	struct iovec vec[2];
+	struct msghdr msg;
+
+	memset(&msg, 0, sizeof(msg));
+
+	/* first segment: the fixed-size header */
+	vec[0].iov_base = hdr;
+	vec[0].iov_len = sizeof(*hdr);
+	msg.msg_iov = vec;
+	msg.msg_iovlen = 1;
+
+	/* optional second segment: the payload */
+	if (wlen) {
+		vec[1].iov_base = data;
+		vec[1].iov_len = wlen;
+		msg.msg_iovlen++;
+	}
+
+	if (do_write(sock, &msg, sizeof(*hdr) + wlen)) {
+		DBPRT("failed to send request %x, %d\n", hdr->opcode, wlen);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * Synchronous request/response round trip.
+ *
+ * The response header overwrites @hdr in place. For write requests
+ * (SD_FLAG_CMD_WRITE) @data is sent; otherwise up to hdr->data_length
+ * bytes of response payload are read into @data.
+ *
+ * Returns 0 on success, -EFAULT on any send or receive failure.
+ */
+int exec_req(struct socket *sock, struct sd_req *hdr, void *data)
+{
+	struct sd_rsp *rsp = (struct sd_rsp *)hdr;
+	int is_write = hdr->flags & SD_FLAG_CMD_WRITE;
+	unsigned int wlen = is_write ? hdr->data_length : 0;
+	unsigned int rlen = is_write ? 0 : hdr->data_length;
+
+	if (send_req(sock, hdr, data, wlen))
+		return -EFAULT;
+
+	if (do_read(sock, (char *)rsp, sizeof(*rsp)) < 0) {
+		DBPRT("failed to read a response\n");
+		return -EFAULT;
+	}
+
+	/* never read more payload than the response announces */
+	if (rlen > rsp->data_length)
+		rlen = rsp->data_length;
+
+	if (rlen && do_read(sock, data, rlen) < 0) {
+		DBPRT("failed to read the response data\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
diff --git a/sheepdev/module.c b/sheepdev/module.c
new file mode 100644
index 0000000..bde57d3
--- /dev/null
+++ b/sheepdev/module.c
@@ -0,0 +1,726 @@
+/*
+ * Copyright (C) 2012 Taobao Inc.
+ *
+ * Levin Li <xingke.lwp at taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/proc_fs.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include "sheep.h"
+
+/* major number returned by register_blkdev() in sheep_module_init() */
+static int sheepdev_major;
+/* protects dev_list and the search/set of device_bitmap */
+spinlock_t devices_lock;
+/* all registered devices, linked through sheepdev.dev_list */
+struct list_head dev_list;
+/* one bit per allocated sheepdev.minor, SHEEP_BLKDEV_MINORS bits in total */
+static unsigned long *device_bitmap;
+/* the control file created from PROC_ENTRY_NAME */
+static struct proc_dir_entry *sheep_proc_entry;
+
+/* Take one reference on @dev; paired with sheepdev_put(). */
+static void sheepdev_get(struct sheepdev *dev)
+{
+ atomic_inc(&dev->struct_refcnt);
+}
+
+/*
+ * Drop one reference on @dev. When the last reference goes away the
+ * cached inode (vmalloc'ed by sheep_vdi_setup()) is released together
+ * with the device structure itself.
+ */
+static void sheepdev_put(struct sheepdev *dev)
+{
+	if (!atomic_dec_and_test(&dev->struct_refcnt))
+		return;
+
+	/* vfree(NULL) is a no-op, so a device without inode is fine */
+	vfree(dev->inode);
+	kfree(dev);
+}
+
+/*
+ * Remember an in-flight sheepdog request so that fin_process_func() can
+ * match the response (by request id) with the block-layer request.
+ *
+ * @dev: device the request was sent on
+ * @req: block-layer request waiting for the response
+ * @oid: object id the request was addressed to
+ * @idx: index into inode->data_vdi_id[] (used for inode updates)
+ *
+ * The entry is tagged with the current dev->req_id, which is then
+ * advanced for the next request. Returns 0 on success, -EIO when the
+ * tracking entry cannot be allocated.
+ */
+static int add_request(struct sheepdev *dev, struct request *req, uint64_t oid,
+		       int idx)
+{
+	struct sheep_request *s_req = kmalloc(sizeof(*s_req), GFP_KERNEL);
+	if (!s_req)
+		return -EIO;
+
+	s_req->req_id = dev->req_id;
+	s_req->req = req;
+	s_req->oid = oid;
+	s_req->idx = idx;
+	INIT_LIST_HEAD(&s_req->list);
+
+	spin_lock_irq(&dev->fin_lock);
+	list_add_tail(&s_req->list, &dev->finish_list);
+	spin_unlock_irq(&dev->fin_lock);
+
+	/*
+	 * req_id is an unsigned int, so "req_id > UINT_MAX" can never be
+	 * true; test for the last value instead so the id restarts at 1
+	 * (as in dev_setup()) rather than silently wrapping to 0.
+	 */
+	if (dev->req_id == UINT_MAX)
+		dev->req_id = 1;
+	else
+		dev->req_id++;
+
+	return 0;
+}
+
+/*
+ * Complete @req with status @ret (0 or a negative errno).
+ * __blk_end_request_all() is called with the queue lock taken here, so
+ * callers must not already hold req->q->queue_lock.
+ */
+static void sheep_end_request(struct request *req, int ret)
+{
+ struct request_queue *q = req->q;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ __blk_end_request_all(req, ret);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+/*
+ * Turn one block-layer request into sheepdog object I/O.
+ *
+ * - Read of an object whose data_vdi_id[] entry is not this VDI: the
+ *   request is zero-filled and completed right here.
+ * - Other reads: a read request is sent and the request is queued on
+ *   finish_list through add_request().
+ * - Writes: the segments are gathered into one temporary buffer and sent
+ *   with the last segment; when the object is created by this write, a
+ *   second write updates the matching data_vdi_id[] slot of the inode
+ *   object.
+ *
+ * Returns 0 when the request was completed or queued, -EIO otherwise (the
+ * caller, req_process_func(), then ends the request with that error).
+ */
+static int sheep_handle_request(struct request *req)
+{
+ struct req_iterator iter;
+ struct bio_vec *bvec;
+ struct gendisk *disk = req->rq_disk;
+ struct sheepdev *dev = disk->private_data;
+ unsigned long sector = blk_rq_pos(req);
+ unsigned long offset = sector * KERNEL_SECTOR_SIZE;
+ unsigned long nbytes = blk_rq_bytes(req);
+ /*
+  * Object indexes used here start at 1; sheep_vdi_setup() shrinks the
+  * exported size by one object, presumably for this reason - TODO confirm.
+  */
+ int idx = offset / SHEEP_OBJECT_SIZE + 1;
+ uint64_t oid = vid_to_data_oid(dev->vid, idx);
+ uint64_t off = offset % SHEEP_OBJECT_SIZE;
+ int ret = 0, len = 0, create = 0;
+ int write = rq_data_dir(req);
+ void *sheep_buf = NULL;
+
+ if (!write && dev->inode->data_vdi_id[idx] != dev->vid) {
+ /* object not owned by this VDI: answer the read with zeroes */
+ rq_for_each_segment(bvec, req, iter) {
+ void *addr = kmap(bvec->bv_page);
+ memset(addr + bvec->bv_offset, 0, bvec->bv_len);
+ kunmap(bvec->bv_page);
+ }
+ sheep_end_request(req, 0);
+ return 0;
+ } else if (!write) {
+ ret = send_read_req(dev, oid, nbytes, off);
+ if (ret)
+ return -EIO;
+
+ ret = add_request(dev, req, oid, idx);
+ if (ret)
+ return -EIO;
+
+ return 0;
+ }
+
+ /* For write requests */
+ sheep_buf = kmalloc(nbytes, GFP_KERNEL);
+ if (!sheep_buf)
+ return -EIO;
+
+ /*
+  * data_vdi_id[idx] doubles as a creation state: 0 = not allocated,
+  * 1 = creation started here, then incremented by fin_process_func()
+  * until it is finally replaced by dev->vid.
+  * NOTE(review): the values 1 and 2 look indistinguishable from real
+  * VDI ids 1 and 2 - confirm these ids cannot occur.
+  */
+ spin_lock(&dev->creating_lock);
+ if (!dev->inode->data_vdi_id[idx]) {
+ dev->inode->data_vdi_id[idx] = 1;
+ create = 1;
+ spin_unlock(&dev->creating_lock);
+ } else if (dev->inode->data_vdi_id[idx] != dev->vid){
+
+ spin_unlock(&dev->creating_lock);
+ /*
+  * Someone else is creating this object: wait for completion.
+  * NOTE(review): the return value (interruption) is ignored.
+  */
+ wait_event_interruptible(dev->creating_wait,
+ dev->inode->data_vdi_id[idx] == dev->vid);
+ } else
+ spin_unlock(&dev->creating_lock);
+
+ rq_for_each_segment(bvec, req, iter) {
+ void *addr = kmap(bvec->bv_page);
+
+ memcpy(sheep_buf + len, addr + bvec->bv_offset, bvec->bv_len);
+ len += bvec->bv_len;
+
+ /* everything is gathered: send it with the last segment */
+ if (rq_iter_last(req, iter)) {
+ ret = send_write_req(dev, oid, sheep_buf, len, off,
+ create);
+ if (ret != SD_RES_SUCCESS) {
+ kunmap(bvec->bv_page);
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = add_request(dev, req, oid, idx);
+ if (ret) {
+ kunmap(bvec->bv_page);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!create)
+ goto done;
+
+ /* For create operations we need to update inode data */
+ oid = vid_to_vdi_oid(dev->vid);
+ off = offsetof(struct sheepdog_inode, data_vdi_id);
+ off += sizeof(uint32_t) * idx;
+ ret = send_write_req(dev, oid, (char *)&dev->vid,
+ sizeof(dev->vid), off, 0);
+ if (ret != SD_RES_SUCCESS) {
+ kunmap(bvec->bv_page);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* second tracking entry for the same block request */
+ ret = add_request(dev, req, oid, idx);
+ if (ret) {
+ kunmap(bvec->bv_page);
+ ret = -EIO;
+ goto out;
+ }
+done:;
+ }
+
+ kunmap(bvec->bv_page);
+ }
+
+ /*
+  * NOTE(review): on the error exits above data_vdi_id[idx] is left at
+  * its "creating" value when create was set - verify that waiters on
+  * creating_wait cannot get stuck.
+  */
+out:
+ kfree(sheep_buf);
+ return ret;
+}
+
+/*
+ * Request function of the queue (registered through blk_init_queue()).
+ *
+ * Moves every fetched filesystem request onto the device's pending_list
+ * and wakes the request thread; any other request type is failed with
+ * -EIO.
+ */
+static void sheep_request(struct request_queue *rq)
+{
+	struct request *req;
+	struct sheepdev *dev;
+
+	while ((req = blk_fetch_request(rq)) != NULL) {
+		if (req->cmd_type != REQ_TYPE_FS) {
+			DBPRT("Skip non-fs request\n");
+			__blk_end_request_all(req, -EIO);
+			/* the request is finished: it must not be queued */
+			continue;
+		}
+
+		dev = req->rq_disk->private_data;
+
+		spin_lock(&dev->req_lock);
+		list_add_tail(&req->queuelist, &dev->pending_list);
+		spin_unlock(&dev->req_lock);
+
+		wake_up_interruptible(&dev->req_wait);
+	}
+}
+
+/*
+ * Body of the "sheep_req" thread: takes requests off pending_list one
+ * by one and submits them with sheep_handle_request(). Runs until
+ * kthread_stop() is called and pending_list has been drained.
+ */
+static int req_process_func(void *data)
+{
+	struct sheepdev *dev = data;
+	struct request *next;
+	int ret;
+
+	sheepdev_get(dev);
+
+	while (!kthread_should_stop() || !list_empty(&dev->pending_list)) {
+		wait_event_interruptible(dev->req_wait,
+					 !list_empty(&dev->pending_list) ||
+					 kthread_should_stop());
+
+		/* detach the oldest pending request, if there is one */
+		next = NULL;
+		spin_lock(&dev->req_lock);
+		if (!list_empty(&dev->pending_list)) {
+			next = list_first_entry(&dev->pending_list,
+						struct request, queuelist);
+			list_del_init(&next->queuelist);
+		}
+		spin_unlock(&dev->req_lock);
+
+		if (!next)
+			continue;
+
+		ret = sheep_handle_request(next);
+		if (ret)
+			sheep_end_request(next, ret);
+		else
+			wake_up_interruptible(&dev->fin_wait);
+	}
+
+	sheepdev_put(dev);
+
+	return 0;
+}
+
+/*
+ * Block-device open hook: counts one more opener in device_refcnt (under
+ * dev_lock). process_del_command() refuses to delete a device while this
+ * count is non-zero. Always returns 0.
+ */
+static int sheepdev_open(struct block_device *blkdev, fmode_t mode)
+{
+ struct gendisk *disk = blkdev->bd_disk;
+ struct sheepdev *dev = disk->private_data;
+
+ spin_lock(&dev->dev_lock);
+ dev->device_refcnt++;
+ spin_unlock(&dev->dev_lock);
+
+ return 0;
+}
+
+/*
+ * Block-device release hook: drops the opener counted by sheepdev_open().
+ * Always returns 0.
+ */
+static int sheepdev_release(struct gendisk *disk, fmode_t mode)
+{
+ struct sheepdev *dev = disk->private_data;
+
+ spin_lock(&dev->dev_lock);
+ dev->device_refcnt--;
+ spin_unlock(&dev->dev_lock);
+
+ return 0;
+}
+
+/* Operations installed as disk->fops in sheep_add_disk(). */
+static struct block_device_operations sheepdev_ops = {
+ .owner = THIS_MODULE,
+ .open = sheepdev_open,
+ .release = sheepdev_release,
+};
+
+/*
+ * Allocate the gendisk and request queue of @dev and publish the disk
+ * as SHEEP_BLKDEV_NAME followed by a letter derived from dev->minor.
+ *
+ * Returns 0 on success, -EBUSY when the gendisk cannot be allocated and
+ * -ENOMEM when the request queue cannot be allocated. Nothing stays
+ * allocated on failure.
+ */
+static int sheep_add_disk(struct sheepdev *dev)
+{
+	struct request_queue *queue;
+
+	dev->disk = alloc_disk(SHEEP_BLKDEV_MINORS);
+	if (!dev->disk) {
+		DBPRT("allocate gendisk failure\n");
+		return -EBUSY;
+	}
+
+	queue = blk_init_queue(sheep_request, &dev->que_lock);
+	if (!queue) {
+		/* blk_init_queue() returns NULL on failure */
+		DBPRT("allocate request queue failure\n");
+		put_disk(dev->disk);
+		dev->disk = NULL;
+		return -ENOMEM;
+	}
+
+	/* 4M boundary */
+	blk_queue_segment_boundary(queue, 0x3fffff);
+	dev->disk->major = sheepdev_major;
+	dev->disk->first_minor = dev->minor * SHEEP_BLKDEV_MINORS;
+	dev->disk->queue = queue;
+	dev->disk->fops = &sheepdev_ops;
+	dev->disk->private_data = dev;
+	snprintf(dev->disk->disk_name, sizeof(dev->disk->disk_name),
+		 SHEEP_BLKDEV_NAME"%c", dev->minor + 'a');
+
+	set_capacity(dev->disk, dev->sectors);
+	add_disk(dev->disk);
+
+	return 0;
+}
+
+/*
+ * Look up the in-flight entry whose req_id equals @id and unlink it
+ * from finish_list. Returns the entry (now owned by the caller) or
+ * NULL when no entry matches.
+ */
+static struct sheep_request *find_request(struct sheepdev *dev, int id)
+{
+	struct sheep_request *cur, *tmp, *found = NULL;
+
+	spin_lock_irq(&dev->fin_lock);
+	list_for_each_entry_safe(cur, tmp, &dev->finish_list, list) {
+		if (cur->req_id == id) {
+			list_del_init(&cur->list);
+			found = cur;
+			break;
+		}
+	}
+	spin_unlock_irq(&dev->fin_lock);
+
+	return found;
+}
+
+/*
+ * Read one response (header plus optional payload) from the device
+ * socket.
+ *
+ * @req_id: receives rsp.id
+ * @result: receives rsp.result
+ * @data:   receives a kmalloc'ed copy of the payload (to be kfree'd by
+ *          the caller), or NULL when the response has no payload
+ *
+ * Returns 0 on success, -EIO when the header or payload cannot be read
+ * completely, -ENOMEM when the payload buffer cannot be allocated. All
+ * three outputs are zeroed/NULL on failure.
+ */
+static int read_reply(struct sheepdev *dev, int *req_id, int *result,
+		      void **data)
+{
+	int ret;
+	struct sd_rsp rsp;
+	void *buf = NULL;
+
+	*result = 0;
+	*req_id = 0;
+	*data = NULL;
+
+	ret = do_read(dev->sock, (char *)&rsp, sizeof(rsp));
+	if (ret <= 0) {
+		/* 0 means the stream ended and rsp was never filled in */
+		DBPRT("failed to read response\n");
+		return -EIO;
+	}
+
+	if (rsp.data_length > 0) {
+		buf = kmalloc(rsp.data_length, GFP_KERNEL);
+		if (!buf) {
+			DBPRT("No-mem\n");
+			return -ENOMEM;
+		}
+
+		ret = do_read(dev->sock, buf, rsp.data_length);
+		if (ret < 0 || (uint32_t)ret != rsp.data_length) {
+			kfree(buf);
+			return -EIO;
+		}
+	}
+
+	*req_id = rsp.id;
+	*result = rsp.result;
+	*data = buf;
+
+	return 0;
+}
+
+/*
+ * Fail every request still waiting for a response: each entry is
+ * unlinked from finish_list, its block request is ended with -EIO and
+ * the entry is freed.
+ *
+ * NOTE(review): a creating write queues two entries for one block
+ * request, so that request looks like it gets ended twice here - verify.
+ */
+static void cleanup_finish_list(struct sheepdev *dev)
+{
+	spin_lock(&dev->fin_lock);
+	while (!list_empty(&dev->finish_list)) {
+		struct sheep_request *stale;
+
+		stale = list_first_entry(&dev->finish_list,
+					 struct sheep_request, list);
+		list_del_init(&stale->list);
+		sheep_end_request(stale->req, -EIO);
+		kfree(stale);
+	}
+	spin_unlock(&dev->fin_lock);
+}
+
+/*
+ * Body of the "sheep_fin" thread: reads responses from the socket,
+ * matches them with the entries on finish_list and completes the block
+ * requests. Runs until kthread_stop() is called and finish_list is
+ * empty.
+ *
+ * Writes that create an object produce two responses (object write and
+ * inode update). inode->data_vdi_id[idx] counts them: the first one
+ * bumps the marker, the second one stores dev->vid, ends the request
+ * and wakes writers sleeping on creating_wait.
+ *
+ * NOTE(review): read_reply() does not report the payload length, so the
+ * read path below assumes the payload covers the whole request - confirm
+ * that the server never returns a shorter payload.
+ */
+static int fin_process_func(void *data)
+{
+	struct sheepdev *dev = data;
+	struct sheep_request *sheep_req;
+	struct request *req;
+	int ret, req_id, res;
+
+	sheepdev_get(dev);
+
+	while (!kthread_should_stop() || !list_empty(&dev->finish_list)) {
+		void *buf = NULL;
+
+		wait_event_interruptible(dev->fin_wait,
+					 !list_empty(&dev->finish_list) ||
+					 kthread_should_stop());
+
+		/* only touch the socket while a response is expected */
+		spin_lock_irq(&dev->fin_lock);
+		if (list_empty(&dev->finish_list)) {
+			spin_unlock_irq(&dev->fin_lock);
+			continue;
+		}
+		spin_unlock_irq(&dev->fin_lock);
+
+		ret = read_reply(dev, &req_id, &res, &buf);
+		if (ret) {
+			cleanup_finish_list(dev);
+			continue;
+		}
+
+		sheep_req = find_request(dev, req_id);
+		if (!sheep_req)
+			goto next;
+		req = sheep_req->req;
+
+		if (rq_data_dir(req)) {
+			int idx;
+
+			res = (res != SD_RES_SUCCESS) ? -EIO : 0;
+			if (sheep_req->oid == vid_to_vdi_oid(dev->vid)) {
+				/* inode-update response */
+				idx = sheep_req->idx;
+			} else {
+				/* ordinary write response */
+				idx = data_oid_to_idx(sheep_req->oid);
+
+				/* obj already exist */
+				if (dev->inode->data_vdi_id[idx] == dev->vid) {
+					sheep_end_request(req, res);
+					goto next;
+				}
+			}
+
+			spin_lock(&dev->creating_lock);
+			if (dev->inode->data_vdi_id[idx] == 2) {
+				/*
+				 * Both obj-write and inode-update are complete
+				 * we can end the write request and wake other
+				 * requests waiting for this object.
+				 */
+				dev->inode->data_vdi_id[idx] = dev->vid;
+				spin_unlock(&dev->creating_lock);
+
+				sheep_end_request(req, res);
+				wake_up_interruptible(&dev->creating_wait);
+
+				goto next;
+			} else {
+				/*
+				 * wait for obj-write or inode-update to complete
+				 */
+				dev->inode->data_vdi_id[idx]++;
+			}
+			spin_unlock(&dev->creating_lock);
+
+		} else {
+			int len = 0;
+			struct req_iterator iter;
+			struct bio_vec *bvec;
+
+			/*
+			 * A "successful" read without payload leaves buf
+			 * NULL; fail the request instead of copying from it.
+			 */
+			if (res != SD_RES_SUCCESS || !buf) {
+				sheep_end_request(req, -EIO);
+				goto next;
+			}
+
+			rq_for_each_segment(bvec, req, iter) {
+				void *addr = kmap(bvec->bv_page);
+				memcpy(addr + bvec->bv_offset, buf + len,
+				       bvec->bv_len);
+				len += bvec->bv_len;
+				kunmap(bvec->bv_page);
+			}
+			sheep_end_request(req, 0);
+		}
+next:
+		/* both are NULL-safe: no payload / no matching entry */
+		kfree(buf);
+		kfree(sheep_req);
+	}
+
+	sheepdev_put(dev);
+	return 0;
+}
+
+/*
+ * Bring a freshly parsed device to life: connect and load the inode
+ * (sheep_vdi_setup()), initialise locks, wait queues and lists, start
+ * the request and completion threads and register the disk.
+ *
+ * Returns 0 on success or a negative errno. On failure everything
+ * acquired here (threads, socket, inode) has been released again, so
+ * the caller only has to free @dev itself.
+ */
+static int dev_setup(struct sheepdev *dev)
+{
+	int ret;
+
+	ret = sheep_vdi_setup(dev);
+	if (ret)
+		return ret;
+
+	spin_lock_init(&dev->que_lock);
+	spin_lock_init(&dev->req_lock);
+	spin_lock_init(&dev->fin_lock);
+	spin_lock_init(&dev->dev_lock);
+	spin_lock_init(&dev->creating_lock);
+	init_waitqueue_head(&dev->req_wait);
+	init_waitqueue_head(&dev->fin_wait);
+	init_waitqueue_head(&dev->creating_wait);
+	INIT_LIST_HEAD(&dev->pending_list);
+	INIT_LIST_HEAD(&dev->finish_list);
+	INIT_LIST_HEAD(&dev->dev_list);
+
+	dev->req_id = 1;
+
+	/*
+	 * Temporary reference: each thread takes a reference when it
+	 * starts and drops it when it exits. Without this one, stopping
+	 * the threads on the error path could free dev under our feet.
+	 */
+	sheepdev_get(dev);
+
+	dev->req_thread = kthread_run(req_process_func, dev,
+				      "sheep_req");
+	if (IS_ERR(dev->req_thread)) {
+		ret = PTR_ERR(dev->req_thread);
+		goto err_release;
+	}
+
+	dev->fin_thread = kthread_run(fin_process_func, dev,
+				      "sheep_fin");
+	if (IS_ERR(dev->fin_thread)) {
+		ret = PTR_ERR(dev->fin_thread);
+		goto err_stop_req;
+	}
+
+	ret = sheep_add_disk(dev);
+	if (ret)
+		goto err_stop_fin;
+
+	/*
+	 * Drop the temporary reference without freeing: the threads may
+	 * not have taken theirs yet and the caller still uses dev.
+	 */
+	atomic_dec(&dev->struct_refcnt);
+
+	return 0;
+
+err_stop_fin:
+	kthread_stop(dev->fin_thread);
+err_stop_req:
+	kthread_stop(dev->req_thread);
+err_release:
+	/* see above: the caller frees dev, so do not use sheepdev_put() */
+	atomic_dec(&dev->struct_refcnt);
+	sock_release(dev->sock);
+	dev->sock = NULL;
+	vfree(dev->inode);
+	dev->inode = NULL;
+	return ret;
+}
+
+#define MAX_CMD_LEN 64
+
+/*
+ * Handle "add <ip>[:<port>] <vid>": parse the arguments, allocate a
+ * minor, set the device up and link it into dev_list.
+ *
+ * @buf: text following "add " (modified while parsing)
+ * @len: number of valid bytes in @buf; nothing beyond it is read
+ *
+ * <port> defaults to SD_LISTEN_PORT, <vid> is parsed as hexadecimal.
+ * Returns 0 on success, -EINVAL for malformed input, -ENOMEM or -ENOSPC
+ * when no memory or no free minor is available, or the error reported
+ * by dev_setup().
+ */
+static int process_add_command(char *buf, int len)
+{
+	int i, ret = 0;
+	struct sheepdev *dev;
+	char vid_str[20];
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	/* the address ends at the first ' ' or ':' */
+	for (i = 0; i < len && buf[i] != '\0' && buf[i] != '\n' &&
+	     buf[i] != ' ' && buf[i] != ':'; i++)
+		;
+
+	/* it must be terminated inside the input and fit into ip_addr */
+	if (i >= len || (buf[i] != ' ' && buf[i] != ':') ||
+	    i >= (int)sizeof(dev->ip_addr)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	memcpy(dev->ip_addr, buf, i);
+	dev->ip_addr[i] = '\0';
+	if (buf[i] == ' ') {
+		dev->port = SD_LISTEN_PORT;
+		buf = &buf[i + 1];
+		len -= (i + 1);
+	} else {
+		/* start from ':' to ' ' */
+		char *tmp = &buf[i + 1];
+
+		len -= (i + 1);
+		for (i = 0; i < len && tmp[i] != ' ' && tmp[i] != '\0' &&
+		     tmp[i] != '\n'; i++)
+			;
+		if (i >= len || tmp[i] != ' ') {
+			ret = -EINVAL;
+			goto out;
+		}
+		tmp[i] = '\0';
+		dev->port = simple_strtol(tmp, NULL, 10);
+		buf = &tmp[i + 1];
+		len -= (i + 1);
+	}
+
+	/*
+	 * The input is not guaranteed to be NUL-terminated, so copy the
+	 * vid token into a bounded, terminated buffer before parsing it.
+	 */
+	for (i = 0; i < len && i < (int)sizeof(vid_str) - 1 &&
+	     buf[i] != '\0' && buf[i] != '\n' && buf[i] != ' '; i++)
+		vid_str[i] = buf[i];
+	vid_str[i] = '\0';
+	if (i == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	dev->vid = simple_strtol(vid_str, NULL, 16);
+
+	spin_lock(&devices_lock);
+	dev->minor = find_next_zero_bit(device_bitmap, SHEEP_BLKDEV_MINORS, 0);
+	if (dev->minor >= SHEEP_BLKDEV_MINORS) {
+		/* every minor is taken */
+		spin_unlock(&devices_lock);
+		ret = -ENOSPC;
+		goto out;
+	}
+	set_bit(dev->minor, device_bitmap);
+	spin_unlock(&devices_lock);
+
+	ret = dev_setup(dev);
+	if (ret) {
+		clear_bit(dev->minor, device_bitmap);
+		goto out;
+	}
+
+	sheepdev_get(dev);
+	spin_lock(&devices_lock);
+	list_add_tail(&dev->dev_list, &dev_list);
+	spin_unlock(&devices_lock);
+
+	return 0;
+out:
+	kfree(dev);
+	return ret;
+}
+
+/*
+ * Tear down a device that has already been unlinked from dev_list:
+ * stop both worker threads, remove the disk, release the minor and the
+ * socket and drop the list's reference on @dev.
+ *
+ * kthread_stop() sleeps, so this must not be called with a spinlock
+ * held.
+ */
+static void remove_device(struct sheepdev *dev)
+{
+	DBPRT("remove device /dev/%s\n", dev->disk->disk_name);
+
+	/* kthread_stop() wakes the thread and waits until it has exited */
+	kthread_stop(dev->req_thread);
+	kthread_stop(dev->fin_thread);
+
+	/* unregister the disk before its queue is torn down */
+	del_gendisk(dev->disk);
+	blk_cleanup_queue(dev->disk->queue);
+	put_disk(dev->disk);
+
+	clear_bit(dev->minor, device_bitmap);
+	/* counterpart of sock_create(): also frees the struct socket */
+	sock_release(dev->sock);
+
+	sheepdev_put(dev);
+}
+
+/*
+ * Handle "del <disk name>\n": remove the device whose disk name
+ * matches, unless it is currently open.
+ *
+ * @buf: text following "del "; its trailing newline is overwritten
+ * @len: number of valid bytes in @buf
+ *
+ * Returns 0 on success (also when no device matches), -EINVAL when the
+ * argument is empty or not newline-terminated, -EBUSY when the device
+ * is still open.
+ */
+static int process_del_command(char *buf, int len)
+{
+	struct sheepdev *dev, *t, *victim = NULL;
+	int ret = 0;
+
+	if (len <= 0 || buf[len - 1] != '\n')
+		return -EINVAL;
+	buf[len - 1] = '\0';
+
+	spin_lock(&devices_lock);
+	list_for_each_entry_safe(dev, t, &dev_list, dev_list) {
+		if (strcmp(buf, dev->disk->disk_name) != 0)
+			continue;
+
+		spin_lock(&dev->dev_lock);
+		if (dev->device_refcnt) {
+			ret = -EBUSY;
+		} else {
+			list_del_init(&dev->dev_list);
+			victim = dev;
+		}
+		spin_unlock(&dev->dev_lock);
+
+		break;
+	}
+	spin_unlock(&devices_lock);
+
+	/* remove_device() sleeps: only call it after dropping the lock */
+	if (victim)
+		remove_device(victim);
+
+	return ret;
+}
+
+/*
+ * Write handler of the proc control file. Accepted commands:
+ *   "add <ip>[:<port>] <vid>"  and  "del <disk name>\n"
+ *
+ * Returns @len on success, -EINVAL for an empty, oversized or malformed
+ * command, -EFAULT when the user buffer cannot be copied, -ENOMEM on
+ * allocation failure, or the error of the command handler.
+ */
+static ssize_t sheep_proc_write(struct file *filp, const char __user *buf,
+				size_t len, loff_t *offset)
+{
+	char *kern_buf, cmd_buf[MAX_CMD_LEN];
+	size_t i;
+	int ret;
+
+	/* commands are short; also keeps the int lengths below in range */
+	if (len == 0 || len > PAGE_SIZE)
+		return -EINVAL;
+
+	/* one extra byte so that the copy is always NUL-terminated */
+	kern_buf = kmalloc(len + 1, GFP_KERNEL);
+	if (!kern_buf)
+		return -ENOMEM;
+
+	if (copy_from_user(kern_buf, buf, len)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	kern_buf[len] = '\0';
+
+	/* the command word ends at the first blank */
+	for (i = 0; i < len && kern_buf[i] != '\0' && kern_buf[i] != '\n' &&
+	     kern_buf[i] != ' '; i++)
+		;
+
+	/* ">=": cmd_buf also needs room for the terminating NUL */
+	if (i >= MAX_CMD_LEN || i >= len || kern_buf[i] != ' ') {
+		ret = -EINVAL;
+		goto out;
+	}
+	memcpy(cmd_buf, kern_buf, i);
+	cmd_buf[i] = '\0';
+
+	if (strcmp(cmd_buf, "add") == 0)
+		ret = process_add_command(&kern_buf[i + 1], len - i - 1);
+	else if (strcmp(cmd_buf, "del") == 0)
+		ret = process_del_command(&kern_buf[i + 1], len - i - 1);
+	else
+		ret = -EINVAL;
+
+	if (!ret)
+		ret = len;
+out:
+	kfree(kern_buf);
+	return ret;
+}
+
+/* File operations of the proc control entry: it is write-only. */
+static struct file_operations sheep_proc_fops = {
+ .write = sheep_proc_write,
+};
+
+/*
+ * Module entry point: allocates the minor bitmap, registers the block
+ * major and finally creates the proc control entry.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * acquired so far is released again.
+ *
+ * NOTE(review): the proc entry is created with S_IWUGO, i.e. writable
+ * by every user - confirm that this is intended.
+ */
+static int __init sheep_module_init(void)
+{
+	int ret;
+
+	DBPRT("Block device driver for Sheepdog\n");
+
+	spin_lock_init(&devices_lock);
+	INIT_LIST_HEAD(&dev_list);
+	device_bitmap = kzalloc(SHEEP_BLKDEV_MINORS / 8, GFP_KERNEL);
+	if (!device_bitmap)
+		return -ENOMEM;
+
+	sheepdev_major = register_blkdev(0, SHEEP_BLKDEV_NAME);
+	if (sheepdev_major < 0) {
+		ret = sheepdev_major;
+		goto err_bitmap;
+	}
+
+	/*
+	 * create proc entry for sheep control; done last so that no
+	 * command can arrive before the major number is known
+	 */
+	sheep_proc_entry = create_proc_entry(PROC_ENTRY_NAME,
+					     S_IFREG | S_IRUGO | S_IWUGO, NULL);
+	if (!sheep_proc_entry) {
+		ret = -ENOMEM;
+		goto err_blkdev;
+	}
+
+	sheep_proc_entry->proc_fops = &sheep_proc_fops;
+
+	return 0;
+
+err_blkdev:
+	unregister_blkdev(sheepdev_major, SHEEP_BLKDEV_NAME);
+err_bitmap:
+	kfree(device_bitmap);
+	device_bitmap = NULL;
+	return ret;
+}
+
+/*
+ * Module exit: removes the control entry, tears down every remaining
+ * device, then releases the block major and the minor bitmap.
+ */
+static void __exit sheep_module_exit(void)
+{
+	struct sheepdev *dev, *t;
+
+	/*
+	 * Remove the control file first so that no new "add"/"del"
+	 * command can run while the device list is being emptied.
+	 */
+	remove_proc_entry(PROC_ENTRY_NAME, NULL);
+
+	list_for_each_entry_safe(dev, t, &dev_list, dev_list) {
+		list_del_init(&dev->dev_list);
+		remove_device(dev);
+	}
+
+	unregister_blkdev(sheepdev_major, SHEEP_BLKDEV_NAME);
+
+	kfree(device_bitmap);
+
+	DBPRT("Sheepdog Block Device Removed.\n");
+}
+
+module_init(sheep_module_init);
+module_exit(sheep_module_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/sheepdev/sheep.c b/sheepdev/sheep.c
new file mode 100644
index 0000000..642a5e7
--- /dev/null
+++ b/sheepdev/sheep.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2012 Taobao Inc.
+ *
+ * Levin Li <xingke.lwp at taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "sheep.h"
+#include "sheepdog_proto.h"
+
+/* Zero the whole request header, then set its opcode. */
+static void sd_init_req(struct sd_req *req, uint8_t opcode)
+{
+ memset(req, 0, sizeof(*req));
+ req->opcode = opcode;
+}
+
+/*
+ * Synchronously read @datalen bytes at @offset of object @oid into
+ * @data. The request is sent with id 0.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO when the exchange fails or the
+ * response carries any other result code.
+ */
+static int read_object(struct sheepdev *dev, uint64_t oid, void *data,
+		       unsigned int datalen, uint64_t offset)
+{
+	struct sd_req hdr;
+	/* exec_req() stores the response header over the request */
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+
+	sd_init_req(&hdr, SD_OP_READ_OBJ);
+	hdr.obj.offset = offset;
+	hdr.obj.oid = oid;
+	hdr.data_length = datalen;
+	hdr.id = 0;
+
+	if (exec_req(dev->sock, &hdr, data) < 0) {
+		DBPRT("Failed to read object %llx\n", oid);
+		return SD_RES_EIO;
+	}
+
+	if (rsp->result == SD_RES_SUCCESS)
+		return SD_RES_SUCCESS;
+
+	DBPRT("Failed to read object %llx,%d\n", oid,
+	      rsp->result);
+	return SD_RES_EIO;
+}
+
+/*
+ * Send (without waiting for the answer) a read request for @datalen
+ * bytes at @offset of object @oid, tagged with the current
+ * dev->req_id.
+ *
+ * Returns SD_RES_SUCCESS when the request was sent, SD_RES_EIO
+ * otherwise.
+ */
+int send_read_req(struct sheepdev *dev, uint64_t oid,
+		  unsigned int datalen, uint64_t offset)
+{
+	struct sd_req hdr;
+
+	sd_init_req(&hdr, SD_OP_READ_OBJ);
+	hdr.obj.oid = oid;
+	hdr.obj.offset = offset;
+	hdr.data_length = datalen;
+	hdr.id = dev->req_id;
+
+	if (send_req(dev->sock, &hdr, NULL, 0) < 0) {
+		DBPRT("Failed to read object %llx\n", oid);
+		return SD_RES_EIO;
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Send (without waiting for the answer) a write of @datalen bytes from
+ * @data to @offset of object @oid, tagged with the current
+ * dev->req_id. A non-zero @create selects
+ * SD_OP_CREATE_AND_WRITE_OBJ instead of SD_OP_WRITE_OBJ.
+ *
+ * Returns SD_RES_SUCCESS when the request was sent, SD_RES_EIO
+ * otherwise.
+ */
+int send_write_req(struct sheepdev *dev, uint64_t oid, void *data,
+		   unsigned int datalen, uint64_t offset, int create)
+{
+	struct sd_req hdr;
+
+	sd_init_req(&hdr, create ? SD_OP_CREATE_AND_WRITE_OBJ :
+				   SD_OP_WRITE_OBJ);
+
+	hdr.flags = SD_FLAG_CMD_WRITE | SD_FLAG_CMD_DIRECT;
+	hdr.data_length = datalen;
+	hdr.id = dev->req_id;
+
+	hdr.obj.copies = dev->inode->nr_copies;
+	hdr.obj.offset = offset;
+	hdr.obj.oid = oid;
+
+	if (send_req(dev->sock, &hdr, data, datalen) < 0) {
+		DBPRT("Failed to write object %llx\n", oid);
+		return SD_RES_EIO;
+	}
+
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Connect to the sheep daemon at dev->ip_addr:dev->port and load the
+ * inode object of dev->vid.
+ *
+ * On success dev->sock, dev->inode, dev->size and dev->sectors are
+ * valid. The exported size is the VDI size minus one object.
+ *
+ * Returns 0 on success, -ENOMEM when the inode buffer cannot be
+ * allocated, -EFAULT when connecting or reading the inode fails and
+ * -EINVAL when the VDI is smaller than one object. Nothing stays
+ * allocated or connected on failure.
+ */
+int sheep_vdi_setup(struct sheepdev *dev)
+{
+	int ret;
+	struct sheepdog_inode *inode;
+
+	inode = vmalloc(sizeof(*inode));
+	if (!inode)
+		return -ENOMEM;
+	memset(inode, 0 , sizeof(*inode));
+
+	ret = connect_to(&dev->sock, dev->ip_addr, dev->port);
+	if (ret) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = read_object(dev, vid_to_vdi_oid(dev->vid), inode,
+			  SD_INODE_SIZE, 0);
+	if (ret != SD_RES_SUCCESS) {
+		ret = -EFAULT;
+		goto out_release;
+	}
+
+	/* the subtraction below must not wrap around */
+	if (inode->vdi_size < SHEEP_OBJECT_SIZE) {
+		ret = -EINVAL;
+		goto out_release;
+	}
+
+	dev->size = inode->vdi_size - SHEEP_OBJECT_SIZE;
+	dev->sectors = dev->size / KERNEL_SECTOR_SIZE;
+	dev->inode = inode;
+
+	return 0;
+
+out_release:
+	/* the connection was established here, so close it here as well */
+	sock_release(dev->sock);
+	dev->sock = NULL;
+out:
+	vfree(inode);
+	return ret;
+}
diff --git a/sheepdev/sheep.h b/sheepdev/sheep.h
new file mode 100644
index 0000000..55b4062
--- /dev/null
+++ b/sheepdev/sheep.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2012 Taobao Inc.
+ *
+ * Levin Li <xingke.lwp at taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __SHEEP_H_
+#define __SHEEP_H_
+
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <net/sock.h>
+#include <linux/tcp.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <net/inet_common.h>
+#include "sheepdog_proto.h"
+
+/* size of one data object in bytes (4 MB) */
+#define SHEEP_OBJECT_SIZE (4 * 1024 * 1024)
+
+/* prefix of the disk names and name passed to register_blkdev() */
+#define SHEEP_BLKDEV_NAME "sheep"
+/* name of the control entry created with create_proc_entry() */
+#define PROC_ENTRY_NAME "sheep"
+/* bytes per sector used to convert between sectors and byte offsets */
+#define KERNEL_SECTOR_SIZE 512
+/* minors per disk; also the number of bits in the device bitmap */
+#define SHEEP_BLKDEV_MINORS 1024
+
+/* debug printk with a "sheep: " prefix */
+#define DBPRT(fmt, args...) printk(KERN_DEBUG "sheep: " fmt, ##args)
+
+/* State of one sheepdog VDI exported as a block device. */
+struct sheepdev {
+ struct gendisk *disk; /* allocated in sheep_add_disk() */
+ struct socket *sock; /* connection opened by sheep_vdi_setup() */
+ char ip_addr[16]; /* NUL-terminated address from the "add" command */
+ unsigned int port; /* from the "add" command or SD_LISTEN_PORT */
+ unsigned int minor; /* bit index in the device bitmap */
+ unsigned int req_id; /* id put into the next request header */
+ unsigned int vid; /* VDI id, parsed as hex from the "add" command */
+ unsigned long size; /* inode vdi_size minus one object, in bytes */
+ unsigned long sectors; /* size / KERNEL_SECTOR_SIZE */
+ atomic_t struct_refcnt; /* the struct is freed when this drops to 0 */
+ unsigned int device_refcnt; /* open count, protected by dev_lock */
+ spinlock_t dev_lock; /* protects device_refcnt */
+ spinlock_t req_lock; /* protects pending_list */
+ spinlock_t fin_lock; /* protects finish_list */
+ spinlock_t que_lock; /* lock handed to blk_init_queue() */
+ spinlock_t creating_lock; /* protects creation markers in inode->data_vdi_id[] */
+ struct task_struct *req_thread; /* runs req_process_func() */
+ struct task_struct *fin_thread; /* runs fin_process_func() */
+ wait_queue_head_t req_wait; /* req_thread sleeps here */
+ wait_queue_head_t fin_wait; /* fin_thread sleeps here */
+ wait_queue_head_t creating_wait; /* writers waiting for an object creation */
+ struct list_head pending_list; /* block requests not yet sent */
+ struct list_head finish_list; /* sheep_requests waiting for a response */
+ struct list_head dev_list; /* link in the global device list */
+ struct sheepdog_inode *inode; /* vmalloc'ed copy of the VDI inode object */
+};
+
+/* One request sent to the server and still waiting for its response. */
+struct sheep_request {
+ int req_id; /* id the request was sent with, matched against rsp.id */
+ int idx; /* idx is only used when update inode */
+ uint64_t oid; /* object the request was addressed to */
+ struct request *req; /* block-layer request to complete */
+ struct list_head list; /* link in sheepdev.finish_list */
+};
+
+/* connect.c */
+/* create a TCP socket and connect it to addr:port */
+int connect_to(struct socket **sock, const char *addr, int port);
+/* send the header and, when wlen is non-zero, wlen bytes of data */
+int send_req(struct socket *sock, struct sd_req *hdr, void *data,
+ unsigned int wlen);
+/* receive length bytes into buf */
+int do_read(struct socket *sock, char *buf, const size_t length);
+/* send a request and read the response (header overwrites hdr) */
+int exec_req(struct socket *sock, struct sd_req *hdr, void *data);
+
+/* sheep.c */
+/* send an object read request without waiting for the response */
+int send_read_req(struct sheepdev *sheepdev, uint64_t oid,
+ unsigned int datalen, uint64_t offset);
+/* send an object write request without waiting for the response */
+int send_write_req(struct sheepdev *sheepdev, uint64_t oid, void *data,
+ unsigned int datalen, uint64_t offset, int create);
+/* connect to the server and load the VDI inode */
+int sheep_vdi_setup(struct sheepdev *sheep_dev);
+
+#endif
diff --git a/sheepdev/sheepdog_proto.h b/sheepdev/sheepdog_proto.h
new file mode 100644
index 0000000..3a0452c
--- /dev/null
+++ b/sheepdev/sheepdog_proto.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __SHEEPDOG_PROTO_H__
+#define __SHEEPDOG_PROTO_H__
+
+/* largest value of a 64-bit unsigned integer */
+#define UINT64_MAX (18446744073709551615ULL)
+/* adds an unsigned long long zero so that x is promoted to that type */
+#define UINT64_C(x) ((x) + (UINT64_MAX - UINT64_MAX))
+
+/* This or later version supports trimming zero sectors from read response */
+#define SD_PROTO_VER_TRIM_ZERO_SECTORS 0x02
+
+#define SD_LISTEN_PORT 7000
+
+#define SD_OP_CREATE_AND_WRITE_OBJ 0x01
+#define SD_OP_READ_OBJ 0x02
+#define SD_OP_WRITE_OBJ 0x03
+#define SD_OP_REMOVE_OBJ 0x04
+
+#define SD_OP_NEW_VDI 0x11
+#define SD_OP_LOCK_VDI 0x12
+#define SD_OP_RELEASE_VDI 0x13
+#define SD_OP_GET_VDI_INFO 0x14
+#define SD_OP_READ_VDIS 0x15
+#define SD_OP_FLUSH_VDI 0x16
+
+#define SD_FLAG_CMD_WRITE 0x01
+#define SD_FLAG_CMD_COW 0x02
+#define SD_FLAG_CMD_CACHE 0x04
+#define SD_FLAG_CMD_DIRECT 0x08 /* don't use object cache */
+/* flags above 0x80 are sheepdog-internal */
+
+#define SD_RES_SUCCESS 0x00 /* Success */
+#define SD_RES_UNKNOWN 0x01 /* Unknown error */
+#define SD_RES_NO_OBJ 0x02 /* No object found */
+#define SD_RES_EIO 0x03 /* I/O error */
+#define SD_RES_VDI_EXIST 0x04 /* VDI exists already */
+#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
+#define SD_RES_SYSTEM_ERROR 0x06 /* System error */
+#define SD_RES_VDI_LOCKED 0x07 /* VDI is locked */
+#define SD_RES_NO_VDI 0x08 /* No VDI found */
+#define SD_RES_NO_BASE_VDI 0x09 /* No base VDI found */
+#define SD_RES_VDI_READ 0x0A /* Cannot read requested VDI */
+#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested VDI */
+#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base VDI */
+#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base VDI */
+#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
+#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */
+#define SD_RES_VDI_NOT_LOCKED 0x10 /* VDI is not locked */
+#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
+#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
+#define SD_RES_FULL_VDI 0x13 /* we already have the maximum VDIs */
+#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
+#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
+#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Sheepdog is waiting for a format operation */
+#define SD_RES_WAIT_FOR_JOIN 0x17 /* Sheepdog is waiting for other nodes joining */
+#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
+#define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */
+#define SD_RES_FORCE_RECOVER 0x1A /* Users should not force recover this cluster */
+#define SD_RES_NO_STORE 0x20 /* No targeted backend store */
+#define SD_RES_NO_SUPPORT 0x21 /* Operation is not supported by backend store */
+#define SD_RES_NODE_IN_RECOVERY 0x22 /* Targeted node is in recovery */
+#define SD_RES_OBJ_RECOVERING 0x23 /* Object is recovering */
+#define SD_RES_KILLED 0x24 /* Node is killed */
+#define SD_RES_OID_EXIST 0x25 /* Object ID exists already */
+#define SD_RES_AGAIN 0x26 /* Ask to try again */
+
+/* errors above 0x80 are sheepdog-internal */
+
+/*
+ * Object ID rules
+ *
+ * 0 - 19 (20 bits): data object space
+ * 20 - 31 (12 bits): reserved data object space
+ * 32 - 55 (24 bits): VDI object space
+ * 56 - 59 ( 4 bits): reserved VDI object space
+ * 60 - 63 ( 4 bits): object type identifier space
+ */
+
+/* bit position where the VDI id starts inside an object id */
+#define VDI_SPACE_SHIFT 32
+/* object type bits tested by is_vdi_obj() and friends */
+#define VDI_BIT (UINT64_C(1) << 63)
+#define VMSTATE_BIT (UINT64_C(1) << 62)
+#define VDI_ATTR_BIT (UINT64_C(1) << 61)
+/* number of entries in sheepdog_inode.data_vdi_id[] */
+#define MAX_DATA_OBJS (1ULL << 20)
+/* number of entries in sheepdog_inode.child_vdi_id[] */
+#define MAX_CHILDREN 1024U
+/* sizes of the name/tag/key/value character arrays */
+#define SD_MAX_VDI_LEN 256U
+#define SD_MAX_VDI_TAG_LEN 256U
+#define SD_MAX_VDI_ATTR_KEY_LEN 256U
+#define SD_MAX_VDI_ATTR_VALUE_LEN 65536U
+#define SD_NR_VDIS (1U << 24)
+/* size of one data object (4 MB) and the resulting maximum VDI size */
+#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
+#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
+
+/* size of the complete inode object */
+#define SD_INODE_SIZE (sizeof(struct sheepdog_inode))
+/* inode size without the data_vdi_id[] table */
+#define SD_INODE_HEADER_SIZE (sizeof(struct sheepdog_inode) - \
+ sizeof(uint32_t) * MAX_DATA_OBJS)
+/* size of a VDI attribute object */
+#define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr))
+#define CURRENT_VDI_ID 0
+
+#define STORE_LEN 16
+
+/*
+ * Request header sent in front of every request. exec_req() reuses the
+ * same memory for the response (struct sd_rsp).
+ */
+struct sd_req {
+ uint8_t proto_ver;
+ uint8_t opcode; /* one of SD_OP_* */
+ uint16_t flags; /* SD_FLAG_CMD_* bits */
+ uint32_t epoch;
+ uint32_t id; /* request id; responses are matched by sd_rsp.id */
+ uint32_t data_length; /* write: payload bytes sent; read: bytes requested */
+ union {
+ /* object operations */
+ struct {
+ uint64_t oid;
+ uint64_t cow_oid;
+ uint32_t copies;
+ uint32_t tgt_epoch;
+ uint64_t offset; /* byte offset inside the object */
+ } obj;
+ /* VDI operations */
+ struct {
+ uint64_t vdi_size;
+ uint32_t base_vdi_id;
+ uint32_t copies;
+ uint32_t snapid;
+ } vdi;
+ uint32_t __pad[8];
+ };
+};
+
+/* Response header; the leading fields mirror struct sd_req. */
+struct sd_rsp {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id; /* used to find the matching request */
+ uint32_t data_length; /* payload bytes following the header */
+ union {
+ uint32_t result; /* one of SD_RES_* */
+ /* __pad in the structs below overlays result */
+ struct {
+ uint32_t __pad;
+ uint32_t copies;
+ uint64_t offset;
+ } obj;
+ struct {
+ uint32_t __pad;
+ uint32_t rsvd;
+ uint32_t vdi_id;
+ uint32_t attr_id;
+ uint32_t copies;
+ } vdi;
+ uint32_t __pad[8];
+ };
+};
+
+/*
+ * Layout of a VDI inode object as read by sheep_vdi_setup(). Because of
+ * data_vdi_id[] the structure is several megabytes large.
+ */
+struct sheepdog_inode {
+ char name[SD_MAX_VDI_LEN];
+ char tag[SD_MAX_VDI_TAG_LEN];
+ uint64_t create_time;
+ uint64_t snap_ctime;
+ uint64_t vm_clock_nsec;
+ uint64_t vdi_size; /* size of the VDI in bytes */
+ uint64_t vm_state_size;
+ uint16_t copy_policy;
+ uint8_t nr_copies; /* copied into obj.copies of write requests */
+ uint8_t block_size_shift;
+ uint32_t snap_id;
+ uint32_t vdi_id;
+ uint32_t parent_vdi_id;
+ uint32_t child_vdi_id[MAX_CHILDREN];
+ uint32_t data_vdi_id[MAX_DATA_OBJS]; /* per object index: id of the VDI holding it, 0 if none */
+};
+
+/* Layout of a VDI attribute object (see SD_ATTR_OBJ_SIZE). */
+struct sheepdog_vdi_attr {
+ char name[SD_MAX_VDI_LEN];
+ char tag[SD_MAX_VDI_TAG_LEN];
+ uint64_t ctime;
+ uint32_t snap_id;
+ uint32_t value_len; /* presumably the used length of value[] - TODO confirm */
+ char key[SD_MAX_VDI_ATTR_KEY_LEN];
+ char value[SD_MAX_VDI_ATTR_VALUE_LEN];
+};
+
+#define SHA1_LEN 20
+
+/* One snapshot log record: epoch, timestamp and a SHA-1 digest. */
+struct snap_log {
+ uint32_t epoch;
+ uint64_t time;
+ unsigned char sha1[SHA1_LEN];
+};
+
+/*
+ * 64 bit FNV-1a non-zero initial basis
+ */
+#define FNV1A_64_INIT ((uint64_t) 0xcbf29ce484222325ULL)
+
+/*
+ * 64 bit Fowler/Noll/Vo FNV-1a hash code
+ */
+/*
+ * Fold @len bytes at @buf into the running FNV-1a hash @hval and return
+ * the new hash. Pass FNV1A_64_INIT as @hval to start a fresh hash.
+ */
+static inline uint64_t fnv_64a_buf(const void *buf, size_t len, uint64_t hval)
+{
+	const unsigned char *cur = buf;
+	const unsigned char *end = cur + len;
+
+	for (; cur < end; cur++) {
+		hval ^= (uint64_t) *cur;
+		/* 64-bit FNV prime: 2^40 + 2^8 + 0xb3 */
+		hval *= 0x100000001b3ULL;
+	}
+
+	return hval;
+}
+
+/*
+ * Hash @val with FNV-1a and keep the low @bits bits of the result.
+ * With @bits >= 64 the full 64-bit hash is returned.
+ */
+static inline uint64_t hash_64(uint64_t val, unsigned int bits)
+{
+	uint64_t hash = fnv_64a_buf(&val, sizeof(uint64_t), FNV1A_64_INIT);
+
+	/* shifting by the full width is undefined: nothing to mask */
+	if (bits >= 64)
+		return hash;
+
+	/* build the mask in 64 bits; "1 << bits" is only an int shift */
+	return hash & (((uint64_t)1 << bits) - 1);
+}
+
+/* True when data object @idx is recorded as belonging to this very VDI. */
+static inline bool is_data_obj_writeable(const struct sheepdog_inode *inode,
+ int idx)
+{
+ return inode->vdi_id == inode->data_vdi_id[idx];
+}
+
+/* True when @oid has VDI_BIT set. */
+static inline bool is_vdi_obj(uint64_t oid)
+{
+	return (oid & VDI_BIT) != 0;
+}
+
+/* True when @oid has VMSTATE_BIT set. */
+static inline bool is_vmstate_obj(uint64_t oid)
+{
+	return (oid & VMSTATE_BIT) != 0;
+}
+
+/* True when @oid has VDI_ATTR_BIT set. */
+static inline bool is_vdi_attr_obj(uint64_t oid)
+{
+	return (oid & VDI_ATTR_BIT) != 0;
+}
+
+/* True when @oid carries none of the three object type bits. */
+static inline bool is_data_obj(uint64_t oid)
+{
+	return !(is_vdi_obj(oid) || is_vmstate_obj(oid) ||
+		 is_vdi_attr_obj(oid));
+}
+
+/*
+ * Size of the object behind @oid: SD_INODE_SIZE for VDI objects,
+ * SD_ATTR_OBJ_SIZE for attribute objects, SD_DATA_OBJ_SIZE otherwise.
+ */
+static inline size_t get_objsize(uint64_t oid)
+{
+	return is_vdi_obj(oid) ? SD_INODE_SIZE :
+	       is_vdi_attr_obj(oid) ? SD_ATTR_OBJ_SIZE :
+	       SD_DATA_OBJ_SIZE;
+}
+
+/* Extract the data object index: the low 20 bits of @oid. */
+static inline uint64_t data_oid_to_idx(uint64_t oid)
+{
+ return oid & (MAX_DATA_OBJS - 1);
+}
+
+/* Object id of the inode object of VDI @vid (VDI_BIT set, vid shifted up). */
+static inline uint64_t vid_to_vdi_oid(uint32_t vid)
+{
+ return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
+}
+
+/* Object id of data object @idx of VDI @vid. */
+static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
+{
+ return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
+}
+
+/* VDI id stored in @oid: VDI_BIT is masked off, then the upper half is taken. */
+static inline uint32_t oid_to_vid(uint64_t oid)
+{
+ return (~VDI_BIT & oid) >> VDI_SPACE_SHIFT;
+}
+
+/* Object id of attribute object @attrid of VDI @vid (VDI_ATTR_BIT set). */
+static inline uint64_t vid_to_attr_oid(uint32_t vid, uint32_t attrid)
+{
+ return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_ATTR_BIT | attrid;
+}
+
+/* VDI id stored in an attribute object id: VDI_ATTR_BIT is masked off first. */
+static inline uint32_t attr_oid_to_vid(uint64_t oid)
+{
+ return (~VDI_ATTR_BIT & oid) >> VDI_SPACE_SHIFT;
+}
+
+#endif
--
1.7.11.7
More information about the sheepdog
mailing list