[sheepdog] [PATCH 7/9] sheep: use copy_policy to control erasure vdi
Liu Yuan
namei.unix at gmail.com
Thu Sep 19 12:42:51 CEST 2013
inode->copy_policy is a good place to control erasure coded vdi. But I don't
think we need uint16_t for it, so I just allocate uint8_t for it.
To create an erasure coded vdi:
$dog vdi create -e test 10G # This will create an erasure coded vdi
For now we only use a fixed scheme (4 data and 2 parity strips) with '-e'. But
I plan to support '-e number', so that users can specify how many parity replicas
they want, with a different erasure scheme for different vdis. E.g., we can have
-e 2 --> 4 : 2 (0.5 redundancy and can withstand 2 node failures)
-e 3 --> 8 : 3 (0.375 redundancy and can withstand 3 node failures)
-e 4 --> 8 : 4 (0.5 redundancy and can withstand 4 node failures)
Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
dog/dog.h | 2 +-
dog/farm/farm.c | 11 ++++++++---
dog/vdi.c | 24 +++++++++++++++---------
include/sheepdog_proto.h | 7 ++++++-
sheep/gateway.c | 2 ++
sheep/group.c | 3 ++-
sheep/ops.c | 9 +++++++--
sheep/plain_store.c | 2 +-
sheep/sheep_priv.h | 8 ++++++--
sheep/vdi.c | 39 +++++++++++++++++++++++++++++++--------
10 files changed, 79 insertions(+), 28 deletions(-)
diff --git a/dog/dog.h b/dog/dog.h
index 8c54c10..aac0bba 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -79,7 +79,7 @@ void confirm(const char *message);
void work_queue_wait(struct work_queue *q);
int do_vdi_create(const char *vdiname, int64_t vdi_size,
uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
- int nr_copies);
+ int nr_copies, uint8_t copy_policy);
int do_vdi_check(const struct sd_inode *inode);
void show_progress(uint64_t done, uint64_t total, bool raw);
diff --git a/dog/farm/farm.c b/dog/farm/farm.c
index 3ac0f4a..6f26f24 100644
--- a/dog/farm/farm.c
+++ b/dog/farm/farm.c
@@ -28,6 +28,7 @@ struct vdi_entry {
uint32_t vdi_id;
uint32_t snap_id;
uint8_t nr_copies;
+ uint8_t copy_policy;
struct list_node list;
};
static LIST_HEAD(last_vdi_list);
@@ -53,7 +54,7 @@ static struct vdi_entry *find_vdi(const char *name)
static struct vdi_entry *new_vdi(const char *name, uint64_t vdi_size,
uint32_t vdi_id, uint32_t snap_id,
- uint8_t nr_copies)
+ uint8_t nr_copies, uint8_t copy_policy)
{
struct vdi_entry *vdi;
vdi = xmalloc(sizeof(struct vdi_entry));
@@ -62,6 +63,7 @@ static struct vdi_entry *new_vdi(const char *name, uint64_t vdi_size,
vdi->vdi_id = vdi_id;
vdi->snap_id = snap_id;
vdi->nr_copies = nr_copies;
+ vdi->copy_policy = copy_policy;
return vdi;
}
@@ -74,13 +76,15 @@ static void insert_vdi(struct sd_inode *new)
new->vdi_size,
new->vdi_id,
new->snap_id,
- new->nr_copies);
+ new->nr_copies,
+ new->copy_policy);
list_add(&vdi->list, &last_vdi_list);
} else if (vdi->snap_id < new->snap_id) {
vdi->vdi_size = new->vdi_size;
vdi->vdi_id = new->vdi_id;
vdi->snap_id = new->snap_id;
vdi->nr_copies = new->nr_copies;
+ vdi->copy_policy = new->copy_policy;
}
}
@@ -92,7 +96,8 @@ static int create_active_vdis(void)
if (do_vdi_create(vdi->name,
vdi->vdi_size,
vdi->vdi_id, &new_vid,
- false, vdi->nr_copies) < 0)
+ false, vdi->nr_copies,
+ vdi->copy_policy) < 0)
return -1;
}
return 0;
diff --git a/dog/vdi.c b/dog/vdi.c
index 63fb0b1..b101d43 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -30,6 +30,7 @@ static struct sd_option vdi_options[] = {
{'c', "copies", true, "specify the data redundancy (number of copies)"},
{'F', "from", true, "create a differential backup from the snapshot"},
{'f', "force", false, "do operation forcibly"},
+ {'e', "erasure", false, "create erasure coded vdi"},
{ 0, NULL, false, NULL },
};
@@ -45,6 +46,7 @@ static struct vdi_cmd_data {
int from_snapshot_id;
char from_snapshot_tag[SD_MAX_VDI_TAG_LEN];
bool force;
+ uint8_t copy_policy;
} vdi_cmd_data = { ~0, };
struct get_vdi_info {
@@ -467,8 +469,8 @@ static int read_vdi_obj(const char *vdiname, int snapid, const char *tag,
}
int do_vdi_create(const char *vdiname, int64_t vdi_size,
- uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
- int nr_copies)
+ uint32_t base_vid, uint32_t *vdi_id, bool snapshot,
+ int nr_copies, uint8_t copy_policy)
{
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
@@ -486,6 +488,7 @@ int do_vdi_create(const char *vdiname, int64_t vdi_size,
hdr.vdi.snapid = snapshot ? 1 : 0;
hdr.vdi.vdi_size = vdi_size;
hdr.vdi.copies = nr_copies;
+ hdr.vdi.copy_policy = copy_policy;
ret = dog_exec_req(&sd_nid, &hdr, buf);
if (ret < 0)
@@ -531,7 +534,7 @@ static int vdi_create(int argc, char **argv)
}
ret = do_vdi_create(vdiname, size, 0, &vid, false,
- vdi_cmd_data.nr_copies);
+ vdi_cmd_data.nr_copies, vdi_cmd_data.copy_policy);
if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
goto out;
@@ -606,7 +609,7 @@ static int vdi_snapshot(int argc, char **argv)
return EXIT_FAILURE;
ret = do_vdi_create(vdiname, inode->vdi_size, vid, NULL, true,
- inode->nr_copies);
+ inode->nr_copies, inode->copy_policy);
if (ret == EXIT_SUCCESS && verbose) {
if (raw_output)
@@ -650,7 +653,7 @@ static int vdi_clone(int argc, char **argv)
goto out;
ret = do_vdi_create(dst_vdi, inode->vdi_size, base_vid, &new_vid, false,
- vdi_cmd_data.nr_copies);
+ vdi_cmd_data.nr_copies, inode->copy_policy);
if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc)
goto out;
@@ -830,7 +833,7 @@ static int vdi_rollback(int argc, char **argv)
}
ret = do_vdi_create(vdiname, inode->vdi_size, base_vid, &new_vid,
- false, vdi_cmd_data.nr_copies);
+ false, vdi_cmd_data.nr_copies, inode->copy_policy);
if (ret == EXIT_SUCCESS && verbose) {
if (raw_output)
@@ -1822,7 +1825,7 @@ static uint32_t do_restore(const char *vdiname, int snapid, const char *tag)
goto out;
ret = do_vdi_create(vdiname, inode->vdi_size, inode->vdi_id, &vid,
- false, inode->nr_copies);
+ false, inode->nr_copies, inode->copy_policy);
if (ret != EXIT_SUCCESS) {
sd_err("Failed to read VDI");
goto out;
@@ -1918,7 +1921,8 @@ out:
/* recreate the current vdi object */
recovery_ret = do_vdi_create(vdiname, current_inode->vdi_size,
current_inode->parent_vdi_id, NULL,
- true, current_inode->nr_copies);
+ true, current_inode->nr_copies,
+ current_inode->copy_policy);
if (recovery_ret != EXIT_SUCCESS) {
sd_err("failed to resume the current vdi");
ret = recovery_ret;
@@ -2095,7 +2099,7 @@ static struct subcommand vdi_cmd[] = {
{"check", "<vdiname>", "saph", "check and repair image's consistency",
NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
vdi_check, vdi_options},
- {"create", "<vdiname> <size>", "Pcaphrv", "create an image",
+ {"create", "<vdiname> <size>", "Pcapherv", "create an image",
NULL, CMD_NEED_NODELIST|CMD_NEED_ARG,
vdi_create, vdi_options},
{"snapshot", "<vdiname>", "saphrv", "create a snapshot",
@@ -2206,6 +2210,8 @@ static int vdi_parser(int ch, const char *opt)
case 'f':
vdi_cmd_data.force = true;
break;
+ case 'e':
+ vdi_cmd_data.copy_policy = 1;
}
return 0;
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index ddc710f..1726234 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -134,6 +134,8 @@ struct sd_req {
uint32_t base_vdi_id;
uint32_t copies;
uint32_t snapid;
+ uint8_t copy_policy;
+ uint8_t __pad[3];
} vdi;
/* sheepdog-internal */
@@ -149,6 +151,8 @@ struct sd_req {
uint32_t copies;
uint8_t set_bitmap; /* 0 means false */
/* others mean true */
+ uint8_t copy_policy;
+ uint8_t __pad[2];
} vdi_state;
uint32_t __pad[8];
@@ -203,7 +207,8 @@ struct sd_inode {
uint64_t vm_clock_nsec;
uint64_t vdi_size;
uint64_t vm_state_size;
- uint16_t copy_policy;
+ uint8_t copy_policy;
+ uint8_t reserved;
uint8_t nr_copies;
uint8_t block_size_shift;
uint32_t snap_id;
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 6230867..77fcb6a 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -35,6 +35,8 @@ static struct req_iter *prepare_replication_requests(struct request *req,
uint64_t off = req->rq.obj.offset;
struct req_iter *reqs = xzalloc(sizeof(*reqs) * nr_copies);
+ sd_debug("%"PRIx64, req->rq.obj.oid);
+
*nr = nr_copies;
for (int i = 0; i < nr_copies; i++) {
reqs[i].buf = data;
diff --git a/sheep/group.c b/sheep/group.c
index f75b023..43c0983 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -473,7 +473,8 @@ static int get_vdis_from(struct sd_node *node)
count = rsp->data_length / sizeof(*vs);
for (i = 0; i < count; i++) {
atomic_set_bit(vs[i].vid, sys->vdi_inuse);
- add_vdi_state(vs[i].vid, vs[i].nr_copies, vs[i].snapshot);
+ add_vdi_state(vs[i].vid, vs[i].nr_copies, vs[i].snapshot,
+ vs[i].copy_policy);
}
out:
free(vs);
diff --git a/sheep/ops.c b/sheep/ops.c
index c14c80b..f433289 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -77,11 +77,15 @@ static int cluster_new_vdi(struct request *req)
.size = hdr->vdi.vdi_size,
.base_vid = hdr->vdi.base_vdi_id,
.create_snapshot = !!hdr->vdi.snapid,
+ .copy_policy = hdr->vdi.copy_policy,
.nr_copies = hdr->vdi.copies ? hdr->vdi.copies :
sys->cinfo.nr_copies,
.time = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000,
};
+ if (iocb.copy_policy)
+ iocb.nr_copies = SD_EC_DP;
+
if (hdr->data_length != SD_MAX_VDI_LEN)
return SD_RES_INVALID_PARMS;
@@ -592,12 +596,13 @@ static int cluster_notify_vdi_add(const struct sd_req *req, struct sd_rsp *rsp,
/* make the previous working vdi a snapshot */
add_vdi_state(req->vdi_state.old_vid,
get_vdi_copy_number(req->vdi_state.old_vid),
- true);
+ true, 0);
if (req->vdi_state.set_bitmap)
atomic_set_bit(req->vdi_state.new_vid, sys->vdi_inuse);
- add_vdi_state(req->vdi_state.new_vid, req->vdi_state.copies, false);
+ add_vdi_state(req->vdi_state.new_vid, req->vdi_state.copies, false,
+ req->vdi_state.copy_policy);
return SD_RES_SUCCESS;
}
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index dc5e3d9..b363402 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -195,7 +195,7 @@ static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
}
add_vdi_state(oid_to_vid(oid), inode->nr_copies,
- vdi_is_snapshot(inode));
+ vdi_is_snapshot(inode), inode->copy_policy);
atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
ret = SD_RES_SUCCESS;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 588a61c..fab25b5 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -42,6 +42,7 @@
#include "sha1.h"
#include "config.h"
#include "sockfd_cache.h"
+#include "fec.h"
/*
* Functions that update global info must be called in the main
@@ -173,6 +174,7 @@ struct vdi_iocb {
uint32_t base_vid;
uint32_t snapid;
bool create_snapshot;
+ uint8_t copy_policy;
int nr_copies;
uint64_t time;
};
@@ -189,7 +191,8 @@ struct vdi_state {
uint32_t vid;
uint8_t nr_copies;
uint8_t snapshot;
- uint16_t _pad;
+ uint8_t copy_policy;
+ uint8_t _pad;
};
struct store_driver {
@@ -277,9 +280,10 @@ int lock_base_dir(const char *d);
int fill_vdi_state_list(void *data);
bool oid_is_readonly(uint64_t oid);
int get_vdi_copy_number(uint32_t vid);
+int get_vdi_copy_policy(uint32_t vid);
int get_obj_copy_number(uint64_t oid, int nr_zones);
int get_req_copy_number(struct request *req);
-int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot);
+int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot, uint8_t);
int vdi_exist(uint32_t vid);
int vdi_create(const struct vdi_iocb *iocb, uint32_t *new_vid);
int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid);
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 0faab62..337b604 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -15,6 +15,7 @@ struct vdi_state_entry {
uint32_t vid;
unsigned int nr_copies;
bool snapshot;
+ uint8_t copy_policy;
struct rb_node node;
};
@@ -83,6 +84,20 @@ int get_vdi_copy_number(uint32_t vid)
return entry->nr_copies;
}
+int get_vdi_copy_policy(uint32_t vid)
+{
+ struct vdi_state_entry *entry;
+
+ sd_read_lock(&vdi_state_lock);
+ entry = vdi_state_search(&vdi_state_root, vid);
+ sd_unlock(&vdi_state_lock);
+
+ if (!entry)
+ panic("copy policy for %" PRIx32 " not found", vid);
+
+ return entry->copy_policy;
+}
+
int get_obj_copy_number(uint64_t oid, int nr_zones)
{
return min(get_vdi_copy_number(oid_to_vid(oid)), nr_zones);
@@ -100,7 +115,7 @@ int get_req_copy_number(struct request *req)
return nr_copies;
}
-int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot)
+int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot, uint8_t cp)
{
struct vdi_state_entry *entry, *old;
@@ -108,8 +123,9 @@ int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot)
entry->vid = vid;
entry->nr_copies = nr_copies;
entry->snapshot = snapshot;
+ entry->copy_policy = cp;
- sd_debug("%" PRIx32 ", %d", vid, nr_copies);
+ sd_debug("%" PRIx32 ", %d, %d", vid, nr_copies, cp);
sd_write_lock(&vdi_state_lock);
old = vdi_state_insert(&vdi_state_root, entry);
@@ -118,6 +134,7 @@ int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot)
entry = old;
entry->nr_copies = nr_copies;
entry->snapshot = snapshot;
+ entry->copy_policy = cp;
}
sd_unlock(&vdi_state_lock);
@@ -137,6 +154,7 @@ int fill_vdi_state_list(void *data)
vs->vid = entry->vid;
vs->nr_copies = entry->nr_copies;
vs->snapshot = entry->snapshot;
+ vs->copy_policy = entry->copy_policy;
vs++;
nr++;
}
@@ -185,7 +203,7 @@ static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
new->vdi_id = new_vid;
new->create_time = iocb->time;
new->vdi_size = iocb->size;
- new->copy_policy = 0;
+ new->copy_policy = iocb->copy_policy;
new->nr_copies = iocb->nr_copies;
new->block_size_shift = find_next_bit(&block_size, BITS_PER_LONG, 0);
new->snap_id = new_snapid;
@@ -215,8 +233,9 @@ static int create_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
int ret;
sd_debug("%s: size %" PRIu64 ", new_vid %" PRIx32 ", copies %d, "
- "snapid %" PRIu32, iocb->name, iocb->size, new_vid,
- iocb->nr_copies, new_snapid);
+ "snapid %" PRIu32 " copy policy %"PRIu8, iocb->name,
+ iocb->size, new_vid, iocb->nr_copies, new_snapid,
+ new->copy_policy);
ret = write_object(vid_to_vdi_oid(new_vid), (char *)new, sizeof(*new),
0, true);
@@ -579,7 +598,8 @@ int vdi_lookup(const struct vdi_iocb *iocb, struct vdi_info *info)
return fill_vdi_info(left, right, iocb, info);
}
-static int notify_vdi_add(uint32_t vdi_id, uint32_t nr_copies, uint32_t old_vid)
+static int notify_vdi_add(uint32_t vdi_id, uint32_t nr_copies, uint32_t old_vid,
+ uint8_t copy_policy)
{
int ret = SD_RES_SUCCESS;
struct sd_req hdr;
@@ -589,6 +609,7 @@ static int notify_vdi_add(uint32_t vdi_id, uint32_t nr_copies, uint32_t old_vid)
hdr.vdi_state.new_vid = vdi_id;
hdr.vdi_state.copies = nr_copies;
hdr.vdi_state.set_bitmap = false;
+ hdr.vdi_state.copy_policy = copy_policy;
ret = exec_local_req(&hdr, NULL);
if (ret != SD_RES_SUCCESS)
@@ -643,7 +664,8 @@ int vdi_create(const struct vdi_iocb *iocb, uint32_t *new_vid)
if (info.snapid == 0)
info.snapid = 1;
*new_vid = info.free_bit;
- ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid);
+ ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid,
+ iocb->copy_policy);
if (ret != SD_RES_SUCCESS)
return ret;
@@ -682,7 +704,8 @@ int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid)
assert(info.snapid > 0);
*new_vid = info.free_bit;
- ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid);
+ ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid,
+ iocb->copy_policy);
if (ret != SD_RES_SUCCESS)
return ret;
--
1.7.9.5
More information about the sheepdog
mailing list