[sheepdog] [PATCH 2/3] sheep, dog: make vid space size variable
Takafumi Fujieda
fujieda.takafumi at lab.ntt.co.jp
Mon Feb 2 06:58:20 CET 2015
This patch changes the vid space size from a constant to a variable.
The vid space size value is assigned to the variables (members of
cluster_info and system_info) only at cluster_make_fs or when reloading
the config file.
The variables, not the constants, are always referred to.
Signed-off-by: Takafumi Fujieda <fujieda.takafumi at lab.ntt.co.jp>
---
dog/cluster.c | 38 +++++++++++++++++++++++++++++++-------
dog/common.c | 32 +++++++++++++++++++++++++-------
dog/dog.h | 3 ++-
dog/farm/farm.c | 16 +++++++++++++++-
dog/node.c | 3 ++-
dog/vdi.c | 22 +++++++++++++++-------
include/sheepdog_proto.h | 8 ++++----
sheep/config.c | 16 +++++++++++++++-
sheep/gateway.c | 23 +++++++++++++----------
sheep/group.c | 10 ++++++++--
sheep/journal.c | 3 ++-
sheep/nfs/fs.c | 4 ++--
sheep/nfs/nfs.c | 4 ++--
sheep/object_cache.c | 13 +++++++------
sheep/object_list_cache.c | 2 +-
sheep/ops.c | 35 ++++++++++++++++++++++++++++-------
sheep/plain_store.c | 25 +++++++++++++++----------
sheep/recovery.c | 10 ++++++----
sheep/request.c | 3 ++-
sheep/sheep_priv.h | 4 ++--
sheep/vdi.c | 43 ++++++++++++++++++++++++++-----------------
sheepfs/volume.c | 2 +-
22 files changed, 224 insertions(+), 95 deletions(-)
diff --git a/dog/cluster.c b/dog/cluster.c
index 2b6864a..d4a45ec 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -71,9 +71,9 @@ static int list_store(void)
return EXIT_SYSFAIL;
}
-static bool no_vdi(const unsigned long *vdis)
+static bool no_vdi(const unsigned long *vdis, uint32_t nr_vdis)
{
- return find_next_bit(vdis, SD_NR_VDIS, 0) == SD_NR_VDIS;
+ return find_next_bit(vdis, nr_vdis, 0) == nr_vdis;
}
#define FORMAT_PRINT \
@@ -90,8 +90,12 @@ static int cluster_format(int argc, char **argv)
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
struct timeval tv;
char store_name[STORE_LEN];
- static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
struct sd_node *n;
+ uint8_t new_space = 0;
+ uint8_t old_space = 0;
+ uint32_t old_nr_vdis;
+ unsigned long *vdi_inuse = NULL;
+ size_t bmp_size;
rb_for_each_entry(n, &sd_nroot, rb) {
struct sd_req info_req;
@@ -110,6 +114,17 @@ static int cluster_format(int argc, char **argv)
return EXIT_FAILURE;
}
+ if (!old_space)
+ if (!cinfo.vid_space)
+ old_space = SD_VID_SPACE;
+ else
+ old_space = cinfo.vid_space;
+ else
+ if (cinfo.vid_space && old_space != cinfo.vid_space) {
+ sd_err("there are nodes have different VID space");
+ return EXIT_FAILURE;
+ }
+
if (n->nr_vnodes != 0) {
if ((cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES)
&& cluster_cmd_data.fixed_vnodes) {
@@ -136,19 +151,25 @@ static int cluster_format(int argc, char **argv)
confirm(info);
}
+ old_nr_vdis = (1U << old_space);
+ bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(old_nr_vdis);
+ vdi_inuse = (unsigned long *)malloc(bmp_size);
+
sd_init_req(&hdr, SD_OP_READ_VDIS);
- hdr.data_length = sizeof(vdi_inuse);
+ hdr.data_length = bmp_size;
ret = dog_exec_req(&sd_nid, &hdr, vdi_inuse);
if (ret < 0)
return EXIT_SYSFAIL;
if (rsp->result != SD_RES_SUCCESS) {
sd_err("%s", sd_strerror(rsp->result));
+ free(vdi_inuse);
return EXIT_FAILURE;
}
- if (!no_vdi(vdi_inuse))
+ if (!no_vdi(vdi_inuse, old_nr_vdis))
confirm(FORMAT_PRINT);
+ free(vdi_inuse);
gettimeofday(&tv, NULL);
@@ -401,7 +422,8 @@ static void fill_cb(struct sd_index *idx, void *arg, int ignore)
static void fill_object_tree(uint32_t vid, const char *name, const char *tag,
uint32_t snapid, uint32_t flags,
- const struct sd_inode *i, void *data)
+ const struct sd_inode *i, void *data,
+ uint32_t nr_vdis)
{
uint64_t vdi_oid = vid_to_vdi_oid(vid), vmstate_oid;
uint32_t vdi_id;
@@ -548,6 +570,7 @@ static int load_snapshot(int argc, char **argv)
cluster_cmd_data.copies = hdr.copy_number;
cluster_cmd_data.copy_policy = hdr.copy_policy;
cluster_cmd_data.block_size_shift = hdr.block_size_shift;
+ cluster_cmd_data.vid_space = hdr.vid_space;
if (cluster_format(0, NULL) != SD_RES_SUCCESS)
goto out;
@@ -714,7 +737,8 @@ static int cluster_reweight(int argc, char **argv)
static void cluster_check_cb(uint32_t vid, const char *name, const char *tag,
uint32_t snapid, uint32_t flags,
- const struct sd_inode *inode, void *data)
+ const struct sd_inode *inode, void *data,
+ uint32_t nr_vdis)
{
if (vdi_is_snapshot(inode))
printf("fix snapshot %s (id: %d, tag: \"%s\")\n", name,
diff --git a/dog/common.c b/dog/common.c
index 6ff1e19..dbcc67b 100644
--- a/dog/common.c
+++ b/dog/common.c
@@ -126,7 +126,7 @@ int dog_write_object(uint64_t oid, uint64_t cow_oid, void *data,
return SD_RES_SUCCESS;
}
-#define FOR_EACH_VDI(nr, vdis) FOR_EACH_BIT(nr, vdis, SD_NR_VDIS)
+#define FOR_EACH_VDI(nr, vdis, nr_vdis) FOR_EACH_BIT(nr, vdis, nr_vdis)
int parse_vdi(vdi_parser_func_t func, size_t size, void *data,
bool no_deleted)
@@ -136,12 +136,28 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data,
struct sd_inode *i = xmalloc(sizeof(*i));
struct sd_req req;
struct sd_rsp *rsp = (struct sd_rsp *)&req;
- static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
- static DECLARE_BITMAP(vdi_deleted, SD_NR_VDIS);
uint32_t rlen;
+ struct cluster_info cinfo;
+ uint32_t nr_vdis;
+ size_t bmp_size;
+ unsigned long *vdi_inuse = NULL;
+ unsigned long *vdi_deleted = NULL;
+
+ sd_init_req(&req, SD_OP_CLUSTER_INFO);
+ req.data_length = sizeof(cinfo);
+ ret = dog_exec_req(&sd_nid, &req, &cinfo);
+ if (ret < 0) {
+ sd_err("Fail to execute request: SD_OP_CLUSTER_INFO");
+ ret = EXIT_FAILURE;
+ goto out;
+ }
+ nr_vdis = (1U << cinfo.vid_space);
+ bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(nr_vdis);
+ vdi_inuse = (unsigned long *)malloc(bmp_size);
+ vdi_deleted = (unsigned long *)malloc(bmp_size);
sd_init_req(&req, SD_OP_READ_VDIS);
- req.data_length = sizeof(vdi_inuse);
+ req.data_length = bmp_size;
ret = dog_exec_req(&sd_nid, &req, vdi_inuse);
if (ret < 0)
@@ -152,7 +168,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data,
}
sd_init_req(&req, SD_OP_READ_DEL_VDIS);
- req.data_length = sizeof(vdi_deleted);
+ req.data_length = bmp_size;
ret = dog_exec_req(&sd_nid, &req, vdi_deleted);
if (ret < 0)
@@ -162,7 +178,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data,
goto out;
}
- FOR_EACH_VDI(nr, vdi_inuse) {
+ FOR_EACH_VDI(nr, vdi_inuse, nr_vdis) {
uint64_t oid;
uint32_t snapid;
@@ -196,11 +212,13 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data,
}
snapid = vdi_is_snapshot(i) ? i->snap_id : 0;
- func(i->vdi_id, i->name, i->tag, snapid, 0, i, data);
+ func(i->vdi_id, i->name, i->tag, snapid, 0, i, data, nr_vdis);
}
out:
free(i);
+ free(vdi_inuse);
+ free(vdi_deleted);
return ret;
}
diff --git a/dog/dog.h b/dog/dog.h
index 37355e5..1506a88 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -71,7 +71,8 @@ char *strnumber_raw(uint64_t _size, bool raw);
typedef void (*vdi_parser_func_t)(uint32_t vid, const char *name,
const char *tag, uint32_t snapid,
uint32_t flags,
- const struct sd_inode *i, void *data);
+ const struct sd_inode *i, void *data,
+ uint32_t nr_vdis);
int parse_vdi(vdi_parser_func_t func, size_t size, void *data,
bool no_deleted);
int dog_read_object(uint64_t oid, void *data, unsigned int datalen,
diff --git a/dog/farm/farm.c b/dog/farm/farm.c
index e2b07ad..a9b6fd6 100644
--- a/dog/farm/farm.c
+++ b/dog/farm/farm.c
@@ -360,6 +360,7 @@ int farm_save_snapshot(const char *tag, bool multithread)
log_hdr.copy_number = cinfo.nr_copies;
log_hdr.copy_policy = cinfo.copy_policy;
log_hdr.block_size_shift = cinfo.block_size_shift;
+ log_hdr.vid_space = cinfo.vid_space;
snap_log_write_hdr(&log_hdr);
}
@@ -404,6 +405,10 @@ static void do_load_object(struct work *work)
struct snapshot_work *sw;
static unsigned long loaded;
uint32_t vid;
+ struct sd_req req;
+ struct cluster_info cinfo;
+ uint64_t vdi_mask;
+ int ret = SD_RES_SUCCESS;
if (uatomic_is_true(&work_error))
return;
@@ -415,7 +420,16 @@ static void do_load_object(struct work *work)
if (!buffer)
goto error;
- vid = oid_to_vid(sw->entry.oid);
+ sd_init_req(&req, SD_OP_CLUSTER_INFO);
+ req.data_length = sizeof(cinfo);
+ ret = dog_exec_req(&sd_nid, &req, &cinfo);
+ if (ret < 0) {
+ sd_err("Fail to execute request: SD_OP_CLUSTER_INFO");
+ goto error;
+ }
+ vdi_mask = ((1LU << cinfo.vid_space) << VDI_SPACE_SHIFT)
+ - (1LU << VDI_SPACE_SHIFT);
+ vid = oid_to_vid(sw->entry.oid, vdi_mask);
if (register_vdi(vid)) {
if (notify_vdi_add(vid, sw->entry.nr_copies,
sw->entry.copy_policy,
diff --git a/dog/node.c b/dog/node.c
index 36141ad..6af54f4 100644
--- a/dog/node.c
+++ b/dog/node.c
@@ -21,7 +21,8 @@ static struct node_cmd_data {
static void cal_total_vdi_size(uint32_t vid, const char *name, const char *tag,
uint32_t snapid, uint32_t flags,
- const struct sd_inode *i, void *data)
+ const struct sd_inode *i, void *data,
+ uint32_t nr_vdis)
{
uint64_t *size = data;
diff --git a/dog/vdi.c b/dog/vdi.c
index 67e2f0b..d2ba096 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -117,7 +117,8 @@ static char *redundancy_scheme(uint8_t copy_nr, uint8_t policy)
static void print_vdi_list(uint32_t vid, const char *name, const char *tag,
uint32_t snapid, uint32_t flags,
- const struct sd_inode *i, void *data)
+ const struct sd_inode *i, void *data,
+ uint32_t nr_vdis)
{
bool is_clone = false;
uint64_t my_objs = 0, cow_objs = 0;
@@ -174,7 +175,8 @@ static void print_vdi_list(uint32_t vid, const char *name, const char *tag,
static void print_vdi_tree(uint32_t vid, const char *name, const char *tag,
uint32_t snapid, uint32_t flags,
- const struct sd_inode *i, void *data)
+ const struct sd_inode *i, void *data,
+ uint32_t nr_vdis)
{
time_t ti;
struct tm tm;
@@ -195,7 +197,8 @@ static void print_vdi_tree(uint32_t vid, const char *name, const char *tag,
static void print_vdi_graph(uint32_t vid, const char *name, const char *tag,
uint32_t snapid, uint32_t flags,
- const struct sd_inode *i, void *data)
+ const struct sd_inode *i, void *data,
+ uint32_t nr_vdis)
{
time_t ti;
struct tm tm;
@@ -272,17 +275,21 @@ static void for_each_node_print(uint64_t oid)
static void print_obj_ref(uint32_t vid, const char *name, const char *tag,
uint32_t snapid, uint32_t flags,
- const struct sd_inode *i, void *data)
+ const struct sd_inode *i, void *data,
+ uint32_t nr_vdis)
{
uint64_t oid = *(uint64_t *)data;
uint64_t idx = data_oid_to_idx(oid);
struct get_vdi_info info;
+ uint64_t vdi_mask = ((uint64_t)nr_vdis << VDI_SPACE_SHIFT)
+ - (1LU << VDI_SPACE_SHIFT);
if (i->data_vdi_id[idx] != 0 &&
- i->data_vdi_id[idx] == oid_to_vid(oid)) {
+ i->data_vdi_id[idx] == oid_to_vid(oid, vdi_mask)) {
memset(&info, 0, sizeof(info));
info.name = name;
- print_vdi_list(vid, name, tag, snapid, flags, i, &info);
+ print_vdi_list(vid, name, tag, snapid, flags, i, &info,
+ nr_vdis);
}
}
@@ -2959,7 +2966,8 @@ static int vdi_cache(int argc, char **argv)
static void construct_vdi_tree(uint32_t vid, const char *name, const char *tag,
uint32_t snapid, uint32_t flags,
- const struct sd_inode *i, void *data)
+ const struct sd_inode *i, void *data,
+ uint32_t nr_vdis)
{
add_vdi_tree(name, tag, vid, i->parent_vdi_id, false);
}
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 5db2394..7a2e2ad 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -451,11 +451,11 @@ static inline uint64_t sd_hash_oid(uint64_t oid)
* Create a hash value from a vdi name. We cannot use sd_hash_buf for this
* purpose because of backward compatibility.
*/
-static inline uint32_t sd_hash_vdi(const char *name)
+static inline uint32_t sd_hash_vdi(const char *name, uint32_t nr_vdis)
{
uint64_t hval = fnv_64a_buf(name, strlen(name), FNV1A_64_INIT);
- return (uint32_t)(hval & (SD_NR_VDIS - 1));
+ return (uint32_t)(hval & (nr_vdis - 1));
}
#ifndef __KERNEL__
@@ -535,9 +535,9 @@ static inline uint64_t vid_to_data_oid(uint32_t vid, uint64_t idx)
return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
}
-static inline uint32_t oid_to_vid(uint64_t oid)
+static inline uint32_t oid_to_vid(uint64_t oid, uint64_t mask)
{
- return (oid & SD_VDI_MASK) >> VDI_SPACE_SHIFT;
+ return (oid & mask) >> VDI_SPACE_SHIFT;
}
static inline uint64_t vid_to_attr_oid(uint32_t vid, uint32_t attrid)
diff --git a/sheep/config.c b/sheep/config.c
index 548a1e8..7ab3600 100644
--- a/sheep/config.c
+++ b/sheep/config.c
@@ -71,6 +71,7 @@ static int get_cluster_config(struct cluster_info *cinfo)
(cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES);
cinfo->copy_policy = config.copy_policy;
cinfo->block_size_shift = config.block_size_shift;
+ cinfo->vid_space = config.vid_space;
memcpy(cinfo->store, config.store, sizeof(config.store));
return SD_RES_SUCCESS;
@@ -79,6 +80,7 @@ static int get_cluster_config(struct cluster_info *cinfo)
int init_config_file(void)
{
int fd, ret = 0;
+ size_t bmp_size;
check_tmp_config();
@@ -135,7 +137,6 @@ reload:
sd_err("Designation of before a restart and a vnodes option is different.");
return -1;
}
-
ret = 0;
get_cluster_config(&sys->cinfo);
if ((config.flags & SD_CLUSTER_FLAG_DISKMODE) !=
@@ -144,6 +145,18 @@ reload:
"exists data format mismatch");
return -1;
}
+ if (!sys->cinfo.vid_space) {
+ sys->cinfo.vid_space = SD_VID_SPACE;
+ sys->nr_vdis = SD_NR_VDIS;
+ sys->vdi_mask = SD_VDI_MASK;
+ } else {
+ sys->nr_vdis = (1U << sys->cinfo.vid_space);
+ sys->vdi_mask = ((uint64_t)sys->nr_vdis << VDI_SPACE_SHIFT)
+ - (1LU << VDI_SPACE_SHIFT);
+ }
+ bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis);
+ sys->vdi_inuse = (unsigned long *)malloc(bmp_size);
+ sys->vdi_deleted = (unsigned long *)malloc(bmp_size);
create:
config.version = SD_FORMAT_VERSION;
@@ -171,6 +184,7 @@ int set_cluster_config(const struct cluster_info *cinfo)
config.copy_policy = cinfo->copy_policy;
config.flags = cinfo->flags;
config.block_size_shift = cinfo->block_size_shift;
+ config.vid_space = cinfo->vid_space;
memset(config.store, 0, sizeof(config.store));
pstrcpy((char *)config.store, sizeof(config.store),
(char *)cinfo->store);
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 89db9bf..b071f7a 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -117,7 +117,7 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr)
struct req_iter *reqs;
char *p, *buf = NULL;
uint8_t policy = req->rq.obj.copy_policy ?:
- get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
+ get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid, sys->vdi_mask));
int ed = 0, ep = 0, edp;
edp = ec_policy_to_dp(policy, &ed, &ep);
@@ -183,7 +183,7 @@ bool is_erasure_oid(uint64_t oid)
{
return !is_vdi_obj(oid) && !is_vdi_btree_obj(oid) &&
!is_ledger_object(oid) &&
- get_vdi_copy_policy(oid_to_vid(oid)) > 0;
+ get_vdi_copy_policy(oid_to_vid(oid, sys->vdi_mask)) > 0;
}
/* Prepare request iterator and buffer for each replica */
@@ -216,7 +216,8 @@ static void finish_requests(struct request *req, struct req_iter *reqs,
if (opcode == SD_OP_READ_OBJ) {
char *p, *buf = xmalloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe);
uint8_t policy = req->rq.obj.copy_policy ?:
- get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
+ get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid,
+ sys->vdi_mask));
int ed = 0, strip_size;
ec_policy_to_dp(policy, &ed, NULL);
@@ -496,7 +497,8 @@ static int gateway_forward_request(struct request *req)
nr_reqs = nr_to_send;
if (nr_to_send > nr_copies) {
uint8_t policy = req->rq.obj.copy_policy ?:
- get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid));
+ get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid,
+ sys->vdi_mask));
int ds;
/* Only for erasure code, nr_to_send might > nr_copies */
ec_policy_to_dp(policy, &ds, NULL);
@@ -638,7 +640,7 @@ int gateway_read_obj(struct request *req)
if ((req->rq.flags & SD_FLAG_CMD_TGT) &&
!is_inode_refresh_req(req) &&
- is_refresh_required(oid_to_vid(oid))) {
+ is_refresh_required(oid_to_vid(oid, sys->vdi_mask))) {
sd_debug("refresh is required: %"PRIx64, oid);
return SD_RES_INODE_INVALIDATED;
}
@@ -656,7 +658,7 @@ int gateway_read_obj(struct request *req)
return ret;
if (is_inode_refresh_req(req))
- validate_myself(oid_to_vid(oid));
+ validate_myself(oid_to_vid(oid, sys->vdi_mask));
return ret;
}
@@ -670,7 +672,7 @@ int gateway_write_obj(struct request *req)
struct generation_reference *refs = NULL;
if ((req->rq.flags & SD_FLAG_CMD_TGT) &&
- is_refresh_required(oid_to_vid(oid))) {
+ is_refresh_required(oid_to_vid(oid, sys->vdi_mask))) {
sd_debug("refresh is required: %"PRIx64, oid);
return SD_RES_INODE_INVALIDATED;
}
@@ -685,7 +687,7 @@ int gateway_write_obj(struct request *req)
if (is_data_vid_update(hdr)) {
size_t nr_vids = hdr->data_length / sizeof(*vids);
- invalidate_other_nodes(oid_to_vid(oid));
+ invalidate_other_nodes(oid_to_vid(oid, sys->vdi_mask));
/* read the previous vids to discard their references later */
vids = xzalloc(sizeof(*vids) * nr_vids);
@@ -713,7 +715,8 @@ out:
static int gateway_handle_cow(struct request *req)
{
uint64_t oid = req->rq.obj.oid;
- size_t len = get_objsize(oid, get_vdi_object_size(oid_to_vid(oid)));
+ size_t len = get_objsize(oid, get_vdi_object_size(oid_to_vid(oid,
+ sys->vdi_mask)));
struct sd_req hdr, *req_hdr = &req->rq;
char *buf = xvalloc(len);
int ret;
@@ -746,7 +749,7 @@ int gateway_create_and_write_obj(struct request *req)
uint64_t oid = req->rq.obj.oid;
if ((req->rq.flags & SD_FLAG_CMD_TGT) &&
- is_refresh_required(oid_to_vid(oid))) {
+ is_refresh_required(oid_to_vid(oid, sys->vdi_mask))) {
sd_debug("refresh is required: %"PRIx64, oid);
return SD_RES_INODE_INVALIDATED;
}
diff --git a/sheep/group.c b/sheep/group.c
index 85b9249..d03d667 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -18,8 +18,8 @@ struct node {
struct get_vdis_work {
struct work work;
- DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
- DECLARE_BITMAP(vdi_deleted, SD_NR_VDIS);
+ unsigned long *vdi_inuse;
+ unsigned long *vdi_deleted;
struct sd_node joined;
struct rb_root nroot;
};
@@ -587,6 +587,8 @@ static void get_vdis_done(struct work *work)
sd_mutex_unlock(&wait_vdis_lock);
rb_destroy(&w->nroot, struct sd_node, rb);
+ free(w->vdi_inuse);
+ free(w->vdi_deleted);
free(w);
if (refcount_read(&nr_get_vdis_works) == 0)
@@ -673,9 +675,13 @@ static void setup_backend_store(const struct cluster_info *cinfo)
static void get_vdis(const struct rb_root *nroot, const struct sd_node *joined)
{
struct get_vdis_work *w;
+ size_t bmp_size;
+ bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis);
w = xmalloc(sizeof(*w));
w->joined = *joined;
+ w->vdi_inuse = (unsigned long *)malloc(bmp_size);
+ w->vdi_deleted = (unsigned long *)malloc(bmp_size);
INIT_RB_ROOT(&w->nroot);
rb_copy(nroot, struct sd_node, rb, &w->nroot, node_cmp);
refcount_inc(&nr_get_vdis_works);
diff --git a/sheep/journal.c b/sheep/journal.c
index 4df9a74..3802c74 100644
--- a/sheep/journal.c
+++ b/sheep/journal.c
@@ -170,7 +170,8 @@ static int replay_journal_entry(struct journal_descriptor *jd)
return -1;
}
if (jd->create) {
- object_size = get_vdi_object_size(oid_to_vid(jd->oid));
+ object_size = get_vdi_object_size(oid_to_vid(jd->oid,
+ sys->vdi_mask));
ret = prealloc(fd, object_size);
if (ret < 0)
goto out;
diff --git a/sheep/nfs/fs.c b/sheep/nfs/fs.c
index ec92f12..eb070c1 100644
--- a/sheep/nfs/fs.c
+++ b/sheep/nfs/fs.c
@@ -151,7 +151,7 @@ static void dentry_add(struct inode *parent, struct dentry *dentry)
int fs_create_dir(struct inode *inode, const char *name, struct inode *parent)
{
uint64_t myino, pino = parent->ino;
- uint32_t vid = oid_to_vid(pino);
+ uint32_t vid = oid_to_vid(pino, sys->vdi_mask);
struct inode_data *id = prepare_inode_data(inode, vid, name);
struct dentry *entry;
int ret;
@@ -313,7 +313,7 @@ struct dentry *fs_lookup_dir(struct inode *inode, const char *name)
int fs_create_file(uint64_t pino, struct inode *new, const char *name)
{
- uint32_t vid = oid_to_vid(pino);
+ uint32_t vid = oid_to_vid(pino, sys->vdi_mask);
struct inode *inode;
struct dentry *dentry;
int ret;
diff --git a/sheep/nfs/nfs.c b/sheep/nfs/nfs.c
index 036f995..e01ac02 100644
--- a/sheep/nfs/nfs.c
+++ b/sheep/nfs/nfs.c
@@ -66,7 +66,7 @@ static void update_post_attr(struct inode *inode, fattr3 *post)
post->gid = inode->gid;
post->size = inode->size;
post->used = inode->used;
- post->fsid = oid_to_vid(inode->ino);
+ post->fsid = oid_to_vid(inode->ino, sys->vdi_mask);
post->fileid = inode->ino;
post->atime.seconds = inode->atime;
post->mtime.seconds = inode->mtime;
@@ -626,7 +626,7 @@ void *nfs3_fsstat(struct svc_req *req, struct nfs_arg *argp)
static FSSTAT3res result;
struct svc_fh *fh = get_svc_fh(argp);
struct sd_inode *sd_inode = xmalloc(sizeof(*sd_inode));
- uint32_t vid = oid_to_vid(fh->ino);
+ uint32_t vid = oid_to_vid(fh->ino, sys->vdi_mask);
uint64_t my = 0 , cow = 0;
int ret;
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index 3794c19..c5293f8 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -126,7 +126,8 @@ static inline bool idx_has_vdi_bit(uint64_t idx)
static inline size_t get_cache_block_size(uint64_t oid)
{
- uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
+ uint32_t object_size = get_vdi_object_size(oid_to_vid(oid,
+ sys->vdi_mask));
size_t bsize = DIV_ROUND_UP(get_objsize(oid, object_size),
sizeof(uint64_t) * BITS_PER_BYTE);
@@ -927,7 +928,7 @@ static int object_cache_push(struct object_cache *oc)
bool object_is_cached(uint64_t oid)
{
- uint32_t vid = oid_to_vid(oid);
+ uint32_t vid = oid_to_vid(oid, sys->vdi_mask);
uint64_t idx = object_cache_oid_to_idx(oid);
struct object_cache *cache;
@@ -992,7 +993,7 @@ get_cache_entry_from(struct object_cache *cache, uint64_t idx)
/* This helper increases the refcount */
static struct object_cache_entry *oid_to_entry(uint64_t oid)
{
- uint32_t vid = oid_to_vid(oid);
+ uint32_t vid = oid_to_vid(oid, sys->vdi_mask);
uint64_t idx = object_cache_oid_to_idx(oid);
struct object_cache *cache;
struct object_cache_entry *entry;
@@ -1065,7 +1066,7 @@ bool bypass_object_cache(const struct request *req)
return true;
if (req->rq.flags & SD_FLAG_CMD_DIRECT) {
- uint32_t vid = oid_to_vid(oid);
+ uint32_t vid = oid_to_vid(oid, sys->vdi_mask);
struct object_cache *cache;
cache = find_object_cache(vid, false);
@@ -1092,7 +1093,7 @@ int object_cache_handle_request(struct request *req)
{
struct sd_req *hdr = &req->rq;
uint64_t oid = req->rq.obj.oid;
- uint32_t vid = oid_to_vid(oid);
+ uint32_t vid = oid_to_vid(oid, sys->vdi_mask);
uint64_t idx = object_cache_oid_to_idx(oid);
struct object_cache *cache;
struct object_cache_entry *entry;
@@ -1208,7 +1209,7 @@ int object_cache_flush_vdi(uint32_t vid)
int object_cache_flush_and_del(const struct request *req)
{
- uint32_t vid = oid_to_vid(req->rq.obj.oid);
+ uint32_t vid = oid_to_vid(req->rq.obj.oid, sys->vdi_mask);
struct object_cache *cache;
cache = find_object_cache(vid, false);
diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
index b9acaa0..ad40ed3 100644
--- a/sheep/object_list_cache.c
+++ b/sheep/object_list_cache.c
@@ -154,7 +154,7 @@ static void objlist_deletion_work(struct work *work)
sd_write_lock(&obj_list_cache.lock);
rb_for_each_entry(entry, &obj_list_cache.root, node) {
- entry_vid = oid_to_vid(entry->oid);
+ entry_vid = oid_to_vid(entry->oid, sys->vdi_mask);
if (entry_vid != vid)
continue;
diff --git a/sheep/ops.c b/sheep/ops.c
index bc2848b..6768904 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -298,6 +298,7 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
char *store_name = data;
int32_t nr_vnodes;
struct vnode_info *vinfo = get_vnode_info();
+ size_t bmp_size;
driver = find_store_driver(data);
if (!driver) {
@@ -305,6 +306,24 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
goto out;
}
+ if (!req->cluster.vid_space) {
+ sys->cinfo.vid_space = SD_VID_SPACE;
+ sys->nr_vdis = SD_NR_VDIS;
+ sys->vdi_mask = SD_VDI_MASK;
+ } else {
+ sys->cinfo.vid_space = req->cluster.vid_space;
+ sys->nr_vdis = (1U << req->cluster.vid_space);
+ sys->vdi_mask = ((uint64_t)sys->nr_vdis << VDI_SPACE_SHIFT)
+ - (1LU << VDI_SPACE_SHIFT);
+ }
+ if (is_cluster_formatted()){
+ free(sys->vdi_inuse);
+ free(sys->vdi_deleted);
+ }
+ bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis);
+ sys->vdi_inuse = (unsigned long *)malloc(bmp_size);
+ sys->vdi_deleted = (unsigned long *)malloc(bmp_size);
+
pstrcpy((char *)sys->cinfo.store, sizeof(sys->cinfo.store),
store_name);
sd_store = driver;
@@ -338,8 +357,8 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
for (i = 1; i <= latest_epoch; i++)
remove_epoch(i);
- memset(sys->vdi_inuse, 0, sizeof(sys->vdi_inuse));
- memset(sys->vdi_deleted, 0, sizeof(sys->vdi_deleted));
+ memset(sys->vdi_inuse, 0, bmp_size);
+ memset(sys->vdi_deleted, 0, bmp_size);
clean_vdi_state();
sys->cinfo.epoch = 0;
@@ -411,7 +430,7 @@ static int cluster_get_vdi_attr(struct request *req)
* the current VDI id can change if we take a snapshot,
* so we use the hash value of the VDI name as the VDI id
*/
- vid = sd_hash_vdi(vattr->name);
+ vid = sd_hash_vdi(vattr->name, sys->nr_vdis);
ret = get_vdi_attr(req->data, hdr->data_length,
vid, &attrid, info.create_time,
!!(hdr->flags & SD_FLAG_CMD_CREAT),
@@ -528,6 +547,7 @@ static int local_stat_cluster(struct request *req)
elog->disable_recovery = sys->cinfo.disable_recovery;
elog->nr_copies = sys->cinfo.nr_copies;
elog->copy_policy = sys->cinfo.copy_policy;
+ elog->vid_space = sys->cinfo.vid_space;
elog->flags = sys->cinfo.flags;
strncpy(elog->drv_name, (char *)sys->cinfo.store,
STORE_LEN);
@@ -727,7 +747,7 @@ static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp,
static int cluster_delete_cache(const struct sd_req *req, struct sd_rsp *rsp,
void *data, const struct sd_node *sender)
{
- uint32_t vid = oid_to_vid(req->obj.oid);
+ uint32_t vid = oid_to_vid(req->obj.oid, sys->vdi_mask);
if (sys->enable_object_cache)
object_cache_delete(vid);
@@ -911,7 +931,7 @@ static int local_get_cache_info(struct request *request)
static int local_cache_purge(struct request *req)
{
const struct sd_req *hdr = &req->rq;
- uint32_t vid = oid_to_vid(req->rq.obj.oid);
+ uint32_t vid = oid_to_vid(req->rq.obj.oid, sys->vdi_mask);
if (hdr->flags == SD_FLAG_CMD_WRITE) {
object_cache_delete(vid);
@@ -936,7 +956,7 @@ static int local_flush_vdi(struct request *req)
int ret = SD_RES_INVALID_PARMS;
if (sys->enable_object_cache) {
- uint32_t vid = oid_to_vid(req->rq.obj.oid);
+ uint32_t vid = oid_to_vid(req->rq.obj.oid, sys->vdi_mask);
ret = object_cache_flush_vdi(vid);
}
@@ -946,7 +966,7 @@ static int local_flush_vdi(struct request *req)
static int local_discard_obj(struct request *req)
{
uint64_t oid = req->rq.obj.oid;
- uint32_t vid = oid_to_vid(oid), tmp_vid;
+ uint32_t vid = oid_to_vid(oid, sys->vdi_mask), tmp_vid;
int ret, idx = data_oid_to_idx(oid);
struct sd_inode *inode = xmalloc(sizeof(struct sd_inode));
@@ -1477,6 +1497,7 @@ static int local_get_cluster_default(const struct sd_req *req,
rsp->cluster_default.nr_copies = sys->cinfo.nr_copies;
rsp->cluster_default.copy_policy = sys->cinfo.copy_policy;
rsp->cluster_default.block_size_shift = sys->cinfo.block_size_shift;
+ rsp->cluster_default.vid_space = sys->cinfo.vid_space;
return SD_RES_SUCCESS;
}
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 92f9a14..390a964 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -152,7 +152,8 @@ static int default_trim(int fd, uint64_t oid, const struct siocb *iocb,
if (*poffset + *plen < iocb->offset + iocb->length) {
uint64_t end = iocb->offset + iocb->length;
- uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
+ uint32_t object_size = get_vdi_object_size(oid_to_vid(oid,
+ sys->vdi_mask));
if (end == get_objsize(oid, object_size))
/* This is necessary to punch the last block */
end = round_up(end, BLOCK_SIZE);
@@ -281,14 +282,16 @@ static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
"wat %s", oid, epoch, wd);
goto out;
}
- add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies,
- vdi_is_snapshot(inode), inode->copy_policy,
- inode->block_size_shift, inode->parent_vdi_id);
+ add_vdi_state_unordered(oid_to_vid(oid, sys->vdi_mask),
+ inode->nr_copies, vdi_is_snapshot(inode),
+ inode->copy_policy, inode->block_size_shift,
+ inode->parent_vdi_id);
if (inode->name[0] == '\0')
- atomic_set_bit(oid_to_vid(oid), sys->vdi_deleted);
+ atomic_set_bit(oid_to_vid(oid, sys->vdi_mask),
+ sys->vdi_deleted);
- atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
+ atomic_set_bit(oid_to_vid(oid, sys->vdi_mask), sys->vdi_inuse);
ret = SD_RES_SUCCESS;
out:
@@ -400,12 +403,14 @@ int prealloc(int fd, uint32_t size)
size_t get_store_objsize(uint64_t oid)
{
if (is_erasure_oid(oid)) {
- uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
+ uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid,
+ sys->vdi_mask));
int d;
ec_policy_to_dp(policy, &d, NULL);
- return get_vdi_object_size(oid_to_vid(oid)) / d;
+ return get_vdi_object_size(oid_to_vid(oid, sys->vdi_mask)) / d;
}
- return get_objsize(oid, get_vdi_object_size(oid_to_vid(oid)));
+ return get_objsize(oid, get_vdi_object_size(oid_to_vid(oid,
+ sys->vdi_mask)));
}
int default_create_and_write(uint64_t oid, const struct siocb *iocb)
@@ -454,7 +459,7 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
trim_zero_blocks(iocb->buf, &offset, &len);
- object_size = get_vdi_object_size(oid_to_vid(oid));
+ object_size = get_vdi_object_size(oid_to_vid(oid, sys->vdi_mask));
if (offset != 0 || len != get_objsize(oid, object_size)) {
if (is_sparse_object(oid))
diff --git a/sheep/recovery.c b/sheep/recovery.c
index dbd5146..b3e7a22 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -194,7 +194,7 @@ static void *read_erasure_object(uint64_t oid, uint8_t idx,
struct vnode_info *old = grab_vnode_info(rw->old_vinfo), *new_old;
uint32_t epoch = rw->epoch, tgt_epoch = rw->tgt_epoch;
const struct sd_node *node;
- uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
+ uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid, sys->vdi_mask));
int edp = ec_policy_to_dp(policy, NULL, NULL);
int ret;
struct sd_node *excluded = row->base.rinfo->excluded;
@@ -451,8 +451,9 @@ static void *rebuild_erasure_object(uint64_t oid, uint8_t idx,
int len = get_store_objsize(oid);
char *lost = xvalloc(len);
int i, j;
- uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
- uint32_t object_size = get_vdi_object_size(oid_to_vid(oid));
+ uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid, sys->vdi_mask));
+ uint32_t object_size = get_vdi_object_size(oid_to_vid(oid,
+ sys->vdi_mask));
int ed = 0, edp;
edp = ec_policy_to_dp(policy, &ed, NULL);
struct fec *ctx = ec_init(ed, edp);
@@ -492,7 +493,8 @@ out:
uint8_t local_ec_index(struct vnode_info *vinfo, uint64_t oid)
{
- int idx, m = min(get_vdi_copy_number(oid_to_vid(oid)), vinfo->nr_zones);
+ int idx, m = min(get_vdi_copy_number(oid_to_vid(oid, sys->vdi_mask)),
+ vinfo->nr_zones);
if (!is_erasure_oid(oid))
return SD_MAX_COPIES;
diff --git a/sheep/request.c b/sheep/request.c
index 2f86c67..79204ba 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -318,7 +318,8 @@ static bool has_enough_zones(struct request *req)
{
uint64_t oid = req->rq.obj.oid;
- return req->vinfo->nr_zones >= get_vdi_copy_number(oid_to_vid(oid));
+ return req->vinfo->nr_zones >= get_vdi_copy_number(oid_to_vid(oid,
+ sys->vdi_mask));
}
static void queue_gateway_request(struct request *req)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 5608cbc..3b0d46d 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -132,8 +132,8 @@ struct system_info {
uint32_t nr_vdis;
uint64_t vdi_mask;
- DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS);
- DECLARE_BITMAP(vdi_deleted, SD_NR_VDIS);
+ unsigned long *vdi_inuse;
+ unsigned long *vdi_deleted;
int local_req_efd;
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 2889df6..b41f4b7 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -237,7 +237,7 @@ bool oid_is_readonly(uint64_t oid)
if (!is_data_obj(oid))
return false;
- return vid_is_snapshot(oid_to_vid(oid));
+ return vid_is_snapshot(oid_to_vid(oid, sys->vdi_mask));
}
int get_vdi_copy_number(uint32_t vid)
@@ -314,7 +314,8 @@ uint8_t get_vdi_block_size_shift(uint32_t vid)
int get_obj_copy_number(uint64_t oid, int nr_zones)
{
- return min(get_vdi_copy_number(oid_to_vid(oid)), nr_zones);
+ return min(get_vdi_copy_number(oid_to_vid(oid, sys->vdi_mask)),
+ nr_zones);
}
int get_req_copy_number(struct request *req)
@@ -1346,8 +1347,8 @@ out:
* Return SUCCESS (range of bits set):
* Iff we get a bitmap range [left, right) that VDI might be set between. if
* right < start, this means a wrap around case where we should examine the
- * two split ranges, [left, SD_NR_VDIS - 1] and [0, right). 'Right' is the free
- * bit that might be used by newly created VDI.
+ * two split ranges, [left, sys->nr_vdis - 1] and [0, right).
+ * 'Right' is the free bit that might be used by newly created VDI.
*
* Otherwise:
* Return NO_VDI (bit not set) or FULL_VDI (bitmap fully set)
@@ -1355,15 +1356,15 @@ out:
static int get_vdi_bitmap_range(const char *name, unsigned long *left,
unsigned long *right)
{
- *left = sd_hash_vdi(name);
- *right = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, *left);
+ *left = sd_hash_vdi(name, sys->nr_vdis);
+ *right = find_next_zero_bit(sys->vdi_inuse, sys->nr_vdis, *left);
if (*left == *right)
return SD_RES_NO_VDI;
- if (*right == SD_NR_VDIS) {
+ if (*right == sys->nr_vdis) {
/* Wrap around */
- *right = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, 0);
- if (*right == SD_NR_VDIS)
+ *right = find_next_zero_bit(sys->vdi_inuse, sys->nr_vdis, 0);
+ if (*right == sys->nr_vdis)
return SD_RES_FULL_VDI;
}
return SD_RES_SUCCESS;
@@ -1458,7 +1459,7 @@ static int fill_vdi_info(unsigned long left, unsigned long right,
switch (ret) {
case SD_RES_NO_VDI:
case SD_RES_NO_TAG:
- ret = fill_vdi_info_range(left, SD_NR_VDIS - 1, iocb, info);
+ ret = fill_vdi_info_range(left, sys->nr_vdis - 1, iocb, info);
break;
default:
break;
@@ -1495,7 +1496,7 @@ int vdi_lookup(const struct vdi_iocb *iocb, struct vdi_info *info)
* TODO: for checking before creation, the below fill_vdi_info()
* isn't required. It must be eliminated.
*/
- return fill_vdi_info(0, SD_NR_VDIS, iocb, info);
+ return fill_vdi_info(0, sys->nr_vdis, iocb, info);
case SD_RES_FULL_VDI:
return ret;
case SD_RES_SUCCESS:
@@ -1629,22 +1630,30 @@ int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid)
int read_vdis(char *data, int len, unsigned int *rsp_len)
{
- if (len != sizeof(sys->vdi_inuse))
+ if (!is_cluster_formatted()) {
+ *rsp_len = 0;
+ return SD_RES_SUCCESS;
+ }
+ if (len != (sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis)))
return SD_RES_INVALID_PARMS;
- memcpy(data, sys->vdi_inuse, sizeof(sys->vdi_inuse));
- *rsp_len = sizeof(sys->vdi_inuse);
+ memcpy(data, sys->vdi_inuse, len);
+ *rsp_len = len;
return SD_RES_SUCCESS;
}
int read_del_vdis(char *data, int len, unsigned int *rsp_len)
{
- if (len != sizeof(sys->vdi_deleted))
+ if (!is_cluster_formatted()) {
+ *rsp_len = 0;
+ return SD_RES_SUCCESS;
+ }
+ if (len != sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis))
return SD_RES_INVALID_PARMS;
- memcpy(data, sys->vdi_deleted, sizeof(sys->vdi_deleted));
- *rsp_len = sizeof(sys->vdi_deleted);
+ memcpy(data, sys->vdi_deleted, len);
+ *rsp_len = len;
return SD_RES_SUCCESS;
}
diff --git a/sheepfs/volume.c b/sheepfs/volume.c
index d43304c..921d670 100644
--- a/sheepfs/volume.c
+++ b/sheepfs/volume.c
@@ -123,7 +123,7 @@ static int volume_rw_object(char *buf, uint64_t oid, size_t size,
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
int ret, fd, sock_idx;
bool create = false;
- uint32_t vid = oid_to_vid(oid), vdi_id;
+ uint32_t vid = oid_to_vid(oid, sys->vdi_mask), vdi_id;
struct vdi_inode *vdi;
unsigned long idx = 0;
uint64_t cow_oid = 0;
--
1.7.1
More information about the sheepdog
mailing list