Rewrite vnode mapping by making the structure containing the tuple a first class citizen and passing it around to all places that deal with vnodes. This in turn also allows getting rid of the list of these structures, as we can simply create one at each epoch change and reference count it. This also uncovered a nasty bug where queue_request called setup_access_to_local_objects before the vnode-related fields in the request were set up. Signed-off-by: Christoph Hellwig --- sheep/farm/trunk.c | 20 +++-- sheep/group.c | 98 +++++++++------------------- sheep/object_cache.c | 32 +++++---- sheep/sdnet.c | 73 +++++++++------------ sheep/sheep_priv.h | 26 +++---- sheep/store.c | 38 ++++------- sheep/vdi.c | 176 +++++++++++++++++++++------------------------------ 7 files changed, 201 insertions(+), 262 deletions(-) Index: sheepdog/sheep/group.c =================================================================== --- sheepdog.orig/sheep/group.c 2012-04-27 15:26:15.100043955 +0200 +++ sheepdog/sheep/group.c 2012-04-27 15:27:42.128046180 +0200 @@ -8,6 +8,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ +#include #include #include #include @@ -96,6 +97,7 @@ struct work_leave { }) static int event_running; +static struct vnode_info *current_vnode_info; static size_t get_join_message_size(struct join_message *jm) { @@ -124,67 +126,37 @@ int get_zones_nr_from(struct sd_node *no return nr_zones; } -struct vnodes_cache { - struct sd_vnode vnodes[SD_MAX_VNODES]; - int nr_vnodes; - int nr_zones; - uint32_t epoch; - - int refcnt; - struct list_head list; -}; - -int get_ordered_sd_vnode_list(struct sd_vnode **entries, - int *nr_vnodes, int *nr_zones) +struct vnode_info *get_vnode_info(void) { - static LIST_HEAD(vnodes_list); - struct vnodes_cache *cache; + assert(current_vnode_info); + current_vnode_info->refcnt++; + return current_vnode_info; +} - list_for_each_entry(cache, &vnodes_list, list) { - if (cache->epoch == sys->epoch) { - *entries = cache->vnodes; - *nr_vnodes = cache->nr_vnodes; - *nr_zones = cache->nr_zones; - cache->refcnt++; +void put_vnode_info(struct vnode_info *vnodes) +{ + if (vnodes && --vnodes->refcnt == 0) + free(vnodes); +} - return SD_RES_SUCCESS; - } - } +static int update_vnode_info(void) +{ + struct vnode_info *vnode_info; - cache = zalloc(sizeof(*cache)); - if (!cache) { + vnode_info = zalloc(sizeof(*vnode_info)); + if (!vnode_info) { eprintf("failed to allocate memory\n"); - *entries = NULL; - return SD_RES_NO_MEM; + return 1; } - cache->nr_zones = sys->nr_zones; - memcpy(cache->vnodes, sys->vnodes, sizeof(sys->vnodes[0]) * sys->nr_vnodes); - cache->nr_vnodes = sys->nr_vnodes; - cache->epoch = sys->epoch; - cache->refcnt++; + vnode_info->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes, + vnode_info->entries); + vnode_info->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes); + vnode_info->refcnt = 1; - *entries = cache->vnodes; - *nr_vnodes = cache->nr_vnodes; - *nr_zones = cache->nr_zones; - - list_add(&cache->list, &vnodes_list); - - return SD_RES_SUCCESS; -} - -void free_ordered_sd_vnode_list(struct sd_vnode *entries) -{ - struct vnodes_cache *cache; - - if (!entries) - return; - - cache = container_of(entries, struct vnodes_cache, vnodes[0]); - if (--cache->refcnt == 0) { - list_del(&cache->list); - free(cache); - } + put_vnode_info(current_vnode_info); + current_vnode_info = vnode_info; + return 0; } static void do_cluster_op(void *arg) @@ -570,9 +542,6 @@ static void update_cluster_info(struct j join_finished: sys->nodes[sys->nr_nodes++] = *joined; qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); - sys->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes, - sys->vnodes); - sys->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes); if (msg->cluster_status == SD_STATUS_OK || msg->cluster_status == SD_STATUS_HALT) { @@ -588,6 +557,7 @@ join_finished: set_cluster_ctime(msg->ctime); } } + update_vnode_info(); print_node_list(sys->nodes, sys->nr_nodes); @@ -822,7 +792,7 @@ static void __sd_join_done(struct event_ } if (sys_stat_halt()) { - if (sys->nr_zones >= sys->nr_sobjs) + if (current_vnode_info->nr_zones >= sys->nr_sobjs) sys_stat_set(SD_STATUS_OK); } @@ -838,15 +808,13 @@ static void __sd_leave_done(struct event sys->nr_nodes = w->member_list_entries; memcpy(sys->nodes, w->member_list, sizeof(*sys->nodes) * sys->nr_nodes); qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); - sys->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes, - sys->vnodes); - sys->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes); if (sys_can_recover()) { sys->epoch++; update_epoch_store(sys->epoch); update_epoch_log(sys->epoch); } + update_vnode_info(); print_node_list(sys->nodes, sys->nr_nodes); @@ -854,7 +822,7 @@ static void __sd_leave_done(struct event start_recovery(sys->epoch); if (sys_can_halt()) { - if (sys->nr_zones < sys->nr_sobjs) + if (current_vnode_info->nr_zones < sys->nr_sobjs) sys_stat_set(SD_STATUS_HALT); } } @@ -1022,8 +990,8 @@ static void process_request_queue(void) if (is_io_op(req->op)) { int copies = sys->nr_sobjs; - if (copies > req->nr_zones) - copies = req->nr_zones; + if (copies > req->vnodes->nr_zones) + copies = req->vnodes->nr_zones; if (!(req->rq.flags & SD_FLAG_CMD_IO_LOCAL) && object_is_cached(hdr->oid)) { @@ -1200,9 +1168,9 @@ void sd_join_handler(struct sd_node *joi sys->join_finished = 1; sys->nodes[sys->nr_nodes++] = sys->this_node; qsort(sys->nodes, sys->nr_nodes, sizeof(*sys->nodes), node_cmp); - sys->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes, - sys->vnodes); sys->epoch = get_latest_epoch(); + + update_vnode_info(); } nr_local = get_nodes_nr_epoch(sys->epoch); Index: sheepdog/sheep/object_cache.c =================================================================== --- sheepdog.orig/sheep/object_cache.c 2012-04-27 15:26:15.100043955 +0200 +++ sheepdog/sheep/object_cache.c 2012-04-27 15:27:42.128046180 +0200 @@ -374,7 +374,7 @@ int object_cache_pull(struct object_cach uint64_t oid; struct sd_obj_req hdr = { 0 }; struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; - struct sd_vnode *vnodes = sys->vnodes; + struct vnode_info *vnodes = get_vnode_info(); void *buf; int copies; @@ -393,13 +393,14 @@ int object_cache_pull(struct object_cach } copies = sys->nr_sobjs; - if (sys->nr_zones < copies) - copies = sys->nr_zones; + if (vnodes->nr_zones < copies) + copies = vnodes->nr_zones; /* Check if we can read locally */ for (i = 0; i < copies; i++) { - n = obj_to_sheep(vnodes, sys->nr_vnodes, oid, i); - if (is_myself(vnodes[n].addr, vnodes[n].port)) { + n = obj_to_sheep(vnodes->entries, vnodes->nr_vnodes, oid, i); + if (is_myself(vnodes->entries[n].addr, + vnodes->entries[n].port)) { struct siocb iocb = { 0 }; iocb.epoch = sys->epoch; ret = sd_store->open(oid, &iocb, 0); @@ -423,8 +424,8 @@ int object_cache_pull(struct object_cach pull_remote: /* Okay, no luck, let's read remotely */ for (i = 0; i < copies; i++) { - n = obj_to_sheep(vnodes, sys->nr_vnodes, oid, i); - if (is_myself(vnodes[n].addr, vnodes[n].port)) + n = obj_to_sheep(vnodes->entries, vnodes->nr_vnodes, oid, i); + if (is_myself(vnodes->entries[n].addr, vnodes->entries[n].port)) continue; hdr.opcode = SD_OP_READ_OBJ; @@ -433,7 +434,10 @@ pull_remote: hdr.data_length = rlen = data_length; hdr.flags = SD_FLAG_CMD_IO_LOCAL; - fd = get_sheep_fd(vnodes[n].addr, vnodes[n].port, vnodes[n].node_idx, hdr.epoch); + fd = get_sheep_fd(vnodes->entries[n].addr, + vnodes->entries[n].port, + vnodes->entries[n].node_idx, + hdr.epoch); if (fd < 0) continue; @@ -455,6 +459,7 @@ out: if (ret == SD_RES_SUCCESS) ret = create_cache_object(oc, idx, buf, read_len); free(buf); + put_vnode_info(vnodes); return ret; } @@ -502,15 +507,14 @@ static int push_cache_object(uint32_t vi hdr->epoch = sys->epoch; fake_req.data = buf; fake_req.op = get_sd_op(hdr->opcode); - fake_req.entry = sys->vnodes; - fake_req.nr_vnodes = sys->nr_vnodes; - fake_req.nr_zones = get_zones_nr_from(sys->nodes, sys->nr_vnodes); + fake_req.vnodes = get_vnode_info(); ret = forward_write_obj_req(&fake_req); - if (ret != SD_RES_SUCCESS) { + if (ret != SD_RES_SUCCESS) eprintf("failed to push object %x\n", ret); - goto out; - } + + put_vnode_info(fake_req.vnodes); + out: free(buf); return ret; Index: sheepdog/sheep/sdnet.c =================================================================== --- sheepdog.orig/sheep/sdnet.c 2012-04-27 15:26:15.100043955 +0200 +++ sheepdog/sheep/sdnet.c 2012-04-27 15:27:42.128046180 +0200 @@ -38,6 +38,7 @@ void resume_pending_requests(void) static int is_access_local(struct request *req, uint64_t oid, int copies) { + struct vnode_info *vnodes = req->vnodes; int i, n; if (oid == 0) @@ -45,13 +46,13 @@ static int is_access_local(struct reques if (copies) copies = sys->nr_sobjs; - if (copies > req->nr_zones) - copies = req->nr_zones; + if (copies > vnodes->nr_zones) + copies = vnodes->nr_zones; for (i = 0; i < copies; i++) { - n = obj_to_sheep(req->entry, req->nr_vnodes, oid, i); + n = obj_to_sheep(vnodes->entries, vnodes->nr_vnodes, oid, i); - if (is_myself(req->entry[n].addr, req->entry[n].port)) + if (is_myself(vnodes->entries[n].addr, vnodes->entries[n].port)) return 1; } @@ -155,12 +156,8 @@ static void io_op_done(struct work *work retry: req->rq.epoch = sys->epoch; - if (req->entry) - free_ordered_sd_vnode_list(req->entry); - if (get_ordered_sd_vnode_list(&req->entry, &req->nr_vnodes, - &req->nr_zones) != SD_RES_SUCCESS) - panic("failed to setup vnode list\n"); - + put_vnode_info(req->vnodes); + req->vnodes = get_vnode_info(); setup_access_to_local_objects(req); list_add_tail(&req->cev.event_list, &sys->request_queue); @@ -300,7 +297,6 @@ static void queue_request(struct request if (is_io_op(req->op)) { req->work.fn = do_io_request; req->work.done = io_op_done; - setup_access_to_local_objects(req); } else if (is_local_op(req->op)) { req->work.fn = do_local_request; req->work.done = local_op_done; @@ -327,11 +323,9 @@ static void queue_request(struct request list_del(&req->r_wlist); - assert(req->entry == NULL); - if (get_ordered_sd_vnode_list(&req->entry, &req->nr_vnodes, - &req->nr_zones) != SD_RES_SUCCESS) - panic("failed to setup vnode list\n"); - + req->vnodes = get_vnode_info(); + if (is_io_op(req->op)) + setup_access_to_local_objects(req); cevent->ctype = EVENT_REQUEST; list_add_tail(&cevent->event_list, &sys->request_queue); process_request_event_queues(); @@ -377,7 +371,7 @@ static void free_request(struct request sys->outstanding_data_size -= req->data_length; list_del(&req->r_siblings); - free_ordered_sd_vnode_list(req->entry); + put_vnode_info(req->vnodes); free(req->data); free(req); } @@ -681,8 +675,7 @@ int create_listen_port(int port, void *d return create_listen_ports(port, create_listen_port_fn, data); } -int write_object(struct sd_vnode *e, - int vnodes, int zones, uint32_t node_version, +int write_object(struct vnode_info *vnodes, uint32_t node_version, uint64_t oid, char *data, unsigned int datalen, uint64_t offset, uint16_t flags, int nr, int create) { @@ -690,15 +683,15 @@ int write_object(struct sd_vnode *e, int i, n, fd, ret; char name[128]; - if (nr > zones) - nr = zones; + if (nr > vnodes->nr_zones) + nr = vnodes->nr_zones; for (i = 0; i < nr; i++) { unsigned rlen = 0, wlen = datalen; - n = obj_to_sheep(e, vnodes, oid, i); + n = obj_to_sheep(vnodes->entries, vnodes->nr_vnodes, oid, i); - if (is_myself(e[n].addr, e[n].port)) { + if (is_myself(vnodes->entries[n].addr, vnodes->entries[n].port)) { ret = write_object_local(oid, data, datalen, offset, flags, nr, node_version, create); @@ -710,9 +703,9 @@ int write_object(struct sd_vnode *e, continue; } - addr_to_str(name, sizeof(name), e[n].addr, 0); + addr_to_str(name, sizeof(name), vnodes->entries[n].addr, 0); - fd = connect_to(name, e[n].port); + fd = connect_to(name, vnodes->entries[n].port); if (fd < 0) { eprintf("failed to connect to host %s\n", name); return -1; @@ -744,8 +737,7 @@ int write_object(struct sd_vnode *e, return 0; } -int read_object(struct sd_vnode *e, - int vnodes, int zones, uint32_t node_version, +int read_object(struct vnode_info *vnodes, uint32_t node_version, uint64_t oid, char *data, unsigned int datalen, uint64_t offset, int nr) { @@ -754,14 +746,14 @@ int read_object(struct sd_vnode *e, char name[128]; int i = 0, n, fd, ret, last_error = SD_RES_SUCCESS; - if (nr > zones) - nr = zones; + if (nr > vnodes->nr_zones) + nr = vnodes->nr_zones; /* search a local object first */ for (i = 0; i < nr; i++) { - n = obj_to_sheep(e, vnodes, oid, i); + n = obj_to_sheep(vnodes->entries, vnodes->nr_vnodes, oid, i); - if (is_myself(e[n].addr, e[n].port)) { + if (is_myself(vnodes->entries[n].addr, vnodes->entries[n].port)) { ret = read_object_local(oid, data, datalen, offset, nr, node_version); @@ -778,11 +770,11 @@ int read_object(struct sd_vnode *e, for (i = 0; i < nr; i++) { unsigned wlen = 0, rlen = datalen; - n = obj_to_sheep(e, vnodes, oid, i); + n = obj_to_sheep(vnodes->entries, vnodes->nr_vnodes, oid, i); - addr_to_str(name, sizeof(name), e[n].addr, 0); + addr_to_str(name, sizeof(name), vnodes->entries[n].addr, 0); - fd = connect_to(name, e[n].port); + fd = connect_to(name, vnodes->entries[n].port); if (fd < 0) { printf("%s(%d): %s, %m\n", __func__, __LINE__, name); @@ -815,8 +807,7 @@ int read_object(struct sd_vnode *e, return last_error; } -int remove_object(struct sd_vnode *e, - int vnodes, int zones, uint32_t node_version, +int remove_object(struct vnode_info *vnodes, uint32_t node_version, uint64_t oid, int nr) { char name[128]; @@ -824,17 +815,17 @@ int remove_object(struct sd_vnode *e, struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; int i = 0, n, fd, ret, err = 0; - if (nr > zones) - nr = zones; + if (nr > vnodes->nr_zones) + nr = vnodes->nr_zones; for (i = 0; i < nr; i++) { unsigned wlen = 0, rlen = 0; - n = obj_to_sheep(e, vnodes, oid, i); + n = obj_to_sheep(vnodes->entries, vnodes->nr_vnodes, oid, i); - addr_to_str(name, sizeof(name), e[n].addr, 0); + addr_to_str(name, sizeof(name), vnodes->entries[n].addr, 0); - fd = connect_to(name, e[n].port); + fd = connect_to(name, vnodes->entries[n].port); if (fd < 0) { rsp->result = SD_RES_EIO; return -1; Index: sheepdog/sheep/sheep_priv.h =================================================================== --- sheepdog.orig/sheep/sheep_priv.h 2012-04-27 15:26:15.100043955 +0200 +++ sheepdog/sheep/sheep_priv.h 2012-04-27 15:27:42.128046180 +0200 @@ -62,6 +62,13 @@ struct client_info { int refcnt; }; +struct vnode_info { + struct sd_vnode entries[SD_MAX_VNODES]; + int nr_vnodes; + int nr_zones; + int refcnt; +}; + struct request; typedef void (*req_end_t) (struct request *); @@ -84,9 +91,7 @@ struct request { uint64_t local_oid; uint64_t local_cow_oid; - struct sd_vnode *entry; - int nr_vnodes; - int nr_zones; + struct vnode_info *vnodes; int check_consistency; req_end_t done; @@ -138,7 +143,6 @@ struct cluster_info { struct list_head blocking_conn_list; uint32_t nr_sobjs; - int nr_zones; struct list_head request_queue; struct list_head event_queue; @@ -236,9 +240,8 @@ int get_vdi_attr(uint32_t epoch, struct int write, int excl, int delete); int get_zones_nr_from(struct sd_node *nodes, int nr_nodes); -int get_ordered_sd_vnode_list(struct sd_vnode **entries, - int *nr_vnodes, int *nr_zones); -void free_ordered_sd_vnode_list(struct sd_vnode *entries); +struct vnode_info *get_vnode_info(void); +void put_vnode_info(struct vnode_info *vnodes); int is_access_to_busy_objects(uint64_t oid); void resume_pending_requests(void); @@ -293,16 +296,13 @@ void resume_recovery_work(void); int is_recoverying_oid(uint64_t oid); int node_in_recovery(void); -int write_object(struct sd_vnode *e, - int vnodes, int zones, uint32_t node_version, +int write_object(struct vnode_info *vnodes, uint32_t node_version, uint64_t oid, char *data, unsigned int datalen, uint64_t offset, uint16_t flags, int nr, int create); -int read_object(struct sd_vnode *e, - int vnodes, int zones, uint32_t node_version, +int read_object(struct vnode_info *vnodes, uint32_t node_version, uint64_t oid, char *data, unsigned int datalen, uint64_t offset, int nr); -int remove_object(struct sd_vnode *e, - int vnodes, int zones, uint32_t node_version, +int remove_object(struct vnode_info *vnodes, uint32_t node_version, uint64_t oid, int nr); int merge_objlist(uint64_t *list1, int nr_list1, uint64_t *list2, int nr_list2); Index: sheepdog/sheep/store.c =================================================================== --- sheepdog.orig/sheep/store.c 2012-04-27 15:26:15.100043955 +0200 +++ sheepdog/sheep/store.c 2012-04-27 15:27:42.132046180 +0200 @@ -223,7 +223,7 @@ int get_obj_list(const struct sd_list_re static int read_copy_from_replica(struct request *req, uint32_t epoch, uint64_t oid, char *buf) { - int i, n, nr, nr_vnodes, ret; + int i, n, nr, ret; unsigned wlen, rlen; char name[128]; struct sd_vnode *e; @@ -232,15 +232,14 @@ static int read_copy_from_replica(struct struct siocb iocb; int fd; - e = req->entry; - nr_vnodes = req->nr_vnodes; + e = req->vnodes->entries; nr = sys->nr_sobjs; - if (nr > req->nr_zones) - nr = req->nr_zones; + if (nr > req->vnodes->nr_zones) + nr = req->vnodes->nr_zones; for (i = 0; i < nr; i++) { - n = obj_to_sheep(e, nr_vnodes, oid, i); + n = obj_to_sheep(e, req->vnodes->nr_vnodes, oid, i); addr_to_str(name, sizeof(name), e[n].addr, 0); @@ -302,7 +301,7 @@ static int do_local_io(struct request *r static int forward_read_obj_req(struct request *req) { - int i, n, nr, fd, ret = SD_RES_SUCCESS; + int i, n, fd, ret = SD_RES_SUCCESS; unsigned wlen, rlen; struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq; struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&hdr; @@ -310,22 +309,21 @@ static int forward_read_obj_req(struct r uint64_t oid = hdr.oid; int copies; - e = req->entry; - nr = req->nr_vnodes; + e = req->vnodes->entries; copies = hdr.copies; /* temporary hack */ if (!copies) copies = sys->nr_sobjs; - if (copies > req->nr_zones) - copies = req->nr_zones; + if (copies > req->vnodes->nr_zones) + copies = req->vnodes->nr_zones; hdr.flags |= SD_FLAG_CMD_IO_LOCAL; /* TODO: we can do better; we need to check this first */ for (i = 0; i < copies; i++) { - n = obj_to_sheep(e, nr, oid, i); + n = obj_to_sheep(e, req->vnodes->nr_vnodes, oid, i); if (is_myself(e[n].addr, e[n].port)) { ret = do_local_io(req, hdr.epoch); @@ -337,7 +335,7 @@ static int forward_read_obj_req(struct r read_remote: for (i = 0; i < copies; i++) { - n = obj_to_sheep(e, nr, oid, i); + n = obj_to_sheep(e, req->vnodes->nr_vnodes, oid, i); if (is_myself(e[n].addr, e[n].port)) continue; @@ -367,28 +365,26 @@ read_remote: int forward_write_obj_req(struct request *req) { - int i, n, nr, fd, ret, pollret; + int i, n, fd, ret, pollret; unsigned wlen; char name[128]; struct sd_obj_req hdr = *(struct sd_obj_req *)&req->rq; struct sd_obj_rsp *rsp = (struct sd_obj_rsp *)&req->rp; - struct sd_vnode *e; + struct sd_vnode *e = req->vnodes->entries; uint64_t oid = hdr.oid; int copies; struct pollfd pfds[SD_MAX_REDUNDANCY]; int nr_fds, local = 0; dprintf("%"PRIx64"\n", oid); - e = req->entry; - nr = req->nr_vnodes; copies = hdr.copies; /* temporary hack */ if (!copies) copies = sys->nr_sobjs; - if (copies > req->nr_zones) - copies = req->nr_zones; + if (copies > req->vnodes->nr_zones) + copies = req->vnodes->nr_zones; nr_fds = 0; memset(pfds, 0, sizeof(pfds)); @@ -400,9 +396,9 @@ int forward_write_obj_req(struct request wlen = hdr.data_length; for (i = 0; i < copies; i++) { - n = obj_to_sheep(e, nr, oid, i); + n = obj_to_sheep(req->vnodes->entries, req->vnodes->nr_vnodes, oid, i); - addr_to_str(name, sizeof(name), e[n].addr, 0); + addr_to_str(name, sizeof(name), req->vnodes->entries[n].addr, 0); if (is_myself(e[n].addr, e[n].port)) { local = 1; Index: sheepdog/sheep/vdi.c =================================================================== --- sheepdog.orig/sheep/vdi.c 2012-04-27 15:26:15.100043955 +0200 +++ sheepdog/sheep/vdi.c 2012-04-27 15:32:33.272053635 +0200 @@ -21,17 +21,16 @@ static int create_vdi_obj(uint32_t epoch uint32_t base_vid, uint32_t cur_vid, uint32_t copies, uint32_t snapid, int is_snapshot) { - struct sd_vnode *entries = NULL; + struct vnode_info *vnode_info; /* we are not called concurrently */ struct sheepdog_inode *new = NULL, *base = NULL, *cur = NULL; struct timeval tv; - int ret, nr_vnodes, nr_zones; + int ret = SD_RES_NO_MEM; unsigned long block_size = SD_DATA_OBJ_SIZE; new = zalloc(sizeof(*new)); if (!new) { eprintf("failed to allocate memory\n"); - ret = SD_RES_NO_MEM; goto out; } @@ -39,7 +38,6 @@ static int create_vdi_obj(uint32_t epoch base = zalloc(sizeof(*base)); if (!base) { eprintf("failed to allocate memory\n"); - ret = SD_RES_NO_MEM; goto out; } } @@ -48,22 +46,19 @@ static int create_vdi_obj(uint32_t epoch cur = zalloc(SD_INODE_HEADER_SIZE); if (!cur) { eprintf("failed to allocate memory\n"); - ret = SD_RES_NO_MEM; goto out; } } - ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones); - if (ret != SD_RES_SUCCESS) - goto out; + vnode_info = get_vnode_info(); if (base_vid) { - ret = read_object(entries, nr_vnodes, nr_zones, epoch, + ret = read_object(vnode_info, epoch, vid_to_vdi_oid(base_vid), (char *)base, sizeof(*base), 0, copies); if (ret != SD_RES_SUCCESS) { ret = SD_RES_BASE_VDI_READ; - goto out; + goto out_put_vnode_info; } } @@ -74,13 +69,13 @@ static int create_vdi_obj(uint32_t epoch vprintf(SDOG_INFO, "tree snapshot %s %" PRIx32 " %" PRIx32 "\n", name, cur_vid, base_vid); - ret = read_object(entries, nr_vnodes, nr_zones, epoch, + ret = read_object(vnode_info, epoch, vid_to_vdi_oid(cur_vid), (char *)cur, SD_INODE_HEADER_SIZE, 0, copies); if (ret != SD_RES_SUCCESS) { vprintf(SDOG_ERR, "failed\n"); ret = SD_RES_BASE_VDI_READ; - goto out; + goto out_put_vnode_info; } cur->snap_ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000; @@ -112,39 +107,41 @@ static int create_vdi_obj(uint32_t epoch if (i == ARRAY_SIZE(base->child_vdi_id)) { ret = SD_RES_NO_BASE_VDI; - goto out; + goto out_put_vnode_info; } } if (is_snapshot && cur_vid != base_vid) { - ret = write_object(entries, nr_vnodes, nr_zones, epoch, + ret = write_object(vnode_info, epoch, vid_to_vdi_oid(cur_vid), (char *)cur, SD_INODE_HEADER_SIZE, 0, 0, copies, 0); if (ret != 0) { vprintf(SDOG_ERR, "failed\n"); ret = SD_RES_BASE_VDI_READ; - goto out; + goto out_put_vnode_info; } } if (base_vid) { - ret = write_object(entries, nr_vnodes, nr_zones, epoch, + ret = write_object(vnode_info, epoch, vid_to_vdi_oid(base_vid), (char *)base, SD_INODE_HEADER_SIZE, 0, 0, copies, 0); if (ret != 0) { vprintf(SDOG_ERR, "failed\n"); ret = SD_RES_BASE_VDI_WRITE; - goto out; + goto out_put_vnode_info; } } - ret = write_object(entries, nr_vnodes, nr_zones, epoch, + ret = write_object(vnode_info, epoch, vid_to_vdi_oid(new_vid), (char *)new, sizeof(*new), 0, 0, copies, 1); if (ret != 0) ret = SD_RES_VDI_WRITE; + +out_put_vnode_info: + put_vnode_info(vnode_info); out: - free_ordered_sd_vnode_list(entries); free(new); free(cur); free(base); @@ -156,34 +153,32 @@ static int find_first_vdi(uint32_t epoch unsigned long *deleted_nr, uint32_t *next_snap, unsigned int *nr_copies, uint64_t *ctime) { - struct sd_vnode *entries = NULL; + struct vnode_info *vnode_info; struct sheepdog_inode *inode = NULL; unsigned long i; - int nr_vnodes, nr_zones, nr_reqs; - int ret, vdi_found = 0; + int nr_reqs; + int ret = SD_RES_NO_MEM; + int vdi_found = 0; inode = malloc(SD_INODE_HEADER_SIZE); if (!inode) { eprintf("failed to allocate memory\n"); - ret = SD_RES_NO_MEM; goto out; } - ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones); - if (ret != SD_RES_SUCCESS) - goto out; + vnode_info = get_vnode_info(); nr_reqs = sys->nr_sobjs; - if (nr_reqs > nr_zones) - nr_reqs = nr_zones; + if (nr_reqs > vnode_info->nr_zones) + nr_reqs = vnode_info->nr_zones; for (i = start; i >= end; i--) { - ret = read_object(entries, nr_vnodes, nr_zones, epoch, + ret = read_object(vnode_info, epoch, vid_to_vdi_oid(i), (char *)inode, SD_INODE_HEADER_SIZE, 0, nr_reqs); if (ret != SD_RES_SUCCESS) { ret = SD_RES_EIO; - goto out; + goto out_put_vnode_info; } if (inode->name[0] == '\0') { @@ -205,7 +200,7 @@ static int find_first_vdi(uint32_t epoch if (ctime) *ctime = inode->ctime; ret = SD_RES_SUCCESS; - goto out; + goto out_put_vnode_info; } } @@ -213,14 +208,14 @@ static int find_first_vdi(uint32_t epoch ret = SD_RES_NO_TAG; else ret = SD_RES_NO_VDI; -out: - free(inode); - free_ordered_sd_vnode_list(entries); +out_put_vnode_info: + put_vnode_info(vnode_info); + free(inode); +out: return ret; } - static int do_lookup_vdi(uint32_t epoch, char *name, int namelen, uint32_t *vid, char *tag, uint32_t snapid, uint32_t *next_snapid, unsigned long *right_nr, unsigned long *deleted_nr, @@ -345,7 +340,6 @@ int del_vdi(uint32_t epoch, char *data, uint32_t dummy0; unsigned long dummy1, dummy2; int ret; - struct sd_vnode *entries = NULL; struct sheepdog_inode *inode = NULL; inode = malloc(SD_INODE_HEADER_SIZE); @@ -372,8 +366,6 @@ int del_vdi(uint32_t epoch, char *data, ret = start_deletion(*vid, epoch); out: free(inode); - free_ordered_sd_vnode_list(entries); - return ret; } @@ -400,9 +392,7 @@ struct deletion_work { int count; uint32_t *buf; - struct sd_vnode entries[SD_MAX_VNODES]; - int nr_vnodes; - int nr_zones; + struct vnode_info *vnodes; int delete_error; }; @@ -420,12 +410,11 @@ static int delete_inode(struct deletion_ } nr_reqs = sys->nr_sobjs; - if (nr_reqs > dw->nr_zones) - nr_reqs = dw->nr_zones; + if (nr_reqs > dw->vnodes->nr_zones) + nr_reqs = dw->vnodes->nr_zones; - ret = read_object(dw->entries, dw->nr_vnodes, dw->nr_zones, dw->epoch, - vid_to_vdi_oid(dw->vid), (char *)inode, - SD_INODE_HEADER_SIZE, 0, nr_reqs); + ret = read_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(dw->vid), + (char *)inode, SD_INODE_HEADER_SIZE, 0, nr_reqs); if (ret != SD_RES_SUCCESS) { ret = SD_RES_EIO; goto out; @@ -436,9 +425,9 @@ static int delete_inode(struct deletion_ else memset(inode->name, 0, sizeof(inode->name)); - ret = write_object(dw->entries, dw->nr_vnodes, dw->nr_zones, dw->epoch, - vid_to_vdi_oid(dw->vid), (char *)inode, - SD_INODE_HEADER_SIZE, 0, 0, nr_reqs, 0); + ret = write_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(dw->vid), + (char *)inode, SD_INODE_HEADER_SIZE, 0, 0, + nr_reqs, 0); if (ret != 0) { ret = SD_RES_EIO; goto out; @@ -465,8 +454,8 @@ static void delete_one(struct work *work goto out; } - ret = read_object(dw->entries, dw->nr_vnodes, dw->nr_zones, dw->epoch, - vid_to_vdi_oid(vdi_id), (void *)inode, sizeof(*inode), + ret = read_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id), + (void *)inode, sizeof(*inode), 0, sys->nr_sobjs); if (ret != SD_RES_SUCCESS) { @@ -484,7 +473,7 @@ static void delete_one(struct work *work continue; } - ret = remove_object(dw->entries, dw->nr_vnodes, dw->nr_zones, dw->epoch, + ret = remove_object(dw->vnodes, dw->epoch, vid_to_data_oid(inode->data_vdi_id[i], i), inode->nr_copies); @@ -494,10 +483,11 @@ static void delete_one(struct work *work inode->data_vdi_id[i] = 0; } - if (dw->delete_error) - write_object(dw->entries, dw->nr_vnodes, dw->nr_zones, dw->epoch, - vid_to_vdi_oid(vdi_id), (void *)inode, sizeof(*inode), - 0, 0, inode->nr_copies, 0); + if (dw->delete_error) { + write_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id), + (void *)inode, sizeof(*inode), 0, 0, + inode->nr_copies, 0); + } out: free(inode); @@ -517,6 +507,7 @@ static void delete_one_done(struct work list_del(&dw->dw_siblings); + put_vnode_info(dw->vnodes); free(dw->buf); free(dw); @@ -528,9 +519,7 @@ static void delete_one_done(struct work } } -static int fill_vdi_list(struct deletion_work *dw, - struct sd_vnode *entries, - int nr_vnodes, int nr_zones, uint32_t root_vid) +static int fill_vdi_list(struct deletion_work *dw, uint32_t root_vid) { int ret, i; struct sheepdog_inode *inode = NULL; @@ -546,9 +535,9 @@ static int fill_vdi_list(struct deletion dw->buf[dw->count++] = root_vid; again: vid = dw->buf[done++]; - ret = read_object(entries, nr_vnodes, nr_zones, dw->epoch, - vid_to_vdi_oid(vid), (char *)inode, - SD_INODE_HEADER_SIZE, 0, sys->nr_sobjs); + ret = read_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vid), + (char *)inode, SD_INODE_HEADER_SIZE, 0, + sys->nr_sobjs); if (ret != SD_RES_SUCCESS) { eprintf("cannot find VDI object\n"); @@ -575,9 +564,8 @@ out: return 1; } -static uint64_t get_vdi_root(struct sd_vnode *entries, - int nr_vnodes, int nr_zones, uint32_t epoch, - uint32_t vid) +static uint64_t get_vdi_root(struct vnode_info *vnode_info, uint32_t epoch, + uint32_t vid) { int ret; struct sheepdog_inode *inode = NULL; @@ -589,8 +577,7 @@ static uint64_t get_vdi_root(struct sd_v goto out; } next: - ret = read_object(entries, nr_vnodes, nr_zones, epoch, - vid_to_vdi_oid(vid), (char *)inode, + ret = read_object(vnode_info, epoch, vid_to_vdi_oid(vid), (char *)inode, SD_INODE_HEADER_SIZE, 0, sys->nr_sobjs); if (vid == inode->vdi_id && inode->snap_id == 1 @@ -622,22 +609,18 @@ out: int start_deletion(uint32_t vid, uint32_t epoch) { struct deletion_work *dw = NULL; - struct sd_vnode *entries = NULL; - int nr_vnodes, nr_zones, ret; + struct vnode_info *vnode_info = NULL; + int ret = SD_RES_NO_MEM; uint32_t root_vid; dw = zalloc(sizeof(struct deletion_work)); - if (!dw) { - ret = SD_RES_NO_MEM; + if (!dw) goto err; - } /* buf is to store vdi id of every object */ dw->buf = zalloc(SD_INODE_SIZE - SD_INODE_HEADER_SIZE); - if (!dw->buf) { - ret = SD_RES_NO_MEM; + if (!dw->buf) goto err; - } dw->count = 0; dw->vid = vid; @@ -646,21 +629,15 @@ int start_deletion(uint32_t vid, uint32_ dw->work.fn = delete_one; dw->work.done = delete_one_done; - ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones); - if (ret != SD_RES_SUCCESS) - goto err; - - memcpy(dw->entries, entries, nr_vnodes * sizeof(struct sd_vnode)); - dw->nr_vnodes = nr_vnodes; - dw->nr_zones = nr_zones; + dw->vnodes = get_vnode_info(); - root_vid = get_vdi_root(entries, nr_vnodes, nr_zones, dw->epoch, dw->vid); + root_vid = get_vdi_root(dw->vnodes, dw->epoch, dw->vid); if (!root_vid) { ret = SD_RES_EIO; goto err; } - ret = fill_vdi_list(dw, entries, nr_vnodes, nr_zones, root_vid); + ret = fill_vdi_list(dw, root_vid); if (ret) { dprintf("snapshot chain has valid vdi, " "just mark vdi %" PRIx32 " as deleted.\n", dw->vid); @@ -681,11 +658,10 @@ int start_deletion(uint32_t vid, uint32_ list_add_tail(&dw->dw_siblings, &deletion_work_list); queue_work(sys->deletion_wqueue, &dw->work); out: - free_ordered_sd_vnode_list(entries); - + put_vnode_info(vnode_info); return SD_RES_SUCCESS; err: - free_ordered_sd_vnode_list(entries); + put_vnode_info(vnode_info); if (dw) free(dw->buf); free(dw); @@ -697,17 +673,15 @@ int get_vdi_attr(uint32_t epoch, struct uint32_t vid, uint32_t *attrid, int copies, uint64_t ctime, int wr, int excl, int delete) { - struct sd_vnode *entries = NULL; + struct vnode_info *vnode_info = NULL; struct sheepdog_vdi_attr tmp_attr; uint64_t oid, hval; uint32_t end; - int ret, nr_zones, nr_vnodes; + int ret; vattr->ctime = ctime; - ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones); - if (ret != SD_RES_SUCCESS) - goto out; + vnode_info = get_vnode_info(); /* we cannot include value_len for calculating the hash value */ hval = fnv_64a_buf(vattr->name, sizeof(vattr->name), FNV1A_64_INIT); @@ -719,21 +693,21 @@ int get_vdi_attr(uint32_t epoch, struct end = *attrid - 1; while (*attrid != end) { oid = vid_to_attr_oid(vid, *attrid); - ret = read_object(entries, nr_vnodes, nr_zones, epoch, oid, (char *)&tmp_attr, + ret = read_object(vnode_info, epoch, oid, (char *)&tmp_attr, sizeof(tmp_attr), 0, copies); if (ret == SD_RES_NO_OBJ && wr) { - ret = write_object(entries, nr_vnodes, nr_zones, epoch, oid, + ret = write_object(vnode_info, epoch, oid, (char *)vattr, data_len, 0, 0, copies, 1); if (ret) ret = SD_RES_EIO; else ret = SD_RES_SUCCESS; - goto out; + goto out_put_vnode_info; } if (ret != SD_RES_SUCCESS) - return ret; + goto out_put_vnode_info; /* compare attribute header */ if (strcmp(tmp_attr.name, vattr->name) == 0 && @@ -743,7 +717,7 @@ int get_vdi_attr(uint32_t epoch, struct if (excl) ret = SD_RES_VDI_EXIST; else if (delete) { - ret = write_object(entries, nr_vnodes, nr_zones, + ret = write_object(vnode_info, epoch, oid, (char *)"", 1, offsetof(struct sheepdog_vdi_attr, name), 0, copies, 0); @@ -752,7 +726,7 @@ int get_vdi_attr(uint32_t epoch, struct else ret = SD_RES_SUCCESS; } else if (wr) { - ret = write_object(entries, nr_vnodes, nr_zones, + ret = write_object(vnode_info, epoch, oid, (char *)vattr, SD_ATTR_OBJ_SIZE, 0, 0, copies, 0); @@ -762,7 +736,7 @@ int get_vdi_attr(uint32_t epoch, struct ret = SD_RES_SUCCESS; } else ret = SD_RES_SUCCESS; - goto out; + goto out_put_vnode_info; } (*attrid)++; @@ -770,8 +744,8 @@ int get_vdi_attr(uint32_t epoch, struct dprintf("there is no space for new VDIs\n"); ret = SD_RES_FULL_VDI; -out: - free_ordered_sd_vnode_list(entries); +out_put_vnode_info: + put_vnode_info(vnode_info); return ret; } Index: sheepdog/sheep/farm/trunk.c =================================================================== --- sheepdog.orig/sheep/farm/trunk.c 2012-04-27 15:26:15.100043955 +0200 +++ sheepdog/sheep/farm/trunk.c 2012-04-27 15:27:42.132046180 +0200 @@ -236,18 +236,24 @@ static unsigned char *omap_tree_insert(u static int oid_stale(uint64_t oid) { int i, vidx, copies; - struct sd_vnode *vnodes = sys->vnodes; + struct vnode_info *vnodes = get_vnode_info(); + int ret = 1; copies = sys->nr_sobjs; - if (copies > sys->nr_zones) - copies = sys->nr_zones; + if (copies > vnodes->nr_zones) + copies = vnodes->nr_zones; for (i = 0; i < copies; i++) { - vidx = obj_to_sheep(vnodes, sys->nr_vnodes, oid, i); - if (is_myself(vnodes[vidx].addr, vnodes[vidx].port)) - return 0; + vidx = obj_to_sheep(vnodes->entries, vnodes->nr_vnodes, oid, i); + if (is_myself(vnodes->entries[vidx].addr, + vnodes->entries[vidx].port)) { + ret = 0; + break; + } } - return 1; + + put_vnode_info(vnodes); + return ret; } int trunk_file_write_recovery(unsigned char *outsha1)