[Sheepdog] [PATCH v5] remove oids from object list cache when deleting a vdi
Liu Yuan
namei.unix at gmail.com
Sat May 5 06:59:54 CEST 2012
On 05/04/2012 02:49 PM, levin li wrote:
> Cluster recovery may cause objects migrated from one node
> to another, but the object list cache doesn't change, when
> deleting an object we can not find the right node in whose
> cache the id stays, so we need to notify the deletion object
> list to all the node to make them delete the specified
> object id from object list cache.
>
> I reused the deletion_work_list and deletion_wqueue for the
> object list cleanup work, so that we save some memory by not
> starting a new work queue.
>
> The del_vdi is called in worker thread, but it's locked by the
> notification mechanism, other operations about deletion_work_list
> and deletion_wqueue are done in main thread, so there's no need
> to concern about the race condition.
>
> Signed-off-by: levin li <xingke.lwp at taobao.com>
> ---
> include/sheep.h | 1 +
> sheep/group.c | 1 +
> sheep/object_list_cache.c | 17 +++++++++
> sheep/ops.c | 57 ++++++++++++++++++++++++++++++
> sheep/sheep_priv.h | 21 ++++++++++++
> sheep/vdi.c | 84 ++++++++++++++++++++++++++++++---------------
> 6 files changed, 154 insertions(+), 27 deletions(-)
>
> diff --git a/include/sheep.h b/include/sheep.h
> index 7e287c4..e941dc1 100644
> --- a/include/sheep.h
> +++ b/include/sheep.h
> @@ -46,6 +46,7 @@
> #define SD_OP_TRACE 0x95
> #define SD_OP_TRACE_CAT 0x96
> #define SD_OP_STAT_RECOVERY 0x97
> +#define SD_OP_NOTIFY_VDI_DEL 0x98
>
> #define SD_FLAG_CMD_IO_LOCAL 0x0010
> #define SD_FLAG_CMD_RECOVERY 0x0020
> diff --git a/sheep/group.c b/sheep/group.c
> index c7fd387..1f1aa8d 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -1314,6 +1314,7 @@ int create_cluster(int port, int64_t zone, int nr_vnodes)
> INIT_LIST_HEAD(&sys->req_wait_for_obj_list);
> INIT_LIST_HEAD(&sys->consistent_obj_list);
> INIT_LIST_HEAD(&sys->blocking_conn_list);
> + INIT_LIST_HEAD(&sys->deletion_work_list);
>
> INIT_LIST_HEAD(&sys->request_queue);
> INIT_LIST_HEAD(&sys->event_queue);
> diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
> index 28cdbbc..0aa93e5 100644
> --- a/sheep/object_list_cache.c
> +++ b/sheep/object_list_cache.c
> @@ -149,3 +149,20 @@ int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *d
>
> return res;
> }
> +
> +int del_vdi_from_objlist_cache(uint64_t *objs, int count)
> +{
> + int i;
> +
> + dprintf("%d\n", count);
> + for (i = 0; i < count; i++) {
> + pthread_rwlock_wrlock(&obj_list_cache.lock);
> + if (!objlist_cache_rb_remove(&obj_list_cache.root, objs[i])) {
> + dprintf("remove oid %" PRIx64 " from objlist cache\n", objs[i]);
> + obj_list_cache.cache_size--;
> + }
> + pthread_rwlock_unlock(&obj_list_cache.lock);
> + }
> +
> + return 0;
> +}
> diff --git a/sheep/ops.c b/sheep/ops.c
> index b6f8eb2..5763b6f 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -550,6 +550,57 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
> return ret;
> }
>
> +static void objlist_cache_del_vdi_work(struct work *work)
> +{
> + struct deletion_work *dw = container_of(work, struct deletion_work, work);
> +
> + del_vdi_from_objlist_cache(dw->deleted_objs, dw->deleted_count);
> +}
> +
> +static void objlist_cache_del_vdi_done(struct work *work)
> +{
> + struct deletion_work *dw = container_of(work, struct deletion_work, work);
> +
> + list_del(&dw->dw_siblings);
> + free(dw);
> +
> + if (!list_empty(&sys->deletion_work_list)) {
> + dw = list_first_entry(&sys->deletion_work_list,
> + struct deletion_work, dw_siblings);
> +
> + queue_work(sys->deletion_wqueue, &dw->work);
> + }
> +}
> +
> +static int cluster_notify_vdi_deletion(const struct sd_req *req, struct sd_rsp *rsp,
> + void *data)
> +{
> + int count = req->data_length / sizeof(uint64_t);
> + struct deletion_work *dw = NULL;
> +
> + dw = zalloc(sizeof(*dw));
> + if (!dw) {
> + eprintf("no memory to allocate\n");
> + return SD_RES_NO_MEM;
> + }
> +
> + dw->deleted_objs = data;
> + dw->deleted_count = count;
> +
> + dw->work.fn = objlist_cache_del_vdi_work;
> + dw->work.done = objlist_cache_del_vdi_done;
> +
> + if (!list_empty(&sys->deletion_work_list)) {
> + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list);
> + return SD_RES_SUCCESS;
> + }
> +
> + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list);
> + queue_work(sys->deletion_wqueue, &dw->work);
> +
> + return SD_RES_SUCCESS;
> +}
> +
> static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
> void *data)
> {
> @@ -945,6 +996,12 @@ static struct sd_op_template sd_ops[] = {
> .process_main = cluster_cleanup,
> },
>
> + [SD_OP_NOTIFY_VDI_DEL] = {
> + .type = SD_OP_TYPE_CLUSTER,
> + .force = 1,
> + .process_main = cluster_notify_vdi_deletion,
> + },
> +
> /* local operations */
> [SD_OP_GET_STORE_LIST] = {
> .type = SD_OP_TYPE_LOCAL,
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 2275a93..427db03 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -135,6 +135,7 @@ struct cluster_info {
> struct list_head req_wait_for_obj_list;
> struct list_head consistent_obj_list;
> struct list_head blocking_conn_list;
> + struct list_head deletion_work_list;
>
> int nr_copies;
>
> @@ -212,6 +213,25 @@ struct objlist_cache {
> pthread_rwlock_t lock;
> };
>
> +struct deletion_work {
> + uint32_t done;
> + uint32_t epoch;
> +
> + struct work work;
> + struct list_head dw_siblings;
> +
> + uint32_t vid;
> +
> + int count;
> + uint32_t *buf;
> +
> + uint64_t *deleted_objs;
> + uint32_t deleted_count;
> +
> + struct vnode_info *vnodes;
> + int delete_error;
> +};
> +
> extern struct cluster_info *sys;
> extern struct store_driver *sd_store;
> extern char *obj_path;
> @@ -313,6 +333,7 @@ int prealloc(int fd, uint32_t size);
> int init_objlist_cache(void);
> int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid);
> int check_and_insert_objlist_cache(uint64_t oid);
> +int del_vdi_from_objlist_cache(uint64_t *objs, int count);
>
> /* Operations */
>
> diff --git a/sheep/vdi.c b/sheep/vdi.c
> index 5b96dbb..609066e 100644
> --- a/sheep/vdi.c
> +++ b/sheep/vdi.c
> @@ -10,6 +10,7 @@
> */
> #include <stdio.h>
> #include <stdlib.h>
> +#include <unistd.h>
> #include <sys/time.h>
>
> #include "sheepdog_proto.h"
> @@ -373,24 +374,6 @@ int read_vdis(char *data, int len, unsigned int *rsp_len)
> return SD_RES_SUCCESS;
> }
>
> -struct deletion_work {
> - uint32_t done;
> - uint32_t epoch;
> -
> - struct work work;
> - struct list_head dw_siblings;
> -
> - uint32_t vid;
> -
> - int count;
> - uint32_t *buf;
> -
> - struct vnode_info *vnodes;
> - int delete_error;
> -};
> -
> -static LIST_HEAD(deletion_work_list);
> -
> static int delete_inode(struct deletion_work *dw)
> {
> struct sheepdog_inode *inode = NULL;
> @@ -430,6 +413,39 @@ out:
> return ret;
> }
>
> +static int notify_deletion(uint64_t *oids, uint32_t count)
> +{
> + int fd, ret;
> + unsigned int wlen, rlen = 0;
> + struct sd_vdi_req hdr;
> + char host[128];
> +
> + addr_to_str(host, sizeof(host), sys->this_node.addr, 0);
> +
> + fd = connect_to(host, sys->this_node.port);
> + if (fd < 0) {
> + eprintf("connect to local node fail\n");
> + return -1;
> + }
> +
> + memset(&hdr, 0, sizeof(hdr));
> +
> + hdr.proto_ver = SD_PROTO_VER;
> + hdr.opcode = SD_OP_NOTIFY_VDI_DEL;
> + hdr.flags = SD_FLAG_CMD_WRITE;
> + hdr.data_length = sizeof(uint64_t) * count;
> + wlen = hdr.data_length;
> +
> + ret = exec_req(fd, (struct sd_req *)&hdr, oids, &wlen, &rlen);
> + close(fd);
> +
> + if (ret < 0) {
> + eprintf("send request fail\n");
> + return -1;
> + }
> +
> + return 0;
> +}
>
> static void delete_one(struct work *work)
> {
> @@ -447,6 +463,12 @@ static void delete_one(struct work *work)
> goto out;
> }
>
> + dw->deleted_objs = malloc(sizeof(uint64_t) * MAX_DATA_OBJS);
> + if (!dw->deleted_objs) {
> + eprintf("failed to allocate memory\n");
> + goto out;
> + }
> +
> nr_copies = get_nr_copies(dw->vnodes);
>
> ret = read_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id),
> @@ -459,25 +481,33 @@ static void delete_one(struct work *work)
> }
>
> for (i = 0; i < MAX_DATA_OBJS; i++) {
> + uint64_t oid;
> +
> if (!inode->data_vdi_id[i])
> continue;
>
> + oid = vid_to_data_oid(inode->data_vdi_id[i], i);
> +
> if (inode->data_vdi_id[i] != inode->vdi_id) {
> dprintf("object %" PRIx64 " is base's data, would not be deleted.\n",
> - vid_to_data_oid(inode->data_vdi_id[i], i));
> + oid);
> continue;
> }
>
> ret = remove_object(dw->vnodes, dw->epoch,
> - vid_to_data_oid(inode->data_vdi_id[i], i),
> - nr_copies);
> + oid, nr_copies);
>
> if (ret != SD_RES_SUCCESS)
> dw->delete_error = 1;
> - else
> + else {
> + dw->deleted_objs[dw->deleted_count++] = oid;
> inode->data_vdi_id[i] = 0;
> + }
> }
>
> + if (dw->deleted_count > 0)
> + notify_deletion(dw->deleted_objs, dw->deleted_count);
This name looks confusing, I guess you mean undeleted objects as in
dw->deleted_objs, dw->deleted_count.
> +
> if (dw->delete_error) {
> write_object(dw->vnodes, dw->epoch, vid_to_vdi_oid(vdi_id),
> (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0);
> @@ -505,8 +535,8 @@ static void delete_one_done(struct work *work)
> free(dw->buf);
> free(dw);
>
> - if (!list_empty(&deletion_work_list)) {
> - dw = list_first_entry(&deletion_work_list,
> + if (!list_empty(&sys->deletion_work_list)) {
> + dw = list_first_entry(&sys->deletion_work_list,
> struct deletion_work, dw_siblings);
>
> queue_work(sys->deletion_wqueue, &dw->work);
> @@ -647,12 +677,12 @@ int start_deletion(uint32_t vid, uint32_t epoch)
> if (dw->count == 0)
> goto out;
>
> - if (!list_empty(&deletion_work_list)) {
> - list_add_tail(&dw->dw_siblings, &deletion_work_list);
> + if (!list_empty(&sys->deletion_work_list)) {
> + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list);
> goto out;
> }
>
> - list_add_tail(&dw->dw_siblings, &deletion_work_list);
> + list_add_tail(&dw->dw_siblings, &sys->deletion_work_list);
> queue_work(sys->deletion_wqueue, &dw->work);
it looks redundant to have deletion_work_list here to get FIFO semantics.
We only have one worker for deletion_wqueue, isn't it FIFO already?
> out:
> return SD_RES_SUCCESS;
This code looks really confusing, I think you need rework the logic
first, take object recovery as an example, recovery_object just recovers
the targeted object once at a time. Your delete_one I guess serves the
same purpose as its name suggests, but you actually try to delete all
the objects in one go.
By the way, please run script/checkpatch.pl, there are some style problems.
Thanks,
Yuan
More information about the sheepdog
mailing list