[Sheepdog] [PATCH] Journal support for atomic operations
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Sat Dec 25 08:07:12 CET 2010
Hi Narendra,
At Tue, 21 Dec 2010 00:10:37 +0530,
Narendra Prasad Madanapalli wrote:
>
> [1 <multipart/alternative (7bit)>]
> [1.1 <text/plain; ISO-8859-1 (7bit)>]
> This patch adds the feature of atomicity while performing the operations
> such
> as vdi object and checksum updates.
>
> With the help of the journalling API, implemented the task of updating
> vdi object & checksum atomically in store_queue_request_local() for the
> operations SD_OP_WRITE_OBJ & SD_OP_CREATE_AND_WRITE_OBJ.
>
>
> Signed-off-by: Narendra <narendramind at gmail.com>
>
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index dc9a320..d6c776d 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -112,6 +112,8 @@ int main(int argc, char **argv)
> if (is_daemon && daemon(0, 0))
> exit(1);
>
> + jrnl_recover();
> +
> ret = init_event(EPOLL_SIZE);
> if (ret)
> exit(1);
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index c66baf4..80c75b3 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -128,6 +128,7 @@ struct cluster_info {
>
> extern struct cluster_info *sys;
>
> +
> int create_listen_port(int port, void *data);
>
> int is_io_request(unsigned op);
> @@ -190,6 +191,81 @@ int remove_object(struct sheepdog_node_list_entry *e,
> int nodes, uint32_t node_version,
> uint64_t oid, int nr);
>
> +/* Journal */
> +typedef uint32_t end_mark_t;
> +typedef uint32_t jrnl_type_t;
You shouldn't use typedef here (and some other places).
Please follow the Linux kernel coding style.
http://www.kernel.org/doc/Documentation/CodingStyle
> +
> +#define JRNL_TYPE_VDI 0
> +#define JRNL_TYPE_CKSUM 1
> +#define JRNL_MAX_TYPES 2
Sorry, I couldn't understand why JRNL_TYPE_CKSUM is required. I think
we need to care about only vdi objects to keep journaling codes
simple.
> +
> +#define SET_END_MARK 1UL
> +#define UNSET_END_MARK 0UL
> +#define IS_END_MARK_SET(var) (var == 1UL)
> +
> +
> +/* Different Journal headers */
> +typedef struct jrnl_cksum_head {
> + jrnl_type_t jh_type;
> + uint64_t jh_size;
> +} jrnl_cksum_head_t;
> +
> +typedef struct jrnl_vdi_head {
> + jrnl_type_t jh_type;
> + uint64_t jh_offset;
> + uint64_t jh_size;
> +} jrnl_vdi_head_t;
> +
> +typedef struct jrnl_cksum_data {
> + char aname_cksum[64];
> + uint64_t aval_cksum;
> +} jrnl_cksum_data_t;
> +
> +typedef struct jrnl_file_desc {
> + uint32_t jf_epoch; /* epoch */
> + uint64_t jf_oid; /* Object id */
> + int jf_fd; /* Open file fd */
> + int jf_target_fd;
> +} jrnl_file_desc_t;
> +
> +typedef struct jrnl_descriptor {
> + void *jd_head;
> + void *jd_data;
> + int jd_end_mark;
> + jrnl_file_desc_t jd_jfd;
> +#define jdf_epoch jd_jfd.jf_epoch
> +#define jdf_oid jd_jfd.jf_oid
> +#define jdf_fd jd_jfd.jf_fd
> +#define jdf_target_fd jd_jfd.jf_target_fd
> +} jrnl_desc_t;
> +
> +typedef struct jrnl_handler {
> + int (*has_end_mark)(jrnl_desc_t *jd);
> + int (*write_header)(jrnl_desc_t *jd);
> + int (*write_data)(jrnl_desc_t *jd);
> + int (*write_end_mark)(jrnl_desc_t *jd);
> + int (*apply_to_target_object)(jrnl_file_desc_t *jfd);
> + int (*commit_data)(jrnl_desc_t *jd);
> +} jrnl_handler_t;
> +
> +inline jrnl_type_t jrnl_get_type(jrnl_desc_t *jd);
> +int jrnl_get_type_from_file(jrnl_file_desc_t *jfd, jrnl_type_t
> *jrnl_type);
> +int jrnl_exists(jrnl_file_desc_t *jfd);
> +int jrnl_update_epoch_store(uint32_t epoch);
> +int jrnl_open(jrnl_file_desc_t *jfd, int aflags);
> +int jrnl_create(jrnl_file_desc_t *jfd);
> +int jrnl_remove(jrnl_file_desc_t *jfd);
> +inline int jrnl_close(jrnl_file_desc_t *jfd);
> +
> +inline int jrnl_has_end_mark(jrnl_desc_t *jd);
> +inline int jrnl_write_header(jrnl_desc_t *jd);
> +inline int jrnl_write_data(jrnl_desc_t *jd);
> +inline int jrnl_write_end_mark(jrnl_desc_t *jd);
> +inline int jrnl_apply_to_targe_object(jrnl_file_desc_t *jfd);
> +inline int jrnl_commit_data(jrnl_desc_t *jd);
> +int jrnl_perform(jrnl_desc_t *jd);
> +int jrnl_recover(void);
> +
> static inline int is_myself(struct sheepdog_node_list_entry *e)
> {
> return e->id == sys->this_node.id;
> diff --git a/sheep/store.c b/sheep/store.c
> index a4d6155..0eac4d2 100644
> --- a/sheep/store.c
> +++ b/sheep/store.c
> @@ -32,10 +32,45 @@
> static char *obj_path;
> static char *epoch_path;
> static char *mnt_path;
> +static char *jrnl_path;
>
> static mode_t def_dmode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP |
> S_IXGRP;
This patch is corrupted. Please send me a clean patch next time.
> static mode_t def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
>
> +/* Journal internal data structures */
> +static int jrnl_vdi_has_end_mark(jrnl_desc_t *jd);
> +static int jrnl_vdi_write_header(jrnl_desc_t *jd);
> +static int jrnl_vdi_write_data(jrnl_desc_t *jd);
> +static int jrnl_vdi_write_end_mark(jrnl_desc_t *jd);
> +static int jrnl_vdi_apply_to_target_object(jrnl_file_desc_t *jfd);
> +static int jrnl_vdi_commit_data(jrnl_desc_t *jd);
> +
> +static int jrnl_cksum_has_end_mark(jrnl_desc_t *jd);
> +static int jrnl_cksum_write_header(jrnl_desc_t *jd);
> +static int jrnl_cksum_write_data(jrnl_desc_t *jd);
> +static int jrnl_cksum_write_end_mark(jrnl_desc_t *jd);
> +static int jrnl_cksum_apply_to_target_object(jrnl_file_desc_t *jfd);
> +static int jrnl_cksum_commit_data(jrnl_desc_t *jd);
> +
> +static jrnl_handler_t jrnl_handlers[JRNL_MAX_TYPES] = {
> + {
> + .has_end_mark = jrnl_vdi_has_end_mark,
> + .write_header = jrnl_vdi_write_header,
> + .write_data = jrnl_vdi_write_data,
> + .write_end_mark = jrnl_vdi_write_end_mark,
> + .apply_to_target_object = jrnl_vdi_apply_to_target_object,
> + .commit_data = jrnl_vdi_commit_data
> + },
> + {
> + .has_end_mark = jrnl_cksum_has_end_mark,
> + .write_header = jrnl_cksum_write_header,
> + .write_data = jrnl_cksum_write_data,
> + .write_end_mark = jrnl_cksum_write_end_mark,
> + .apply_to_target_object = jrnl_cksum_apply_to_target_object,
> + .commit_data = jrnl_cksum_commit_data
> + }
> +};
> +
> static int obj_cmp(const void *oid1, const void *oid2)
> {
> const uint64_t hval1 = fnv_64a_buf((void *)oid1, sizeof(uint64_t),
> FNV1A_64_INIT);
> @@ -98,7 +133,8 @@ static int is_obj_in_range(uint64_t oid, uint64_t start,
> uint64_t end)
> return (start < hval || hval <= end);
> }
>
> -static int verify_object(int fd, char *buf, size_t len, int set_chksum)
> +/* When set_cksum is set, epoch & oid are required to perform journalling.
> */
> +static int verify_object(int fd, char *buf, size_t len, int set_chksum,
> uint32_t epoch, uint64_t oid)
> {
> int ret;
> uint64_t checksum;
> @@ -128,11 +164,30 @@ static int verify_object(int fd, char *buf, size_t
> len, int set_chksum)
> }
>
> if (set_chksum) {
> - checksum = fnv_64a_buf(buf, len, FNV1A_64_INIT);
> - ret = fsetxattr(fd, ANAME_CHECKSUM, &checksum, sizeof(checksum),
> 0);
> - if (ret < 0) {
> - eprintf("failed to set xattr, %m\n");
> - goto err;
> + if (epoch && oid) {
> + jrnl_desc_t jd;
> + jrnl_cksum_head_t head;
> + jrnl_cksum_data_t data;
> +
> + head.jh_type = JRNL_TYPE_CKSUM;
> + head.jh_size = sizeof(data);
> + strcpy(data.aname_cksum, ANAME_CHECKSUM);
> + data.aval_cksum = fnv_64a_buf(buf, len, FNV1A_64_INIT);
> + jd.jd_head = &head;
> + jd.jd_data = &data;
> + jd.jdf_epoch = epoch;
> + jd.jdf_oid = oid;
> + jd.jdf_target_fd = fd;
> + ret = jrnl_perform(&jd);
> + if (ret)
> + goto err;
> + } else {
> + checksum = fnv_64a_buf(buf, len, FNV1A_64_INIT);
> + ret = fsetxattr(fd, ANAME_CHECKSUM, &checksum,
> sizeof(checksum), 0);
> + if (ret < 0) {
> + eprintf("failed to set xattr, %m\n");
> + goto err;
> + }
> }
> } else {
> ret = fgetxattr(fd, ANAME_CHECKSUM, &checksum, sizeof(checksum));
> @@ -199,7 +254,7 @@ static int get_obj_list(struct request *req)
> }
> obj_nr = read(fd, buf, buf_len);
>
> - ret = verify_object(fd, buf, obj_nr, 0);
> + ret = verify_object(fd, buf, obj_nr, 0, 0, 0);
> if (ret < 0) {
> eprintf("verification failed, %s, %m\n", path);
> close(fd);
> @@ -633,6 +688,8 @@ static int store_queue_request_local(struct request
> *req, uint32_t epoch)
> uint64_t oid = hdr->oid;
> uint32_t opcode = hdr->opcode;
> char path[1024], *buf;
> + jrnl_desc_t jd;
> + jrnl_vdi_head_t jh;
>
> switch (opcode) {
> case SD_OP_CREATE_AND_WRITE_OBJ:
> @@ -766,24 +823,31 @@ static int store_queue_request_local(struct request
> *req, uint32_t epoch)
> /* } */
> }
> /* fall through */
> - case SD_OP_CREATE_AND_WRITE_OBJ:
> - ret = pwrite64(fd, req->data, hdr->data_length, hdr->offset);
> - if (ret != hdr->data_length) {
> - if (errno == ENOSPC)
> - ret = SD_RES_NO_SPACE;
> - else
> - ret = SD_RES_EIO;
> + case SD_OP_CREATE_AND_WRITE_OBJ:
> +
> + jd.jdf_epoch = epoch;
> + jd.jdf_oid = oid;
> + jd.jdf_target_fd = fd;
> +
> + jh.jh_type = JRNL_TYPE_VDI;
> + jh.jh_offset = hdr->offset;
> + jh.jh_size = hdr->data_length;
> +
> + jd.jd_head = &jh;
> + jd.jd_data = req->data;
> + jd.jd_end_mark = SET_END_MARK;
> +
> + ret = jrnl_perform(&jd);
Journaling support is necessary for only vdi objects. This seems to
be using journaling for data objects too, doesn't it?
> + if (ret)
> goto out;
> - }
>
> if (!is_data_obj(oid)) {
> - /* FIXME: need to update atomically */
> -/* ret = verify_object(fd, NULL, 0, 1); */
> -/* if (ret < 0) { */
> -/* eprintf("failed to set checksum, %"PRIx64"\n", oid); */
> -/* ret = SD_RES_EIO; */
> -/* goto out; */
> -/* } */
> + ret = verify_object(fd, NULL, 0, 1, epoch, oid);
> + if (ret < 0) {
> + eprintf("failed to set checksum, %"PRIx64"\n", oid);
> + ret = SD_RES_EIO;
> + goto out;
> + }
We don't need checksum here if we use journaling, no?
I think verify_object() is a bit confusing to understand sheepdog
codes. I think of removing the function.
Thanks,
Kazutaka
More information about the sheepdog
mailing list