Hi Narendra, At Tue, 21 Dec 2010 00:10:37 +0530, Narendra Prasad Madanapalli wrote: > > [1 <multipart/alternative (7bit)>] > [1.1 <text/plain; ISO-8859-1 (7bit)>] > This patch adds the feature of atomicity while performing the operations > such > as vdi object and checksum updates. > > With the help of the journalling API, implemented the task of updating > vdi object & checksum atomically in store_queue_request_local() for the > operations SD_OP_WRITE_OBJ & SD_OP_CREATE_AND_WRITE_OBJ. > > > Signed-off-by: Narendra <narendramind at gmail.com> > > diff --git a/sheep/sheep.c b/sheep/sheep.c > index dc9a320..d6c776d 100644 > --- a/sheep/sheep.c > +++ b/sheep/sheep.c > @@ -112,6 +112,8 @@ int main(int argc, char **argv) > if (is_daemon && daemon(0, 0)) > exit(1); > > + jrnl_recover(); > + > ret = init_event(EPOLL_SIZE); > if (ret) > exit(1); > diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h > index c66baf4..80c75b3 100644 > --- a/sheep/sheep_priv.h > +++ b/sheep/sheep_priv.h > @@ -128,6 +128,7 @@ struct cluster_info { > > extern struct cluster_info *sys; > > + > int create_listen_port(int port, void *data); > > int is_io_request(unsigned op); > @@ -190,6 +191,81 @@ int remove_object(struct sheepdog_node_list_entry *e, > int nodes, uint32_t node_version, > uint64_t oid, int nr); > > +/* Journal */ > +typedef uint32_t end_mark_t; > +typedef uint32_t jrnl_type_t; You shouldn't use typedef here (and some other places). Please follow the Linux kernel coding style. http://www.kernel.org/doc/Documentation/CodingStyle > + > +#define JRNL_TYPE_VDI 0 > +#define JRNL_TYPE_CKSUM 1 > +#define JRNL_MAX_TYPES 2 Sorry, I couldn't understand why JRNL_TYPE_CKSUM is required. I think we need to care about only vdi objects to keep journaling codes simple. > + > +#define SET_END_MARK 1UL > +#define UNSET_END_MARK 0UL > +#define IS_END_MARK_SET(var) (var == 1UL) > + > + > +/* Different Journal headers */ > +typedef struct jrnl_cksum_head { > + jrnl_type_t jh_type; > + uint64_t jh_size; > +} jrnl_cksum_head_t; > + > +typedef struct jrnl_vdi_head { > + jrnl_type_t jh_type; > + uint64_t jh_offset; > + uint64_t jh_size; > +} jrnl_vdi_head_t; > + > +typedef struct jrnl_cksum_data { > + char aname_cksum[64]; > + uint64_t aval_cksum; > +} jrnl_cksum_data_t; > + > +typedef struct jrnl_file_desc { > + uint32_t jf_epoch; /* epoch */ > + uint64_t jf_oid; /* Object id */ > + int jf_fd; /* Open file fd */ > + int jf_target_fd; > +} jrnl_file_desc_t; > + > +typedef struct jrnl_descriptor { > + void *jd_head; > + void *jd_data; > + int jd_end_mark; > + jrnl_file_desc_t jd_jfd; > +#define jdf_epoch jd_jfd.jf_epoch > +#define jdf_oid jd_jfd.jf_oid > +#define jdf_fd jd_jfd.jf_fd > +#define jdf_target_fd jd_jfd.jf_target_fd > +} jrnl_desc_t; > + > +typedef struct jrnl_handler { > + int (*has_end_mark)(jrnl_desc_t *jd); > + int (*write_header)(jrnl_desc_t *jd); > + int (*write_data)(jrnl_desc_t *jd); > + int (*write_end_mark)(jrnl_desc_t *jd); > + int (*apply_to_target_object)(jrnl_file_desc_t *jfd); > + int (*commit_data)(jrnl_desc_t *jd); > +} jrnl_handler_t; > + > +inline jrnl_type_t jrnl_get_type(jrnl_desc_t *jd); > +int jrnl_get_type_from_file(jrnl_file_desc_t *jfd, jrnl_type_t > *jrnl_type); > +int jrnl_exists(jrnl_file_desc_t *jfd); > +int jrnl_update_epoch_store(uint32_t epoch); > +int jrnl_open(jrnl_file_desc_t *jfd, int aflags); > +int jrnl_create(jrnl_file_desc_t *jfd); > +int jrnl_remove(jrnl_file_desc_t *jfd); > +inline int jrnl_close(jrnl_file_desc_t *jfd); > + > +inline int jrnl_has_end_mark(jrnl_desc_t *jd); > +inline int jrnl_write_header(jrnl_desc_t *jd); > +inline int jrnl_write_data(jrnl_desc_t *jd); > +inline int jrnl_write_end_mark(jrnl_desc_t *jd); > +inline int jrnl_apply_to_targe_object(jrnl_file_desc_t *jfd); > +inline int jrnl_commit_data(jrnl_desc_t *jd); > +int jrnl_perform(jrnl_desc_t *jd); > +int jrnl_recover(void); > + > static inline int is_myself(struct sheepdog_node_list_entry *e) > { > return e->id == sys->this_node.id; > diff --git a/sheep/store.c b/sheep/store.c > index a4d6155..0eac4d2 100644 > --- a/sheep/store.c > +++ b/sheep/store.c > @@ -32,10 +32,45 @@ > static char *obj_path; > static char *epoch_path; > static char *mnt_path; > +static char *jrnl_path; > > static mode_t def_dmode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | > S_IXGRP; This patch is corrupted. Please send me a clean patch next time. > static mode_t def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; > > +/* Journal internal data structures */ > +static int jrnl_vdi_has_end_mark(jrnl_desc_t *jd); > +static int jrnl_vdi_write_header(jrnl_desc_t *jd); > +static int jrnl_vdi_write_data(jrnl_desc_t *jd); > +static int jrnl_vdi_write_end_mark(jrnl_desc_t *jd); > +static int jrnl_vdi_apply_to_target_object(jrnl_file_desc_t *jfd); > +static int jrnl_vdi_commit_data(jrnl_desc_t *jd); > + > +static int jrnl_cksum_has_end_mark(jrnl_desc_t *jd); > +static int jrnl_cksum_write_header(jrnl_desc_t *jd); > +static int jrnl_cksum_write_data(jrnl_desc_t *jd); > +static int jrnl_cksum_write_end_mark(jrnl_desc_t *jd); > +static int jrnl_cksum_apply_to_target_object(jrnl_file_desc_t *jfd); > +static int jrnl_cksum_commit_data(jrnl_desc_t *jd); > + > +static jrnl_handler_t jrnl_handlers[JRNL_MAX_TYPES] = { > + { > + .has_end_mark = jrnl_vdi_has_end_mark, > + .write_header = jrnl_vdi_write_header, > + .write_data = jrnl_vdi_write_data, > + .write_end_mark = jrnl_vdi_write_end_mark, > + .apply_to_target_object = jrnl_vdi_apply_to_target_object, > + .commit_data = jrnl_vdi_commit_data > + }, > + { > + .has_end_mark = jrnl_cksum_has_end_mark, > + .write_header = jrnl_cksum_write_header, > + .write_data = jrnl_cksum_write_data, > + .write_end_mark = jrnl_cksum_write_end_mark, > + .apply_to_target_object = jrnl_cksum_apply_to_target_object, > + .commit_data = jrnl_cksum_commit_data > + } > +}; > + > static int obj_cmp(const void *oid1, const void *oid2) > { > const uint64_t hval1 = fnv_64a_buf((void *)oid1, sizeof(uint64_t), > FNV1A_64_INIT); > @@ -98,7 +133,8 @@ static int is_obj_in_range(uint64_t oid, uint64_t start, > uint64_t end) > return (start < hval || hval <= end); > } > > -static int verify_object(int fd, char *buf, size_t len, int set_chksum) > +/* When set_cksum is set, epoch & oid are required to perform journalling. > */ > +static int verify_object(int fd, char *buf, size_t len, int set_chksum, > uint32_t epoch, uint64_t oid) > { > int ret; > uint64_t checksum; > @@ -128,11 +164,30 @@ static int verify_object(int fd, char *buf, size_t > len, int set_chksum) > } > > if (set_chksum) { > - checksum = fnv_64a_buf(buf, len, FNV1A_64_INIT); > - ret = fsetxattr(fd, ANAME_CHECKSUM, &checksum, sizeof(checksum), > 0); > - if (ret < 0) { > - eprintf("failed to set xattr, %m\n"); > - goto err; > + if (epoch && oid) { > + jrnl_desc_t jd; > + jrnl_cksum_head_t head; > + jrnl_cksum_data_t data; > + > + head.jh_type = JRNL_TYPE_CKSUM; > + head.jh_size = sizeof(data); > + strcpy(data.aname_cksum, ANAME_CHECKSUM); > + data.aval_cksum = fnv_64a_buf(buf, len, FNV1A_64_INIT); > + jd.jd_head = &head; > + jd.jd_data = &data; > + jd.jdf_epoch = epoch; > + jd.jdf_oid = oid; > + jd.jdf_target_fd = fd; > + ret = jrnl_perform(&jd); > + if (ret) > + goto err; > + } else { > + checksum = fnv_64a_buf(buf, len, FNV1A_64_INIT); > + ret = fsetxattr(fd, ANAME_CHECKSUM, &checksum, > sizeof(checksum), 0); > + if (ret < 0) { > + eprintf("failed to set xattr, %m\n"); > + goto err; > + } > } > } else { > ret = fgetxattr(fd, ANAME_CHECKSUM, &checksum, sizeof(checksum)); > @@ -199,7 +254,7 @@ static int get_obj_list(struct request *req) > } > obj_nr = read(fd, buf, buf_len); > > - ret = verify_object(fd, buf, obj_nr, 0); > + ret = verify_object(fd, buf, obj_nr, 0, 0, 0); > if (ret < 0) { > eprintf("verification failed, %s, %m\n", path); > close(fd); > @@ -633,6 +688,8 @@ static int store_queue_request_local(struct request > *req, uint32_t epoch) > uint64_t oid = hdr->oid; > uint32_t opcode = hdr->opcode; > char path[1024], *buf; > + jrnl_desc_t jd; > + jrnl_vdi_head_t jh; > > switch (opcode) { > case SD_OP_CREATE_AND_WRITE_OBJ: > @@ -766,24 +823,31 @@ static int store_queue_request_local(struct request > *req, uint32_t epoch) > /* } */ > } > /* fall through */ > - case SD_OP_CREATE_AND_WRITE_OBJ: > - ret = pwrite64(fd, req->data, hdr->data_length, hdr->offset); > - if (ret != hdr->data_length) { > - if (errno == ENOSPC) > - ret = SD_RES_NO_SPACE; > - else > - ret = SD_RES_EIO; > + case SD_OP_CREATE_AND_WRITE_OBJ: > + > + jd.jdf_epoch = epoch; > + jd.jdf_oid = oid; > + jd.jdf_target_fd = fd; > + > + jh.jh_type = JRNL_TYPE_VDI; > + jh.jh_offset = hdr->offset; > + jh.jh_size = hdr->data_length; > + > + jd.jd_head = &jh; > + jd.jd_data = req->data; > + jd.jd_end_mark = SET_END_MARK; > + > + ret = jrnl_perform(&jd); Journaling support is necessary for only vdi objects. This seems to be using journaling for data objects too, doesn't it? > + if (ret) > goto out; > - } > > if (!is_data_obj(oid)) { > - /* FIXME: need to update atomically */ > -/* ret = verify_object(fd, NULL, 0, 1); */ > -/* if (ret < 0) { */ > -/* eprintf("failed to set checksum, %"PRIx64"\n", oid); */ > -/* ret = SD_RES_EIO; */ > -/* goto out; */ > -/* } */ > + ret = verify_object(fd, NULL, 0, 1, epoch, oid); > + if (ret < 0) { > + eprintf("failed to set checksum, %"PRIx64"\n", oid); > + ret = SD_RES_EIO; > + goto out; > + } We don't need checksum here if we use journaling, no? I think verify_object() is a bit confusing to understand sheepdog codes. I think of removing the function. Thanks, Kazutaka |