[Sheepdog] [PATCH] Journal support for atomic operations

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Sat Dec 25 08:07:12 CET 2010


Hi Narendra,

At Tue, 21 Dec 2010 00:10:37 +0530,
Narendra Prasad Madanapalli wrote:
> 
> [1  <multipart/alternative (7bit)>]
> [1.1  <text/plain; ISO-8859-1 (7bit)>]
> This patch adds the feature of atomicity while performing the operations
> such
> as vdi object and checksum updates.
> 
> With the help of the journalling API, implemented the task of updating
> vdi object & checksum atomically in store_queue_request_local() for the
> operations SD_OP_WRITE_OBJ & SD_OP_CREATE_AND_WRITE_OBJ.
> 
> 
> Signed-off-by: Narendra <narendramind at gmail.com>
> 
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index dc9a320..d6c776d 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -112,6 +112,8 @@ int main(int argc, char **argv)
>      if (is_daemon && daemon(0, 0))
>          exit(1);
> 
> +    jrnl_recover();
> +
>      ret = init_event(EPOLL_SIZE);
>      if (ret)
>          exit(1);
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index c66baf4..80c75b3 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -128,6 +128,7 @@ struct cluster_info {
> 
>  extern struct cluster_info *sys;
> 
> +
>  int create_listen_port(int port, void *data);
> 
>  int is_io_request(unsigned op);
> @@ -190,6 +191,81 @@ int remove_object(struct sheepdog_node_list_entry *e,
>            int nodes, uint32_t node_version,
>            uint64_t oid, int nr);
> 
> +/* Journal */
> +typedef uint32_t end_mark_t;
> +typedef uint32_t jrnl_type_t;

You shouldn't use typedef here (and some other places).
Please follow the Linux kernel coding style.
  http://www.kernel.org/doc/Documentation/CodingStyle


> +
> +#define JRNL_TYPE_VDI        0
> +#define JRNL_TYPE_CKSUM      1
> +#define JRNL_MAX_TYPES       2

Sorry, I couldn't understand why JRNL_TYPE_CKSUM is required.  I think
we need to care about only vdi objects to keep journaling codes
simple.


> +
> +#define SET_END_MARK            1UL
> +#define UNSET_END_MARK            0UL
> +#define IS_END_MARK_SET(var)    (var == 1UL)
> +
> +
> +/* Different Journal headers */
> +typedef struct jrnl_cksum_head {
> +    jrnl_type_t  jh_type;
> +    uint64_t  jh_size;
> +}  jrnl_cksum_head_t;
> +
> +typedef struct jrnl_vdi_head {
> +    jrnl_type_t  jh_type;
> +    uint64_t  jh_offset;
> +    uint64_t  jh_size;
> +}  jrnl_vdi_head_t;
> +
> +typedef struct jrnl_cksum_data {
> +    char  aname_cksum[64];
> +    uint64_t aval_cksum;
> +} jrnl_cksum_data_t;
> +
> +typedef struct jrnl_file_desc {
> +    uint32_t  jf_epoch;    /* epoch */
> +    uint64_t  jf_oid;     /* Object id */
> +    int       jf_fd;      /* Open file fd */
> +    int       jf_target_fd;
> +} jrnl_file_desc_t;
> +
> +typedef struct jrnl_descriptor {
> +    void                *jd_head;
> +    void                *jd_data;
> +    int                 jd_end_mark;
> +    jrnl_file_desc_t    jd_jfd;
> +#define jdf_epoch          jd_jfd.jf_epoch
> +#define jdf_oid            jd_jfd.jf_oid
> +#define jdf_fd             jd_jfd.jf_fd
> +#define jdf_target_fd      jd_jfd.jf_target_fd
> +} jrnl_desc_t;
> +
> +typedef struct jrnl_handler {
> +    int  (*has_end_mark)(jrnl_desc_t *jd);
> +    int  (*write_header)(jrnl_desc_t  *jd);
> +    int  (*write_data)(jrnl_desc_t  *jd);
> +    int  (*write_end_mark)(jrnl_desc_t  *jd);
> +    int  (*apply_to_target_object)(jrnl_file_desc_t *jfd);
> +    int  (*commit_data)(jrnl_desc_t  *jd);
> +} jrnl_handler_t;
> +
> +inline jrnl_type_t jrnl_get_type(jrnl_desc_t  *jd);
> +int  jrnl_get_type_from_file(jrnl_file_desc_t *jfd, jrnl_type_t
> *jrnl_type);
> +int  jrnl_exists(jrnl_file_desc_t  *jfd);
> +int  jrnl_update_epoch_store(uint32_t epoch);
> +int  jrnl_open(jrnl_file_desc_t  *jfd, int  aflags);
> +int  jrnl_create(jrnl_file_desc_t  *jfd);
> +int  jrnl_remove(jrnl_file_desc_t *jfd);
> +inline int  jrnl_close(jrnl_file_desc_t  *jfd);
> +
> +inline int  jrnl_has_end_mark(jrnl_desc_t  *jd);
> +inline int  jrnl_write_header(jrnl_desc_t  *jd);
> +inline int  jrnl_write_data(jrnl_desc_t  *jd);
> +inline int  jrnl_write_end_mark(jrnl_desc_t  *jd);
> +inline int  jrnl_apply_to_targe_object(jrnl_file_desc_t *jfd);
> +inline int  jrnl_commit_data(jrnl_desc_t  *jd);
> +int  jrnl_perform(jrnl_desc_t  *jd);
> +int  jrnl_recover(void);
> +
>  static inline int is_myself(struct sheepdog_node_list_entry *e)
>  {
>      return e->id == sys->this_node.id;
> diff --git a/sheep/store.c b/sheep/store.c
> index a4d6155..0eac4d2 100644
> --- a/sheep/store.c
> +++ b/sheep/store.c
> @@ -32,10 +32,45 @@
>  static char *obj_path;
>  static char *epoch_path;
>  static char *mnt_path;
> +static char *jrnl_path;
> 
>  static mode_t def_dmode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP |
> S_IXGRP;

This patch is corrupted.  Please send me a clean patch next time.


>  static mode_t def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
> 
> +/* Journal internal data structures */
> +static int  jrnl_vdi_has_end_mark(jrnl_desc_t  *jd);
> +static int  jrnl_vdi_write_header(jrnl_desc_t  *jd);
> +static int  jrnl_vdi_write_data(jrnl_desc_t  *jd);
> +static int  jrnl_vdi_write_end_mark(jrnl_desc_t  *jd);
> +static int  jrnl_vdi_apply_to_target_object(jrnl_file_desc_t *jfd);
> +static int  jrnl_vdi_commit_data(jrnl_desc_t  *jd);
> +
> +static int  jrnl_cksum_has_end_mark(jrnl_desc_t  *jd);
> +static int  jrnl_cksum_write_header(jrnl_desc_t  *jd);
> +static int  jrnl_cksum_write_data(jrnl_desc_t  *jd);
> +static int  jrnl_cksum_write_end_mark(jrnl_desc_t  *jd);
> +static int  jrnl_cksum_apply_to_target_object(jrnl_file_desc_t *jfd);
> +static int  jrnl_cksum_commit_data(jrnl_desc_t  *jd);
> +
> +static jrnl_handler_t  jrnl_handlers[JRNL_MAX_TYPES] = {
> +    {
> +        .has_end_mark = jrnl_vdi_has_end_mark,
> +        .write_header = jrnl_vdi_write_header,
> +        .write_data = jrnl_vdi_write_data,
> +        .write_end_mark = jrnl_vdi_write_end_mark,
> +        .apply_to_target_object = jrnl_vdi_apply_to_target_object,
> +        .commit_data = jrnl_vdi_commit_data
> +    },
> +    {
> +        .has_end_mark = jrnl_cksum_has_end_mark,
> +        .write_header = jrnl_cksum_write_header,
> +        .write_data = jrnl_cksum_write_data,
> +        .write_end_mark = jrnl_cksum_write_end_mark,
> +        .apply_to_target_object = jrnl_cksum_apply_to_target_object,
> +        .commit_data = jrnl_cksum_commit_data
> +    }
> +};
> +
>  static int obj_cmp(const void *oid1, const void *oid2)
>  {
>      const uint64_t hval1 = fnv_64a_buf((void *)oid1, sizeof(uint64_t),
> FNV1A_64_INIT);
> @@ -98,7 +133,8 @@ static int is_obj_in_range(uint64_t oid, uint64_t start,
> uint64_t end)
>          return (start < hval || hval <= end);
>  }
> 
> -static int verify_object(int fd, char *buf, size_t len, int set_chksum)
> +/* When set_cksum is set, epoch & oid are required to perform journalling.
> */
> +static int verify_object(int fd, char *buf, size_t len, int set_chksum,
> uint32_t  epoch, uint64_t oid)
>  {
>      int ret;
>      uint64_t checksum;
> @@ -128,11 +164,30 @@ static int verify_object(int fd, char *buf, size_t
> len, int set_chksum)
>      }
> 
>      if (set_chksum) {
> -        checksum = fnv_64a_buf(buf, len, FNV1A_64_INIT);
> -        ret = fsetxattr(fd, ANAME_CHECKSUM, &checksum, sizeof(checksum),
> 0);
> -        if (ret < 0) {
> -            eprintf("failed to set xattr, %m\n");
> -            goto err;
> +        if (epoch && oid) {
> +            jrnl_desc_t  jd;
> +            jrnl_cksum_head_t  head;
> +            jrnl_cksum_data_t  data;
> +
> +            head.jh_type = JRNL_TYPE_CKSUM;
> +            head.jh_size = sizeof(data);
> +            strcpy(data.aname_cksum, ANAME_CHECKSUM);
> +            data.aval_cksum = fnv_64a_buf(buf, len, FNV1A_64_INIT);
> +            jd.jd_head = &head;
> +            jd.jd_data = &data;
> +            jd.jdf_epoch = epoch;
> +            jd.jdf_oid = oid;
> +            jd.jdf_target_fd = fd;
> +            ret = jrnl_perform(&jd);
> +            if (ret)
> +                goto err;
> +        } else {
> +            checksum = fnv_64a_buf(buf, len, FNV1A_64_INIT);
> +            ret = fsetxattr(fd, ANAME_CHECKSUM, &checksum,
> sizeof(checksum), 0);
> +            if (ret < 0) {
> +                eprintf("failed to set xattr, %m\n");
> +                goto err;
> +            }
>          }
>      } else {
>          ret = fgetxattr(fd, ANAME_CHECKSUM, &checksum, sizeof(checksum));
> @@ -199,7 +254,7 @@ static int get_obj_list(struct request *req)
>      }
>      obj_nr = read(fd, buf, buf_len);
> 
> -    ret = verify_object(fd, buf, obj_nr, 0);
> +    ret = verify_object(fd, buf, obj_nr, 0, 0, 0);
>      if (ret < 0) {
>          eprintf("verification failed, %s, %m\n", path);
>          close(fd);
> @@ -633,6 +688,8 @@ static int store_queue_request_local(struct request
> *req, uint32_t epoch)
>      uint64_t oid = hdr->oid;
>      uint32_t opcode = hdr->opcode;
>      char path[1024], *buf;
> +    jrnl_desc_t  jd;
> +    jrnl_vdi_head_t jh;
> 
>      switch (opcode) {
>      case SD_OP_CREATE_AND_WRITE_OBJ:
> @@ -766,24 +823,31 @@ static int store_queue_request_local(struct request
> *req, uint32_t epoch)
>  /*             } */
>          }
>          /* fall through */
> -    case SD_OP_CREATE_AND_WRITE_OBJ:
> -        ret = pwrite64(fd, req->data, hdr->data_length, hdr->offset);
> -        if (ret != hdr->data_length) {
> -            if (errno == ENOSPC)
> -                ret = SD_RES_NO_SPACE;
> -            else
> -                ret = SD_RES_EIO;
> +    case SD_OP_CREATE_AND_WRITE_OBJ:
> +
> +        jd.jdf_epoch = epoch;
> +        jd.jdf_oid = oid;
> +        jd.jdf_target_fd = fd;
> +
> +        jh.jh_type = JRNL_TYPE_VDI;
> +        jh.jh_offset = hdr->offset;
> +        jh.jh_size = hdr->data_length;
> +
> +        jd.jd_head = &jh;
> +        jd.jd_data = req->data;
> +        jd.jd_end_mark = SET_END_MARK;
> +
> +        ret = jrnl_perform(&jd);

Journaling support is necessary for only vdi objects.  This seems to
be using journaling for data objects too, doesn't it?


> +        if (ret)
>              goto out;
> -        }
> 
>          if (!is_data_obj(oid)) {
> -            /* FIXME: need to update atomically */
> -/*             ret = verify_object(fd, NULL, 0, 1); */
> -/*             if (ret < 0) { */
> -/*                 eprintf("failed to set checksum, %"PRIx64"\n", oid); */
> -/*                 ret = SD_RES_EIO; */
> -/*                 goto out; */
> -/*             } */
> +             ret = verify_object(fd, NULL, 0, 1, epoch, oid);
> +             if (ret < 0) {
> +                 eprintf("failed to set checksum, %"PRIx64"\n", oid);
> +                 ret = SD_RES_EIO;
> +                 goto out;
> +             }

We don't need checksum here if we use journaling, no?

I think verify_object() is a bit confusing to understand sheepdog
codes.  I think of removing the function.


Thanks,

Kazutaka



More information about the sheepdog mailing list