[Sheepdog] [RFC] [PATCH] journaling support for atomic operations
Narendra Prasad Madanapalli
narendramind at gmail.com
Mon Nov 22 18:25:11 CET 2010
Hi,
This patch adds journalling support for sheepdog that can aid in
performing atomic operations.
Using the journalling API, I have implemented atomic updates of the vdi
object in store_queue_request_local() for the operations
SD_OP_WRITE_OBJ and SD_OP_CREATE_AND_WRITE_OBJ.
The following functions constitute the journalling API:
/* Initializes journal directory */
static int jrnl_init_path(const char *base_path);
/* See if journal exists for object id specified through jfd */
int jrnl_exists(jrnl_file_desc_t *jfd);
/* See if an end mark exists in the journal described by jd */
int jrnl_has_end_mark(jrnl_desc_t *jd);
/* Open the journal of an object (specified through jfd) for further processing */
int jrnl_open(jrnl_file_desc_t *jfd, int aflags);
/* Create a new journal */
int jrnl_create(jrnl_file_desc_t *jfd);
/* Write header to journal */
int jrnl_write_header(jrnl_desc_t *jd);
/* Write data to journal */
int jrnl_write_data(jrnl_desc_t *jd);
/* Write end mark to journal */
int jrnl_write_end_mark(jrnl_desc_t *jd);
/* Remove journal */
int jrnl_remove(jrnl_file_desc_t *jfd);
/* Flush journal to target vdi object */
int jrnl_apply_to_targe_vdi_object(jrnl_file_desc_t *jfd);
/* Write data directly to target vdi object */
int jrnl_writte_data_to_targe_vdi_object(jrnl_file_desc_t *jfd, void *data);
/* Recover vdis from journal
* This function can be used during sheep startup time.
*/
int jrnl_recover_vdis();
Three new structures are introduced to pass the required information
to and from the journalling API.
/* This structure holds journal header information */
typedef struct jrnl_head {
uint64_t jh_offset;
uint64_t jh_size;
} jrnl_head_t;
/* Journal file descriptor: identifies the journal file (epoch and object id)
and holds the open journal fd and the open vdi object fd */
typedef struct jrnl_file_desc {
uint32_t jf_epoch; /* epoch */
uint64_t jf_oid; /* Object id */
int jf_fd; /* Open file fd */
int jf_vdi_fd;
} jrnl_file_desc_t;
typedef uint32_t end_mark_t;
/* This represents a journal */
typedef struct jrnl_descriptor {
jrnl_head_t jd_head;
#define jdh_offset jd_head.jh_offset
#define jdh_size jd_head.jh_size
void *jd_data;
int jd_end_mark;
jrnl_file_desc_t jd_jfd;
#define jdf_epoch jd_jfd.jf_epoch
#define jdf_oid jd_jfd.jf_oid
#define jdf_fd jd_jfd.jf_fd
#define jdf_vdi_fd jd_jfd.jf_vdi_fd
} jrnl_desc_t;
There are some improvements/enhancements in the code and I am in the
process of testing it out. Meanwhile, any comments/suggestions would
be appreciated.
Thanks,
Narendra.
sheep/sheep_priv.h | 42 +++++++++
sheep/store.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 297 insertions(+), 1 deletions(-)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index c66baf4..5bf8299 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -128,6 +128,7 @@ struct cluster_info {
extern struct cluster_info *sys;
+
int create_listen_port(int port, void *data);
int is_io_request(unsigned op);
@@ -190,6 +191,47 @@ int remove_object(struct sheepdog_node_list_entry *e,
int nodes, uint32_t node_version,
uint64_t oid, int nr);
+/*
+ * Journal
+ *
+ * Types shared by the jrnl_*() functions: a small header describing where
+ * the journaled data belongs, a file descriptor bundle naming the journal
+ * file, and a descriptor tying header, payload and file together.
+ */
+typedef struct jrnl_head { /* header stored at offset 0 of a journal file */
+ uint64_t jh_offset; /* offset at which the data is written to the target object */
+ uint64_t jh_size; /* length of the journaled data in bytes */
+} jrnl_head_t;
+
+typedef struct jrnl_file_desc { /* names a journal file and carries its open fds */
+ uint32_t jf_epoch; /* epoch; used as the directory component of the journal path */
+ uint64_t jf_oid; /* object id; used as the file name of the journal (hex) */
+ int jf_fd; /* fd of the journal file, filled in by jrnl_open() */
+ int jf_vdi_fd; /* fd of the target vdi object the journal is applied to */
+} jrnl_file_desc_t;
+
+typedef uint32_t end_mark_t; /* written after the data; the value 1 marks a complete journal */
+
+typedef struct jrnl_descriptor { /* one journal: header, payload and file */
+ jrnl_head_t jd_head; /* header; the jdh_* macros below are shortcuts into it */
+#define jdh_offset jd_head.jh_offset
+#define jdh_size jd_head.jh_size
+ void *jd_data; /* payload; jdh_size bytes are written by jrnl_write_data() */
+ int jd_end_mark; /* set by the caller; NOTE(review): no jrnl_*() function in this patch reads it */
+ jrnl_file_desc_t jd_jfd; /* journal file; the jdf_* macros below are shortcuts into it */
+#define jdf_epoch jd_jfd.jf_epoch
+#define jdf_oid jd_jfd.jf_oid
+#define jdf_fd jd_jfd.jf_fd
+#define jdf_vdi_fd jd_jfd.jf_vdi_fd
+} jrnl_desc_t;
+
+/*
+ * Journal API, implemented in store.c.
+ *
+ * jrnl_init_path() is static to store.c and is defined there before its
+ * only caller, so it is intentionally not declared in this header: a
+ * static prototype here would give every includer an undefined static
+ * function.
+ */
+
+/* Returns 0 if the journal file for jfd's epoch/oid can be stat()ed, 1 otherwise */
+int jrnl_exists(jrnl_file_desc_t *jfd);
+/* Returns 1 if the end mark follows the data of the open journal jd, 0 otherwise */
+int jrnl_has_end_mark(jrnl_desc_t *jd);
+/* Opens the journal of jfd's epoch/oid with open(2) flags aflags; stores the fd in jfd->jf_fd */
+int jrnl_open(jrnl_file_desc_t *jfd, int aflags);
+/* Creates (or opens) the journal of jfd's epoch/oid read-write */
+int jrnl_create(jrnl_file_desc_t *jfd);
+/* Writes jd->jd_head at offset 0 of the journal */
+int jrnl_write_header(jrnl_desc_t *jd);
+/* Writes jd->jdh_size bytes of jd->jd_data right after the header */
+int jrnl_write_data(jrnl_desc_t *jd);
+/* Writes the end mark right after the data */
+int jrnl_write_end_mark(jrnl_desc_t *jd);
+/* Unlinks the journal file of jfd's epoch/oid */
+int jrnl_remove(jrnl_file_desc_t *jfd);
+/* Copies the journaled data to the target vdi object (jfd->jf_vdi_fd) */
+int jrnl_apply_to_targe_vdi_object(jrnl_file_desc_t *jfd);
+/* Not implemented yet: currently returns 0 without writing anything */
+int jrnl_writte_data_to_targe_vdi_object(jrnl_file_desc_t *jfd, void *data);
+/* Applies the journals found in the latest epoch's journal directory */
+int jrnl_recover_vdis(void);
+
static inline int is_myself(struct sheepdog_node_list_entry *e)
{
return e->id == sys->this_node.id;
diff --git a/sheep/store.c b/sheep/store.c
index a4d6155..55a728c 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -32,6 +32,7 @@
static char *obj_path;
static char *epoch_path;
static char *mnt_path;
+static char *jrnl_path;
static mode_t def_dmode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP |
S_IWGRP | S_IXGRP;
static mode_t def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
@@ -633,6 +634,7 @@ static int store_queue_request_local(struct
request *req, uint32_t epoch)
uint64_t oid = hdr->oid;
uint32_t opcode = hdr->opcode;
char path[1024], *buf;
+ jrnl_desc_t jd;
switch (opcode) {
case SD_OP_CREATE_AND_WRITE_OBJ:
@@ -766,7 +768,32 @@ static int store_queue_request_local(struct
request *req, uint32_t epoch)
/* } */
}
/* fall through */
- case SD_OP_CREATE_AND_WRITE_OBJ:
+ case SD_OP_CREATE_AND_WRITE_OBJ:
+
+ jd.jdf_epoch = epoch;
+ jd.jdf_oid = oid;
+ jd.jdf_vdi_fd = fd;
+ jd.jdh_offset = hdr->offset;
+ jd.jdh_size = hdr->data_length;
+ jd.jd_data = req->data;
+ jd.jd_end_mark = 1;
+
+ ret = jrnl_create(&jd.jd_jfd);
+ if (ret)
+ goto out;
+
+ ret = jrnl_write_header(&jd);
+ if (ret)
+ goto out;
+
+ ret = jrnl_write_data(&jd);
+ if (ret)
+ goto out;
+
+ ret = jrnl_write_end_mark(&jd);
+ if (ret)
+ goto out;
+
ret = pwrite64(fd, req->data, hdr->data_length, hdr->offset);
if (ret != hdr->data_length) {
if (errno == ENOSPC)
@@ -1786,6 +1813,27 @@ static int init_mnt_path(const char *base_path)
return 0;
}
+#define JRNL_PATH "/journal/"
+
+/*
+ * Build the journal directory name "<base_path>/journal/" into the
+ * file-scope jrnl_path and hand it to init_path() so the directory is
+ * set up.
+ *
+ * Returns 0 on success, SD_RES_NO_MEM if the path buffer cannot be
+ * allocated, or whatever non-zero value init_path() reports.
+ */
+static int jrnl_init_path(const char *base_path)
+{
+	int new, ret;
+
+	/* Room for base path, the fixed suffix and the terminating NUL */
+	jrnl_path = zalloc(strlen(base_path) + strlen(JRNL_PATH) + 1);
+	if (!jrnl_path) {
+		eprintf("failed to allocate memory\n");
+		return SD_RES_NO_MEM;
+	}
+	sprintf(jrnl_path, "%s" JRNL_PATH, base_path);
+
+	/*
+	 * init_path() reports through 'new' whether the directory was just
+	 * created; nothing extra is needed in either case, so only the
+	 * status matters here.
+	 */
+	ret = init_path(jrnl_path, &new);
+
+	return ret;
+}
+
int init_store(const char *d)
{
int ret;
@@ -1806,6 +1854,10 @@ int init_store(const char *d)
if (ret)
return ret;
+ ret = jrnl_init_path(d);
+ if (ret)
+ return ret;
+
return ret;
}
@@ -1838,3 +1890,205 @@ int get_global_nr_copies(uint32_t *copies)
{
return attr(epoch_path, ANAME_COPIES, copies, sizeof(*copies), 0);
}
+
+/*
+ * Probe for the journal file of the object described by jfd
+ * (<jrnl_path><epoch>/<oid in hex>).
+ *
+ * Returns 0 when stat() on that path succeeds and 1 when it fails for
+ * any reason.
+ * NOTE(review): this is the reverse of jrnl_has_end_mark(), which
+ * returns 1 for "present" - confirm the intended convention.
+ */
+int jrnl_exists(jrnl_file_desc_t *jfd)
+{
+	struct stat st;
+	char jrnl_file[1024];
+
+	snprintf(jrnl_file, sizeof(jrnl_file), "%s%08u/%016" PRIx64,
+		 jrnl_path, jfd->jf_epoch, jfd->jf_oid);
+
+	return stat(jrnl_file, &st) != 0;
+}
+
+/*
+ * Open the journal file of the object described by jfd
+ * (<jrnl_path><epoch>/<oid in hex>) with the open(2) flags in aflags.
+ * def_fmode is passed as the creation mode and only matters when
+ * aflags contains O_CREAT.
+ *
+ * On success the new fd is stored in jfd->jf_fd and SD_RES_SUCCESS is
+ * returned. On failure jfd is left untouched and the result is
+ * SD_RES_NO_OBJ when the file (or its directory) does not exist,
+ * SD_RES_UNKNOWN otherwise.
+ */
+int jrnl_open(jrnl_file_desc_t *jfd, int aflags)
+{
+	char path[1024];
+	int fd, err;
+
+	snprintf(path, sizeof(path), "%s%08u/%016" PRIx64, jrnl_path,
+		 jfd->jf_epoch, jfd->jf_oid);
+
+	fd = open(path, aflags, def_fmode);
+	if (fd < 0) {
+		/* Save errno first: logging may overwrite it */
+		err = errno;
+		eprintf("failed to open %s, %s\n", path, strerror(err));
+		return err == ENOENT ? SD_RES_NO_OBJ : SD_RES_UNKNOWN;
+	}
+
+	jfd->jf_fd = fd;
+	return SD_RES_SUCCESS;
+}
+
+/*
+ * Create (or open, if it already exists) the journal file of the
+ * object described by jfd for reading and writing.
+ *
+ * Journal files live in a per-epoch directory below jrnl_path.
+ * jrnl_init_path() only sets up jrnl_path itself, so the epoch
+ * directory is created here on demand; without it open(O_CREAT) would
+ * fail with ENOENT.
+ *
+ * Returns SD_RES_SUCCESS with the fd in jfd->jf_fd, SD_RES_NO_SPACE or
+ * SD_RES_EIO if the directory cannot be created, or the error reported
+ * by jrnl_open().
+ */
+int jrnl_create(jrnl_file_desc_t *jfd)
+{
+	char dir[1024];
+	int err;
+
+	snprintf(dir, sizeof(dir), "%s%08u", jrnl_path, jfd->jf_epoch);
+
+	/* An already existing directory is the normal case, not an error */
+	if (mkdir(dir, def_dmode) < 0 && errno != EEXIST) {
+		err = errno;
+		eprintf("failed to create %s, %s\n", dir, strerror(err));
+		return err == ENOSPC ? SD_RES_NO_SPACE : SD_RES_EIO;
+	}
+
+	return jrnl_open(jfd, O_RDWR | O_CREAT);
+}
+
+/*
+ * Check whether the open journal jd->jdf_fd carries an end mark.
+ *
+ * The mark is expected directly behind the data, i.e. at offset
+ * sizeof(header) + jd->jdh_size, so jd->jdh_size must already hold the
+ * journaled data length.
+ *
+ * Returns 1 only if a complete end_mark_t with the value 1 was read,
+ * 0 otherwise (read error, end of file, short read or different value).
+ */
+int jrnl_has_end_mark(jrnl_desc_t *jd)
+{
+	ssize_t ret;
+	end_mark_t end_mark = 0;
+
+	ret = pread64(jd->jdf_fd, &end_mark, sizeof(end_mark),
+		      sizeof(jd->jd_head) + jd->jdh_size);
+
+	/*
+	 * A short read means the mark was only partially written; do not
+	 * let the bytes that did arrive pass for a complete mark.
+	 */
+	if (ret != sizeof(end_mark))
+		return 0;
+
+	return end_mark == 1;
+}
+
+/*
+ * Store jd->jd_head at the very beginning (offset 0) of the journal
+ * file jd->jdf_fd.
+ *
+ * Returns SD_RES_SUCCESS if the whole header was written,
+ * SD_RES_NO_SPACE if errno is ENOSPC afterwards, SD_RES_EIO otherwise.
+ */
+int jrnl_write_header(jrnl_desc_t *jd)
+{
+	ssize_t written;
+
+	written = pwrite64(jd->jdf_fd, &jd->jd_head, sizeof(jd->jd_head), 0);
+	if (written == sizeof(jd->jd_head))
+		return SD_RES_SUCCESS;
+
+	return errno == ENOSPC ? SD_RES_NO_SPACE : SD_RES_EIO;
+}
+
+/*
+ * Store the payload jd->jd_data (jd->jdh_size bytes) in the journal
+ * file jd->jdf_fd, immediately behind the header.
+ *
+ * Returns SD_RES_SUCCESS if all bytes were written, SD_RES_NO_SPACE if
+ * errno is ENOSPC afterwards, SD_RES_EIO otherwise.
+ */
+int jrnl_write_data(jrnl_desc_t *jd)
+{
+	ssize_t nr;
+	int status = SD_RES_SUCCESS;
+
+	nr = pwrite64(jd->jdf_fd, jd->jd_data, jd->jdh_size,
+		      sizeof(jd->jd_head));
+
+	if (nr != jd->jdh_size)
+		status = (errno == ENOSPC) ? SD_RES_NO_SPACE : SD_RES_EIO;
+
+	return status;
+}
+
+/*
+ * Append the end mark (an end_mark_t with the value 1) to the journal
+ * file jd->jdf_fd. It is placed right behind the data, at offset
+ * sizeof(header) + jd->jdh_size.
+ *
+ * Returns SD_RES_SUCCESS if the whole mark was written,
+ * SD_RES_NO_SPACE if errno is ENOSPC afterwards, SD_RES_EIO otherwise.
+ */
+int jrnl_write_end_mark(jrnl_desc_t *jd)
+{
+	end_mark_t mark = 1;
+	ssize_t nr;
+
+	nr = pwrite64(jd->jdf_fd, &mark, sizeof(mark),
+		      sizeof(jd->jd_head) + jd->jdh_size);
+	if (nr == sizeof(mark))
+		return SD_RES_SUCCESS;
+
+	if (errno == ENOSPC)
+		return SD_RES_NO_SPACE;
+
+	return SD_RES_EIO;
+}
+
+/*
+ * Delete the journal file of the object described by jfd
+ * (<jrnl_path><epoch>/<oid in hex>).
+ *
+ * Returns SD_RES_SUCCESS if unlink() succeeds; otherwise logs the
+ * failure and returns SD_RES_EIO.
+ */
+int jrnl_remove(jrnl_file_desc_t *jfd)
+{
+	char jrnl_file[1024];
+
+	snprintf(jrnl_file, sizeof(jrnl_file), "%s%08u/%016" PRIx64,
+		 jrnl_path, jfd->jf_epoch, jfd->jf_oid);
+
+	if (unlink(jrnl_file) == 0)
+		return SD_RES_SUCCESS;
+
+	eprintf("failed to remove %s, %s\n", jrnl_file, strerror(errno));
+	return SD_RES_EIO;
+}
+
+/*
+ * Replay one journal onto its target object.
+ *
+ * Opens the journal of jfd's epoch/oid read-only, reads the header and
+ * the journaled data, and writes the data to jfd->jf_vdi_fd at the
+ * offset recorded in the header. The caller must have set
+ * jfd->jf_vdi_fd to an fd of the target object opened for writing.
+ *
+ * The journal fd is closed again before returning and jfd->jf_fd is
+ * reset to -1 (not 0, which is a valid descriptor).
+ *
+ * NOTE(review): the end mark is not verified here; a caller that must
+ * not replay an incomplete journal has to check it first.
+ *
+ * Returns SD_RES_SUCCESS when the data was applied completely,
+ * SD_RES_NO_MEM if the buffer cannot be allocated, and SD_RES_EIO if
+ * the journal cannot be opened or read, is larger than the buffer, or
+ * the write to the target object fails or is short.
+ */
+int jrnl_apply_to_targe_vdi_object(jrnl_file_desc_t *jfd)
+{
+	/* FIXME: handle larger size */
+	const size_t buf_len = (1 << 22);
+	char *buf;
+	int res = SD_RES_EIO;
+	ssize_t retsize;
+	jrnl_head_t jh;
+
+	buf = zalloc(buf_len);
+	if (!buf) {
+		eprintf("failed to allocate memory\n");
+		return SD_RES_NO_MEM;
+	}
+
+	/* Open journal */
+	if (jrnl_open(jfd, O_RDONLY))
+		goto freemem;
+
+	retsize = pread64(jfd->jf_fd, &jh, sizeof(jh), 0);
+	if (retsize != sizeof(jh)) {
+		eprintf("failed to read journal header\n");
+		goto close_jrnl;
+	}
+
+	/* The size comes from disk; never let it overrun the buffer */
+	if (jh.jh_size > buf_len) {
+		eprintf("journal data too large, %" PRIu64 "\n", jh.jh_size);
+		goto close_jrnl;
+	}
+
+	retsize = pread64(jfd->jf_fd, buf, jh.jh_size, sizeof(jh));
+	if (retsize < 0 || (uint64_t)retsize != jh.jh_size) {
+		eprintf("failed to read journal data\n");
+		goto close_jrnl;
+	}
+
+	/* Flush out journal to disk (vdi object) */
+	retsize = pwrite64(jfd->jf_vdi_fd, buf, jh.jh_size, jh.jh_offset);
+	if (retsize < 0 || (uint64_t)retsize != jh.jh_size) {
+		eprintf("failed to apply journal, %s\n", strerror(errno));
+		goto close_jrnl;
+	}
+
+	res = SD_RES_SUCCESS;
+
+close_jrnl:
+	close(jfd->jf_fd);
+	jfd->jf_fd = -1;
+freemem:
+	free(buf);
+	return res;
+}
+
+int jrnl_writte_data_to_targe_vdi_object(jrnl_file_desc_t *jfd, void *data)
+{
+ // Placeholder: jfd and data are ignored and nothing is written.
+ // FIXME: Need to implement this function
+ // Until then it unconditionally returns 0.
+
+ return 0;
+}
+
+/*
+ * Replay the journals left in the journal directory of the latest
+ * epoch. Intended to be called while sheep starts up.
+ *
+ * Journal files are named after the object id printed as 16 hex
+ * digits, so the name is parsed as hexadecimal; entries that are not
+ * a pure hex number (including "." and "..") are skipped.
+ *
+ * TODO: the target object is not opened yet, so jf_vdi_fd is set to
+ * -1 and the apply step cannot write anywhere; the object fd has to
+ * be supplied before recovery can actually work. Applied journals
+ * are not removed either.
+ *
+ * Returns 1 if the latest epoch cannot be determined, -1 if the
+ * journal directory cannot be opened, 0 otherwise (failures on
+ * individual journals are logged, not propagated).
+ */
+int jrnl_recover_vdis()
+{
+	DIR *dir;
+	struct dirent *d;
+	char jrnl_dir[1024];
+	int epoch;
+
+	epoch = get_latest_epoch();
+	if (epoch < 0)
+		return 1;
+
+	snprintf(jrnl_dir, sizeof(jrnl_dir), "%s%08u", jrnl_path, epoch);
+
+	dir = opendir(jrnl_dir);
+	if (!dir)
+		return -1;
+
+	while ((d = readdir(dir))) {
+		jrnl_file_desc_t jfd;
+		char *end;
+
+		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+			continue;
+
+		errno = 0;
+		jfd.jf_oid = strtoull(d->d_name, &end, 16);
+		if (errno || end == d->d_name || *end != '\0') {
+			eprintf("skipping unexpected file %s\n", d->d_name);
+			continue;
+		}
+
+		jfd.jf_epoch = epoch;
+		jfd.jf_fd = -1;
+		jfd.jf_vdi_fd = -1; /* TODO: open the target object */
+
+		/* The apply step opens and closes the journal itself */
+		if (jrnl_apply_to_targe_vdi_object(&jfd))
+			eprintf("failed to apply journal %016" PRIx64 "\n",
+				jfd.jf_oid);
+	}
+	closedir(dir);
+
+	return 0;
+}
+
More information about the sheepdog
mailing list