[Sheepdog] [RFC] [PATCH] journaling support for atomic operations

Narendra Prasad Madanapalli narendramind at gmail.com
Mon Nov 22 18:25:11 CET 2010


Hi,

This patch adds journalling support to sheepdog so that operations can
be performed atomically.

Using the journalling API, I have implemented atomic updates of vdi
objects in store_queue_request_local() for the operations
SD_OP_WRITE_OBJ and SD_OP_CREATE_AND_WRITE_OBJ.


The following functions constitute the journalling API:

/*
 * Initializes journal directory: builds "<base_path>/journal/" into the
 * file-local jrnl_path and hands it to init_path().
 * Returns init_path()'s non-zero result on error, 0 otherwise.
 */
static int  jrnl_init_path(const char *base_path);

/*
 * See if journal exists for object id specified through jfd, by calling
 * stat() on the journal file named by jf_epoch and jf_oid.
 * NOTE: as posted, this returns 0 when stat() succeeds and 1 when it fails.
 */
int  jrnl_exists(jrnl_file_desc_t  *jfd);

/*
 * See if end mark exists in journal for object id specified through jd.
 * Reads an end_mark_t from jdf_fd at the position right after the header
 * and jdh_size bytes of data; returns 1 if the value is 1, 0 otherwise.
 */
int  jrnl_has_end_mark(jrnl_desc_t  *jd);

/*
 * Open the journal of an object (specified through jfd) for further
 * processing; aflags are passed to open() as its flags.
 * On success the fd is stored in jfd->jf_fd and SD_RES_SUCCESS is returned;
 * on failure SD_RES_NO_OBJ (errno == ENOENT) or SD_RES_UNKNOWN.
 */
int  jrnl_open(jrnl_file_desc_t  *jfd, int  aflags);

/* Create a new journal: jrnl_open() with O_RDWR | O_CREAT */
int  jrnl_create(jrnl_file_desc_t  *jfd);

/*
 * Write header (jd->jd_head) to journal at offset 0.
 * Returns SD_RES_SUCCESS, SD_RES_NO_SPACE (errno == ENOSPC) or SD_RES_EIO.
 */
int  jrnl_write_header(jrnl_desc_t  *jd);

/*
 * Write data (jdh_size bytes from jd->jd_data) to journal, right after the
 * header. Same return values as jrnl_write_header().
 */
int  jrnl_write_data(jrnl_desc_t  *jd);

/*
 * Write end mark (the value 1) to journal, right after the data.
 * Same return values as jrnl_write_header().
 */
int  jrnl_write_end_mark(jrnl_desc_t  *jd);

/*
 * Remove journal: unlink() the journal file named by jf_epoch and jf_oid.
 * Returns SD_RES_SUCCESS or SD_RES_EIO.
 */
int  jrnl_remove(jrnl_file_desc_t *jfd);

/*
 * Flush journal to target vdi object: opens the journal read-only, reads
 * its header and payload, and writes the payload to jfd->jf_vdi_fd at the
 * offset recorded in the header. The payload is staged in a 4 MB (1 << 22)
 * buffer.
 * Returns SD_RES_NO_MEM if the buffer cannot be allocated, SD_RES_EIO if
 * the journal cannot be opened, 0 otherwise. As posted, the results of the
 * reads and of the write are not checked.
 */
int  jrnl_apply_to_targe_vdi_object(jrnl_file_desc_t *jfd);

/*
 * Write data directly to target vdi object.
 * Not implemented yet in this patch: the function is a stub returning 0.
 */
int  jrnl_writte_data_to_targe_vdi_object(jrnl_file_desc_t *jfd, void *data);

/* Recover vdis from journal
 * Opens the journal directory of the latest epoch (get_latest_epoch()) and
 * calls jrnl_apply_to_targe_vdi_object() for every directory entry.
 * Returns 1 if the latest epoch cannot be determined, -1 if the directory
 * cannot be opened, 0 otherwise. Error handling inside the loop is still
 * to be added.
 * This function can be used during sheep startup time.
 */
int  jrnl_recover_vdis();


Three new structures are introduced to pass information into and out
of the journalling API.

/*
 * This structure holds journal header information.
 * jrnl_write_header() stores it at offset 0 of the journal file; the
 * journalled data follows it immediately.
 */
typedef struct jrnl_head {
	uint64_t  jh_offset;	/* offset in the target vdi object at which the data is written */
	uint64_t  jh_size;	/* length of the journalled data in bytes */
}  jrnl_head_t;

/*
 * Journal file descriptor: identifies a journal file (epoch and object id)
 * and holds the open file descriptors needed to process it, i.e. the
 * journal fd and the open vdi fd.
 */
typedef struct jrnl_file_desc {
	uint32_t  jf_epoch;    /* epoch; forms the "%08u" directory part of the journal path */
	uint64_t  jf_oid;     /* Object id; forms the 16-hex-digit file name of the journal */
	int       jf_fd;      /* Open journal file fd; filled in by jrnl_open() on success */
	int       jf_vdi_fd;  /* Open fd of the target vdi object the journal is applied to */
} jrnl_file_desc_t;

/*
 * On-disk end mark. jrnl_write_end_mark() stores the value 1 right after
 * the journalled data and jrnl_has_end_mark() tests for that value.
 */
typedef uint32_t end_mark_t;

/*
 * This represents a journal: its header, the data to be journalled and the
 * journal file descriptor.
 *
 * The jdh_* and jdf_* macros are shorthands for the members of the nested
 * header and file descriptor (e.g. jd.jdf_fd is jd.jd_jfd.jf_fd). Being
 * preprocessor macros they are not scoped to this struct: they rewrite
 * these identifiers wherever they appear after this point.
 */
typedef struct jrnl_descriptor {
	jrnl_head_t         jd_head;      /* header written at the start of the journal file */
#define jdh_offset           jd_head.jh_offset
#define jdh_size             jd_head.jh_size
        void                *jd_data;     /* data to journal; the caller sets it to req->data */
        int                 jd_end_mark;  /* set to 1 by the caller; not read by the jrnl_* functions in this patch */
	jrnl_file_desc_t    jd_jfd;       /* identity and fds of the journal file */
#define jdf_epoch            jd_jfd.jf_epoch
#define jdf_oid              jd_jfd.jf_oid
#define jdf_fd               jd_jfd.jf_fd
#define jdf_vdi_fd           jd_jfd.jf_vdi_fd
} jrnl_desc_t;



The code still needs some improvements/enhancements, and I am in the
process of testing it. Meanwhile, any comments/suggestions would
be appreciated.

Thanks,
Narendra.





 sheep/sheep_priv.h |   42 +++++++++
 sheep/store.c      |  256 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 297 insertions(+), 1 deletions(-)

diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index c66baf4..5bf8299 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -128,6 +128,7 @@ struct cluster_info {

 extern struct cluster_info *sys;

+
 int create_listen_port(int port, void *data);

 int is_io_request(unsigned op);
@@ -190,6 +191,47 @@ int remove_object(struct sheepdog_node_list_entry *e,
 		  int nodes, uint32_t node_version,
 		  uint64_t oid, int nr);

+/* Journal */
+typedef struct jrnl_head {
+	uint64_t  jh_offset;
+	uint64_t  jh_size;
+}  jrnl_head_t;
+
+typedef struct jrnl_file_desc {
+	uint32_t  jf_epoch;    /* epoch */
+	uint64_t  jf_oid;     /* Object id */
+	int       jf_fd;      /* Open file fd */
+	int       jf_vdi_fd;
+} jrnl_file_desc_t;
+
+typedef uint32_t end_mark_t;
+
+typedef struct jrnl_descriptor {
+	jrnl_head_t         jd_head;
+#define jdh_offset           jd_head.jh_offset
+#define jdh_size             jd_head.jh_size
+        void                *jd_data;
+        int                 jd_end_mark;
+	jrnl_file_desc_t    jd_jfd;
+#define jdf_epoch            jd_jfd.jf_epoch
+#define jdf_oid              jd_jfd.jf_oid
+#define jdf_fd               jd_jfd.jf_fd
+#define jdf_vdi_fd           jd_jfd.jf_vdi_fd
+} jrnl_desc_t;
+
+static int  jrnl_init_path(const char *base_path);
+int  jrnl_exists(jrnl_file_desc_t  *jfd);
+int  jrnl_has_end_mark(jrnl_desc_t  *jd);
+int  jrnl_open(jrnl_file_desc_t  *jfd, int  aflags);
+int  jrnl_create(jrnl_file_desc_t  *jfd);
+int  jrnl_write_header(jrnl_desc_t  *jd);
+int  jrnl_write_data(jrnl_desc_t  *jd);
+int  jrnl_write_end_mark(jrnl_desc_t  *jd);
+int  jrnl_remove(jrnl_file_desc_t *jfd);
+int  jrnl_apply_to_targe_vdi_object(jrnl_file_desc_t *jfd);
+int  jrnl_writte_data_to_targe_vdi_object(jrnl_file_desc_t *jfd, void *data);
+int  jrnl_recover_vdis();
+
 static inline int is_myself(struct sheepdog_node_list_entry *e)
 {
 	return e->id == sys->this_node.id;
diff --git a/sheep/store.c b/sheep/store.c
index a4d6155..55a728c 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -32,6 +32,7 @@
 static char *obj_path;
 static char *epoch_path;
 static char *mnt_path;
+static char *jrnl_path;

 static mode_t def_dmode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP |
S_IWGRP | S_IXGRP;
 static mode_t def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
@@ -633,6 +634,7 @@ static int store_queue_request_local(struct
request *req, uint32_t epoch)
 	uint64_t oid = hdr->oid;
 	uint32_t opcode = hdr->opcode;
 	char path[1024], *buf;
+	jrnl_desc_t  jd;

 	switch (opcode) {
 	case SD_OP_CREATE_AND_WRITE_OBJ:
@@ -766,7 +768,32 @@ static int store_queue_request_local(struct
request *req, uint32_t epoch)
 /* 			} */
 		}
 		/* fall through */
-	case SD_OP_CREATE_AND_WRITE_OBJ:
+	case SD_OP_CREATE_AND_WRITE_OBJ:
+
+		jd.jdf_epoch = epoch;
+		jd.jdf_oid = oid;
+		jd.jdf_vdi_fd = fd;
+		jd.jdh_offset = hdr->offset;
+		jd.jdh_size = hdr->data_length;
+		jd.jd_data = req->data;
+		jd.jd_end_mark = 1;
+
+		ret = jrnl_create(&jd.jd_jfd);
+		if (ret)
+			goto out;
+
+		ret = jrnl_write_header(&jd);
+		if (ret)
+			goto out;
+
+		ret = jrnl_write_data(&jd);
+		if (ret)
+			goto out;
+
+		ret = jrnl_write_end_mark(&jd);
+		if (ret)
+			goto out;
+
 		ret = pwrite64(fd, req->data, hdr->data_length, hdr->offset);
 		if (ret != hdr->data_length) {
 			if (errno == ENOSPC)
@@ -1786,6 +1813,27 @@ static int init_mnt_path(const char *base_path)
 	return 0;
 }

+#define JRNL_PATH "/journal/"
+
+static int  jrnl_init_path(const char *base_path)
+{
+	int new, ret;
+
+	/* Create journal directory */
+	jrnl_path = zalloc(strlen(base_path) + strlen(JRNL_PATH) + 1);
+	sprintf(jrnl_path, "%s" JRNL_PATH, base_path);
+
+	ret = init_path(jrnl_path, &new);
+        /* Error during directory creation */
+	if (ret)
+		return ret;
+	/* If journal is newly created */
+	if (new)
+		return 0;
+
+	return 0;
+}
+
 int init_store(const char *d)
 {
 	int ret;
@@ -1806,6 +1854,10 @@ int init_store(const char *d)
 	if (ret)
 		return ret;

+	ret = jrnl_init_path(d);
+	if (ret)
+		return ret;
+
 	return ret;
 }

@@ -1838,3 +1890,205 @@ int get_global_nr_copies(uint32_t *copies)
 {
 	return attr(epoch_path, ANAME_COPIES, copies, sizeof(*copies), 0);
 }
+
+int  jrnl_exists(jrnl_file_desc_t  *jfd)
+{
+	int ret;
+	char path[1024];
+	struct stat s;
+
+	snprintf(path, sizeof(path), "%s%08u/%016" PRIx64, jrnl_path,
jfd->jf_epoch, jfd->jf_oid);
+
+	ret = stat(path, &s);
+	if (ret)
+		return 1;
+
+	return 0;
+}
+
+int  jrnl_open(jrnl_file_desc_t  *jfd, int  aflags)
+{
+	char path[1024];
+	int flags = aflags;
+	int fd, ret;
+
+
+	snprintf(path, sizeof(path), "%s%08u/%016" PRIx64, jrnl_path,
jfd->jf_epoch, jfd->jf_oid);
+
+	fd = open(path, flags, def_fmode);
+	if (fd < 0) {
+		eprintf("failed to open %s, %s\n", path, strerror(errno));
+		if (errno == ENOENT)
+			ret = SD_RES_NO_OBJ;
+		else
+			ret = SD_RES_UNKNOWN;
+	} else {
+		jfd->jf_fd = fd;
+		ret = SD_RES_SUCCESS;
+	}
+
+	return ret;
+}
+
+int  jrnl_create(jrnl_file_desc_t  *jfd)
+{
+	return jrnl_open(jfd, O_RDWR | O_CREAT);
+}
+
+int  jrnl_has_end_mark(jrnl_desc_t *jd)
+{
+	ssize_t  ret;
+	end_mark_t  end_mark = 0;
+
+	ret = pread64(jd->jdf_fd, &end_mark, sizeof(end_mark),
+	               sizeof(jd->jd_head) + jd->jdh_size);
+
+	return (end_mark == 1? 1: 0);
+}
+
+int  jrnl_write_header(jrnl_desc_t  *jd)
+{
+	ssize_t  ret;
+
+	ret = pwrite64(jd->jdf_fd, &jd->jd_head, sizeof(jd->jd_head), 0);
+
+	if (ret != sizeof(jd->jd_head)) {
+		if (errno == ENOSPC)
+			ret = SD_RES_NO_SPACE;
+		else
+			ret = SD_RES_EIO;
+	} else
+		ret = SD_RES_SUCCESS;
+
+	return ret;
+}
+
+int  jrnl_write_data(jrnl_desc_t  *jd)
+{
+	ssize_t  ret;
+
+	ret = pwrite64(jd->jdf_fd, jd->jd_data, jd->jdh_size, sizeof(jd->jd_head));
+
+	if (ret != jd->jdh_size) {
+		if (errno == ENOSPC)
+			ret = SD_RES_NO_SPACE;
+		else
+			ret = SD_RES_EIO;
+	} else
+		ret = SD_RES_SUCCESS;
+
+	return ret;
+}
+
+int  jrnl_write_end_mark(jrnl_desc_t  *jd)
+{
+	ssize_t  ret;
+	end_mark_t  end_mark = 1;
+
+	ret = pwrite64(jd->jdf_fd, &end_mark, sizeof(end_mark),
+	               sizeof(jd->jd_head) + jd->jdh_size);
+
+	if (ret != sizeof(end_mark)) {
+		if (errno == ENOSPC)
+			ret = SD_RES_NO_SPACE;
+		else
+			ret = SD_RES_EIO;
+	} else
+		ret = SD_RES_SUCCESS;
+
+	return ret;
+}
+
+int  jrnl_remove(jrnl_file_desc_t *jfd)
+{
+	char path[1024];
+	int ret;
+
+	snprintf(path, sizeof(path), "%s%08u/%016" PRIx64, jrnl_path,
jfd->jf_epoch, jfd->jf_oid);
+	ret = unlink(path);
+	if (ret) {
+		eprintf("failed to remove %s, %s\n", path, strerror(errno));
+		ret = SD_RES_EIO;
+	} else
+		ret = SD_RES_SUCCESS;
+
+	return ret;
+}
+
+int  jrnl_apply_to_targe_vdi_object(jrnl_file_desc_t *jfd)
+{
+	char path[1024], *buf;
+	int buf_len, res = 0, ret;
+	ssize_t retsize;
+	jrnl_head_t jh;
+
+	/* FIXME: handle larger size */
+	buf_len = (1 << 22);
+	buf = zalloc(buf_len);
+	if (!buf) {
+		eprintf("failed to allocate memory\n");
+		res = SD_RES_NO_MEM;
+		goto out;
+	}
+
+	/* Open journal */
+	snprintf(path, sizeof(path), "%s%08u/%016" PRIx64, jrnl_path,
jfd->jf_epoch, jfd->jf_oid);
+	ret = jrnl_open(jfd, O_RDONLY);
+	if (ret) {
+		res = SD_RES_EIO;
+		goto freemem;
+	}
+
+	/* Flush out journal to disk (vdi object) */
+	retsize = pread64(jfd->jf_fd, &jh, sizeof(jh), 0);
+	retsize = pread64(jfd->jf_fd, buf, jh.jh_size, sizeof(jh));
+	retsize = pwrite64(jfd->jf_vdi_fd, buf, jh.jh_size, jh.jh_offset);
+
+	/* Clean up */
+	close(jfd->jf_fd);
+	jfd->jf_fd = 0;
+freemem:
+	free(buf);
+out:
+	return res;
+}
+
+int  jrnl_writte_data_to_targe_vdi_object(jrnl_file_desc_t *jfd, void *data)
+{
+	//
+	// FIXME: Need to implement this function
+	//
+
+	return 0;
+}
+
+int  jrnl_recover_vdis()
+{
+	DIR *dir;
+	struct dirent *d;
+	char jrnl_dir[1024], jrnl_file_path[1024];
+	int  epoch;
+
+	epoch = get_latest_epoch();
+	if (epoch < 0) {
+		return 1;
+	}
+	snprintf(jrnl_dir, sizeof(jrnl_dir), "%s%08u", jrnl_path, epoch);
+
+	dir = opendir(jrnl_dir);
+	if (!dir)
+		return -1;
+
+	/* TODO: Not complete as error handling to be added */
+	while ((d = readdir(dir))) {
+		jrnl_file_desc_t  jfd;
+		jfd.jf_oid = atoi(d->d_name);
+		snprintf(jrnl_file_path, sizeof(jrnl_file_path), "%s%016" PRIx64,
jrnl_dir, jfd.jf_oid);
+		jrnl_open(&jfd, O_RDONLY);
+		jrnl_apply_to_targe_vdi_object(&jfd);
+	}
+	closedir(dir);
+
+	return 0;
+}
+



More information about the sheepdog mailing list