[sheepdog] [PATCH] sheep/http: fix parallely append-upload problem
Robin Dong
robin.k.dong at gmail.com
Fri Sep 19 07:31:33 CEST 2014
From: Robin Dong <sanbai at taobao.com>
The 'append' opreation in sheepdog contains:
1. lock(data_vid)
2. read onode
3. allocate space and set o_extent.data_len to zero
4. write onode down
5. unlock(data_vid)
6. read uploading data and set o_extent.data_len in terms of req->data_length
7. write onode down
We unlock(data_vid) too early to protect the updating of 'o_extent.data_len'
so one process may overwrite the space allocated by another process.
To solve this problem, we move set operation of 'data_len' into lock region
and modify the code to write data_vid.
Signed-off-by: Robin Dong <sanbai at taobao.com>
---
sheep/http/kv.c | 106 ++++++++++++++++++++++++++++++++++----------------------
sheep/md.c | 13 ++++---
2 files changed, 72 insertions(+), 47 deletions(-)
diff --git a/sheep/http/kv.c b/sheep/http/kv.c
index 04ee462..4611e05 100644
--- a/sheep/http/kv.c
+++ b/sheep/http/kv.c
@@ -727,8 +727,11 @@ static int onode_allocate_extents(struct kv_onode *onode,
* if we can put whole request data into extra space of last
* o_extent, it don't need to allocate new extent.
*/
- if (req->data_length <= reserv_len)
+ if (req->data_length <= reserv_len) {
+ onode->o_extent[idx - 1].data_len += req->data_length;
goto out;
+ } else
+ onode->o_extent[idx - 1].data_len += reserv_len;
}
count = DIV_ROUND_UP((req->data_length - reserv_len), SD_DATA_OBJ_SIZE);
sys->cdrv->lock(data_vid);
@@ -751,52 +754,22 @@ static int onode_allocate_extents(struct kv_onode *onode,
onode->o_extent[idx].start = start;
onode->o_extent[idx].count = count;
- onode->o_extent[idx].data_len = 0;
+ onode->o_extent[idx].data_len = req->data_length - reserv_len;
onode->nr_extent++;
out:
return ret;
}
-static int onode_populate_extents(struct kv_onode *onode,
- struct http_request *req)
+static int do_vdi_write(struct http_request *req, uint32_t data_vid,
+ uint64_t offset, uint64_t total, char *data_buf,
+ bool create)
{
- struct onode_extent *ext;
- struct onode_extent *last_ext = onode->o_extent + onode->nr_extent - 1;
- ssize_t size;
- uint64_t done = 0, total, offset = 0, reserv_len;
- uint64_t write_buffer_size = MIN(kv_rw_buffer, req->data_length);
+ uint64_t done = 0, size;
int ret = SD_RES_SUCCESS;
- char *data_buf = NULL;
- uint32_t data_vid = onode->data_vid;
- bool create = true;
-
- data_buf = xmalloc(write_buffer_size);
- if (last_ext->data_len == 0 && onode->nr_extent == 1) {
- offset = last_ext->start * SD_DATA_OBJ_SIZE +
- last_ext->data_len;
- last_ext->data_len += req->data_length;
- } else if (last_ext->data_len > 0) {
- offset = last_ext->start * SD_DATA_OBJ_SIZE +
- last_ext->data_len;
- last_ext->data_len += req->data_length;
- create = false;
- } else {
- ext = last_ext - 1;
- reserv_len = ext->count * SD_DATA_OBJ_SIZE - ext->data_len;
- offset = ext->start * SD_DATA_OBJ_SIZE + ext->data_len;
- ext->data_len += reserv_len;
- last_ext->data_len = req->data_length - reserv_len;
- /*
- * if the previous oid has extra space, we don't need
- * to use SD_OP_CREATE_AND_WRITE_OBJ on this oid.
- */
- if (reserv_len > 0)
- create = false;
- }
- total = req->data_length;
while (done < total) {
- size = http_request_read(req, data_buf, write_buffer_size);
+ size = http_request_read(req, data_buf,
+ MIN(kv_rw_buffer, total - done));
if (size <= 0) {
sd_err("Failed to read http request: %ld", size);
ret = SD_RES_EIO;
@@ -804,17 +777,66 @@ static int onode_populate_extents(struct kv_onode *onode,
}
ret = vdi_read_write(data_vid, data_buf, size, offset,
false, create);
- sd_debug("vdi_write size: %"PRIu64", offset: %"
- PRIu64", ret:%d", size, offset, ret);
+ sd_debug("vdi_write offset: %"PRIu64", size: %" PRIu64
+ ", for %" PRIx32 "ret: %d", offset, size,
+ data_vid, ret);
if (ret != SD_RES_SUCCESS) {
- sd_err("Failed to write data object for %s, %s",
- onode->name, sd_strerror(ret));
+ sd_err("Failed to write data object for %" PRIx32
+ ", %s", data_vid, sd_strerror(ret));
goto out;
}
done += size;
offset += size;
}
out:
+ return ret;
+}
+
+static int onode_populate_extents(struct kv_onode *onode,
+ struct http_request *req)
+{
+ struct onode_extent *ext;
+ struct onode_extent *last_ext = onode->o_extent + onode->nr_extent - 1;
+ uint64_t total, offset = 0, reserv_len;
+ uint64_t write_buffer_size = MIN(kv_rw_buffer, req->data_length);
+ int ret = SD_RES_SUCCESS;
+ char *data_buf = NULL;
+ uint32_t data_vid = onode->data_vid;
+ bool create = true;
+
+ data_buf = xmalloc(write_buffer_size);
+
+ if (last_ext->data_len < req->data_length) {
+ ext = last_ext - 1;
+ reserv_len = (req->data_length - last_ext->data_len);
+ offset = (ext->start + ext->count) * SD_DATA_OBJ_SIZE -
+ reserv_len;
+ ret = do_vdi_write(req, data_vid, offset, reserv_len,
+ data_buf, false);
+ if (ret != SD_RES_SUCCESS) {
+ sd_err("Failed to do_vdi_write data_vid: %" PRIx32
+ ", offset: %" PRIx64 ", total: %" PRIx64
+ ", ret: %s", data_vid, offset, reserv_len,
+ sd_strerror(ret));
+ goto out;
+ }
+ offset = last_ext->start * SD_DATA_OBJ_SIZE;
+ total = last_ext->data_len;
+ } else {
+ reserv_len = (last_ext->data_len - req->data_length);
+ offset = last_ext->start * SD_DATA_OBJ_SIZE + reserv_len;
+ total = req->data_length;
+ if (last_ext->data_len > req->data_length)
+ create = false;
+ }
+
+ ret = do_vdi_write(req, data_vid, offset, total, data_buf, create);
+ if (ret != SD_RES_SUCCESS)
+ sd_err("Failed to do_vdi_write data_vid: %" PRIx32
+ ", offset: %" PRIx64 ", total: %" PRIx64
+ ", ret: %s", data_vid, offset, total,
+ sd_strerror(ret));
+out:
free(data_buf);
return ret;
}
diff --git a/sheep/md.c b/sheep/md.c
index 378d1f1..0313b95 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -541,11 +541,14 @@ static void md_do_recover(struct work *work)
out:
sd_rw_unlock(&md.lock);
- if (disk)
- update_node_disks();
-
- if (nr > 0)
- kick_recover();
+ if (disk) {
+ if (nr > 0) {
+ update_node_disks();
+ kick_recover();
+ } else {
+ leave_cluster();
+ }
+ }
free(mw);
}
--
1.9.1
More information about the sheepdog
mailing list