[sheepdog] [PATCH v3 4/9] sheep: decrement generational reference count on copy-on-write
Liu Yuan
namei.unix at gmail.com
Fri Feb 28 04:13:06 CET 2014
On Thu, Feb 27, 2014 at 10:36:19PM +0900, Hitoshi Mitake wrote:
> At Thu, 27 Feb 2014 20:55:32 +0800,
> Liu Yuan wrote:
> >
> > On Thu, Feb 27, 2014 at 09:23:27PM +0900, Hitoshi Mitake wrote:
> > > At Thu, 27 Feb 2014 17:58:37 +0800,
> > > Liu Yuan wrote:
> > > >
> > > > On Sun, Feb 23, 2014 at 02:28:23PM +0900, Hitoshi Mitake wrote:
> > > > > This decrements a reference count of the old data object when
> > > > > allocating a new data object on CoW.
> > > > >
> > > > > Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> > > > > Cc: Valerio Pachera <sirio81 at gmail.com>
> > > > > Cc: Alessandro Bolgia <alessandro at extensys.it>
> > > > > Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> > > > > ---
> > > > > sheep/gateway.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > > > > 1 file changed, 106 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/sheep/gateway.c b/sheep/gateway.c
> > > > > index bfd3912..5a3d333 100644
> > > > > --- a/sheep/gateway.c
> > > > > +++ b/sheep/gateway.c
> > > > > @@ -571,6 +571,85 @@ out:
> > > > > return err_ret;
> > > > > }
> > > > >
> > > > > +static int prepare_obj_refcnt(const struct sd_req *hdr, uint32_t *vids,
> > > > > + struct generation_reference *refs)
> > > > > +{
> > > > > + int ret;
> > > > > + size_t nr_vids = hdr->data_length / sizeof(*vids);
> > > > > + uint64_t offset;
> > > > > + int start;
> > > > > +
> > > > > + offset = hdr->obj.offset - offsetof(struct sd_inode, data_vdi_id);
> > > > > + start = offset / sizeof(*vids);
> > > > > +
> > > > > + ret = sd_read_object(hdr->obj.oid, (char *)vids,
> > > > > + nr_vids * sizeof(vids[0]),
> > > > > + offsetof(struct sd_inode, data_vdi_id[start]));
> > > > > + if (ret != SD_RES_SUCCESS) {
> > > > > + sd_err("failed to read vdi, %" PRIx64, hdr->obj.oid);
> > > > > + return ret;
> > > > > + }
> > > > > +
> > > > > + ret = sd_read_object(hdr->obj.oid, (char *)refs,
> > > > > + nr_vids * sizeof(refs[0]),
> > > > > + offsetof(struct sd_inode, data_ref[start]));
> > > > > + if (ret != SD_RES_SUCCESS) {
> > > > > + sd_err("failed to read vdi, %" PRIx64, hdr->obj.oid);
> > > > > + return ret;
> > > > > + }
> > > > > +
> > > > > + return ret;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * This function decreases a refcnt of vid_to_data_oid(old_vid, idx) and
> > > > > + * increases one of vid_to_data_oid(new_vid, idx)
> > > > > + */
> > > > > +static int update_obj_refcnt(const struct sd_req *hdr, uint32_t *vids,
> > > > > + uint32_t *new_vids,
> > > > > + struct generation_reference *refs)
> > > > > +{
> > > > > + int i, start, ret = SD_RES_SUCCESS;
> > > > > + size_t nr_vids = hdr->data_length / sizeof(*vids);
> > > > > + uint64_t offset;
> > > > > +
> > > > > + offset = hdr->obj.offset - offsetof(struct sd_inode, data_vdi_id);
> > > > > + start = offset / sizeof(*vids);
> > > > > +
> > > > > + for (i = 0; i < nr_vids; i++) {
> > > > > + if (vids[i] == 0 || vids[i] == new_vids[i])
> > > > > + continue;
> > > > > +
> > > > > + ret = sd_dec_object_refcnt(vid_to_data_oid(vids[i], i + start),
> > > > > + refs[i].generation, refs[i].count);
> > > > > + if (ret != SD_RES_SUCCESS)
> > > > > + sd_err("fail, %d", ret);
> > > > > +
> > > > > + refs[i].generation = 0;
> > > > > + refs[i].count = 0;
> > > > > + }
> > > > > +
> > > > > + return sd_write_object(hdr->obj.oid, (char *)refs,
> > > > > + nr_vids * sizeof(*refs),
> > > > > + offsetof(struct sd_inode,
> > > > > + data_ref) + start * sizeof(*refs),
> > > > > + false);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * return true if the request updates a data_vdi_id field of a vdi object
> > > > > + *
> > > > > + * XXX: we assume that VMs don't update the inode header and the data_vdi_id
> > > > > + * field at the same time.
> > > > > + */
> > > > > +static bool is_data_vid_update(const struct sd_req *hdr)
> > > > > +{
> > > > > + return is_vdi_obj(hdr->obj.oid) &&
> > > > > + SD_INODE_HEADER_SIZE <= hdr->obj.offset &&
> > > > > + hdr->obj.offset + hdr->data_length <=
> > > > > + offsetof(struct sd_inode, data_ref);
> > > > > +}
> > > > > +
> > > > > int gateway_read_obj(struct request *req)
> > > > > {
> > > > > uint64_t oid = req->rq.obj.oid;
> > > > > @@ -587,6 +666,10 @@ int gateway_read_obj(struct request *req)
> > > > > int gateway_write_obj(struct request *req)
> > > > > {
> > > > > uint64_t oid = req->rq.obj.oid;
> > > > > + int ret;
> > > > > + struct sd_req *hdr = &req->rq;
> > > > > + uint32_t *vids = NULL, *new_vids = req->data;
> > > > > + struct generation_reference *refs = NULL;
> > > > >
> > > > > if (oid_is_readonly(oid))
> > > > > return SD_RES_READONLY;
> > > > > @@ -594,7 +677,29 @@ int gateway_write_obj(struct request *req)
> > > > > if (!bypass_object_cache(req))
> > > > > return object_cache_handle_request(req);
> > > > >
> > > > > - return gateway_forward_request(req);
> > > > > + if (is_data_vid_update(hdr)) {
> > > > > + size_t nr_vids = hdr->data_length / sizeof(*vids);
> > > > > +
> > > > > + /* read the previous vids to discard their references later */
> > > > > + vids = xzalloc(sizeof(*vids) * nr_vids);
> > > > > + refs = xzalloc(sizeof(*refs) * nr_vids);
> > > > > + ret = prepare_obj_refcnt(hdr, vids, refs);
> > > > > + if (ret != SD_RES_SUCCESS)
> > > > > + goto out;
> > > > > + }
> > > > > +
> > > >
> > > > Does this mean that even hyper volume, which doesn't make use of the
> > > > generational reference algorithm, also gets negatively affected on writes?
> > > >
> > > > Will non-snapshot users (http and nfs) be affected too? I am wondering
> > > > if we can skip the ref stuff for non-snapshot vdis completely?
> > >
> > > The above dereference of objects is equivalent to a simple removal for
> > > non-snapshot users (hypervolume, http, nfs). There is no side effect.
> > >
> >
> > I think at least create_and_write will be affected. Non-snapshots don't need to
> > call prepare_obj_refcnt(), no?
>
> For http, nfs and current hypervolume, yes. But we don't have a way to
> detect a type of vdi via its inode object. So currently
> prepare_obj_refcnt() cannot be avoided.
>
> This patchset removes the child vids array of sd_inode, so we can
> recycle the field for storing types of inodes.
Wouldn't vid_is_snapshot be good enough to tell non-snapshots from snapshots?
Thanks
Yuan
More information about the sheepdog
mailing list