[sheepdog] [PATCH 2/2] add sha1_from_buffer()

Liu Yuan namei.unix at gmail.com
Fri Jul 19 07:39:43 CEST 2013


On Fri, Jul 19, 2013 at 02:33:08PM +0900, MORITA Kazutaka wrote:
> At Fri, 19 Jul 2013 13:24:48 +0800,
> Liu Yuan wrote:
> > 
> > On Fri, Jul 19, 2013 at 02:21:14PM +0900, MORITA Kazutaka wrote:
> > > At Fri, 19 Jul 2013 13:07:08 +0800,
> > > Liu Yuan wrote:
> > > > 
> > > > On Fri, Jul 19, 2013 at 01:16:57PM +0900, MORITA Kazutaka wrote:
> > > > > From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> > > > > 
> > > > > This adds a helper function to calculate a unique sha1 digest based on
> > > > > the given buffer.
> > > > > 
> > > > > This also fixes a bug that we don't use an original buffer length to
> > > > > calculate the hash value and the result is not unique.
> > > > > 
> > > > > Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> > > > > ---
> > > > >  collie/farm/sha1_file.c |   22 ++--------------------
> > > > >  include/sha1.h          |    1 +
> > > > >  lib/sha1.c              |   28 ++++++++++++++++++++++++++--
> > > > >  sheep/plain_store.c     |   10 +---------
> > > > >  4 files changed, 30 insertions(+), 31 deletions(-)
> > > > > 
> > > > > diff --git a/collie/farm/sha1_file.c b/collie/farm/sha1_file.c
> > > > > index a2f3561..d82de58 100644
> > > > > --- a/collie/farm/sha1_file.c
> > > > > +++ b/collie/farm/sha1_file.c
> > > > > @@ -28,24 +28,6 @@
> > > > >  #include "farm.h"
> > > > >  #include "util.h"
> > > > >  
> > > > > -static void get_sha1(unsigned char *buf, unsigned len, unsigned char *sha1)
> > > > > -{
> > > > > -	struct sha1_ctx c;
> > > > > -	uint64_t offset = 0;
> > > > > -	uint32_t length = len;
> > > > > -	void *tmp = valloc(length);
> > > > > -
> > > > > -	memcpy(tmp, buf, len);
> > > > > -	trim_zero_blocks(tmp, &offset, &length);
> > > > > -
> > > > > -	sha1_init(&c);
> > > > > -	sha1_update(&c, (uint8_t *)&offset, sizeof(offset));
> > > > > -	sha1_update(&c, (uint8_t *)&length, sizeof(length));
> > > > > -	sha1_update(&c, tmp, length);
> > > > > -	sha1_final(&c, sha1);
> > > > > -	free(tmp);
> > > > > -}
> > > > > -
> > > > >  static void fill_sha1_path(char *pathbuf, const unsigned char *sha1)
> > > > >  {
> > > > >  	int i;
> > > > > @@ -159,7 +141,7 @@ int sha1_file_write(void *buf, size_t len, unsigned char *outsha1)
> > > > >  {
> > > > >  	unsigned char sha1[SHA1_DIGEST_SIZE];
> > > > >  
> > > > > -	get_sha1(buf, len, sha1);
> > > > > +	sha1_from_buffer(buf, len, sha1);
> > > > >  	if (sha1_buffer_write(sha1, buf, len) < 0)
> > > > >  		return -1;
> > > > >  	if (outsha1)
> > > > > @@ -172,7 +154,7 @@ static int verify_sha1_file(const unsigned char *sha1,
> > > > >  {
> > > > >  	unsigned char tmp[SHA1_DIGEST_SIZE];
> > > > >  
> > > > > -	get_sha1(buf, len, tmp);
> > > > > +	sha1_from_buffer(buf, len, tmp);
> > > > >  	if (memcmp((char *)tmp, (char *)sha1, SHA1_DIGEST_SIZE) != 0) {
> > > > >  		fprintf(stderr, "failed, %s != %s\n", sha1_to_hex(sha1),
> > > > >  			sha1_to_hex(tmp));
> > > > > diff --git a/include/sha1.h b/include/sha1.h
> > > > > index dd1b4f4..a778aea 100644
> > > > > --- a/include/sha1.h
> > > > > +++ b/include/sha1.h
> > > > > @@ -27,5 +27,6 @@ void sha1_init(void *ctx);
> > > > >  void sha1_update(void *ctx, const uint8_t *data, unsigned int len);
> > > > >  void sha1_final(void *ctx, uint8_t *out);
> > > > >  const char *sha1_to_hex(const unsigned char *sha1);
> > > > > +void sha1_from_buffer(const void *buf, size_t size, unsigned char *sha1);
> > > > >  
> > > > >  #endif
> > > > > diff --git a/lib/sha1.c b/lib/sha1.c
> > > > > index c1ada09..34d29b8 100644
> > > > > --- a/lib/sha1.c
> > > > > +++ b/lib/sha1.c
> > > > > @@ -19,6 +19,7 @@
> > > > >   */
> > > > >  #include <arpa/inet.h>
> > > > >  #include "sha1.h"
> > > > > +#include "util.h"
> > > > >  
> > > > >  #define SHA1_DIGEST_SIZE	20
> > > > >  #define SHA1_HMAC_BLOCK_SIZE	64
> > > > > @@ -129,13 +130,13 @@ static void sha1_transform(uint32_t *state, const uint8_t *in)
> > > > >  void sha1_init(void *ctx)
> > > > >  {
> > > > >  	struct sha1_ctx *sctx = ctx;
> > > > > -	static const struct sha1_ctx initstate = {
> > > > > +	static const struct sha1_ctx init_state = {
> > > > >  	  0,
> > > > >  	  { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 },
> > > > >  	  { 0, }
> > > > >  	};
> > > > >  
> > > > > -	*sctx = initstate;
> > > > > +	*sctx = init_state;
> > > > >  }
> > > > >  
> > > > >  void sha1_update(void *ctx, const uint8_t *data, unsigned int len)
> > > > > @@ -212,3 +213,26 @@ const char *sha1_to_hex(const unsigned char *sha1)
> > > > >  	}
> > > > >  	return buffer;
> > > > >  }
> > > > > +
> > > > > +/*
> > > > > + * Calculate a sha1 message digest based on the content of 'buf'
> > > > > + *
> > > > > + * This calculates a unique sha1 digest faster than the naive calculation when
> > > > > + * the content of 'buf' is sparse.  The result will be set in 'sha1'.
> > > > > + */
> > > > > +void sha1_from_buffer(const void *buf, size_t size, unsigned char *sha1)
> > > > > +{
> > > > > +	struct sha1_ctx c;
> > > > > +	uint64_t offset = 0;
> > > > > +	uint32_t length = size;
> > > > > +
> > > > > +	sha1_init(&c);
> > > > > +	sha1_update(&c, (uint8_t *)&length, sizeof(length));
> > > > > +
> > > > > +	find_zero_blocks(buf, &offset, &length);
> > > > > +
> > > > > +	sha1_update(&c, (uint8_t *)&length, sizeof(length));
> > > > > +	sha1_update(&c, (uint8_t *)&offset, sizeof(offset));
> > > > > +	sha1_update(&c, buf, length);
> > > > > +	sha1_final(&c, sha1);
> > > > > +}
> > > > 
> > > > Why this is faster? sha1_update(&c, buf, length) will try to hash the original
> > > > buf besides double hashing of length & offset, no?
> > > 
> > > Oops, the line should be
> > > 
> > >   sha1_update(&c, buf + offset, length);
> > > 
> > > I'll send the updated version.
> > 
> > Any reason to sha1 length twice?
> 
> The first lenth is the original buffer length, and the second one is
> the trimmed buffer length.  The previous code doesn't use the original
> buffer length, and the following example leads to the same sha1 value.
> 
>  A:
>  [before trim]
>   buf: "0x00001100"
>  [after trim]
>   offset: 2, length: 1, buf: "0x11"
> 
>  B:
>  [before trim]
>   buf: "0x00001100000000"
>  [after trim]
>   offset: 2, length: 1, buf: "0x11"
> 
> 
> We can uniquely generate the original buffer from
>  - the trimmed buffer
>  - the orignal buffer length
>  - the trimmed buffer length
>  - the trimmed buffer offset
> so, generating hash values based on these values is necessary and
> sufficient.

I think you should include this in the source to testify the correctness of the
unique hash of your approach. It would help anyone later to judge it.

Thanks
Yuan



More information about the sheepdog mailing list