[sheepdog] [PATCH 2/2] add sha1_from_buffer()

Fri Jul 19 07:39:43 CEST 2013

On Fri, Jul 19, 2013 at 02:33:08PM +0900, MORITA Kazutaka wrote:
> At Fri, 19 Jul 2013 13:24:48 +0800,
> Liu Yuan wrote:
> > 
> > On Fri, Jul 19, 2013 at 02:21:14PM +0900, MORITA Kazutaka wrote:
> > > At Fri, 19 Jul 2013 13:07:08 +0800,
> > > Liu Yuan wrote:
> > > > 
> > > > On Fri, Jul 19, 2013 at 01:16:57PM +0900, MORITA Kazutaka wrote:
> > > > > From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> > > > > 
> > > > > This adds a helper function to calculate a unique sha1 digest based on
> > > > > the given buffer.
> > > > > 
> > > > > This also fixes a bug that we don't use an original buffer length to
> > > > > calculate the hash value and the result is not unique.
> > > > > 
> > > > > Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> > > > > ---
> > > > >  collie/farm/sha1_file.c |   22 ++--------------------
> > > > >  include/sha1.h          |    1 +
> > > > >  lib/sha1.c              |   28 ++++++++++++++++++++++++++--
> > > > >  sheep/plain_store.c     |   10 +---------
> > > > >  4 files changed, 30 insertions(+), 31 deletions(-)
> > > > > 
> > > > > diff --git a/collie/farm/sha1_file.c b/collie/farm/sha1_file.c
> > > > > index a2f3561..d82de58 100644
> > > > > --- a/collie/farm/sha1_file.c
> > > > > +++ b/collie/farm/sha1_file.c
> > > > > @@ -28,24 +28,6 @@
> > > > >  #include "farm.h"
> > > > >  #include "util.h"
> > > > >  
> > > > > -static void get_sha1(unsigned char *buf, unsigned len, unsigned char *sha1)
> > > > > -{
> > > > > -	struct sha1_ctx c;
> > > > > -	uint64_t offset = 0;
> > > > > -	uint32_t length = len;
> > > > > -	void *tmp = valloc(length);
> > > > > -
> > > > > -	memcpy(tmp, buf, len);
> > > > > -	trim_zero_blocks(tmp, &offset, &length);
> > > > > -
> > > > > -	sha1_init(&c);
> > > > > -	sha1_update(&c, (uint8_t *)&offset, sizeof(offset));
> > > > > -	sha1_update(&c, (uint8_t *)&length, sizeof(length));
> > > > > -	sha1_update(&c, tmp, length);
> > > > > -	sha1_final(&c, sha1);
> > > > > -	free(tmp);
> > > > > -}
> > > > > -
> > > > >  static void fill_sha1_path(char *pathbuf, const unsigned char *sha1)
> > > > >  {
> > > > >  	int i;
> > > > > @@ -159,7 +141,7 @@ int sha1_file_write(void *buf, size_t len, unsigned char *outsha1)
> > > > >  {
> > > > >  	unsigned char sha1[SHA1_DIGEST_SIZE];
> > > > >  
> > > > > -	get_sha1(buf, len, sha1);
> > > > > +	sha1_from_buffer(buf, len, sha1);
> > > > >  	if (sha1_buffer_write(sha1, buf, len) < 0)
> > > > >  		return -1;
> > > > >  	if (outsha1)
> > > > > @@ -172,7 +154,7 @@ static int verify_sha1_file(const unsigned char *sha1,
> > > > >  {
> > > > >  	unsigned char tmp[SHA1_DIGEST_SIZE];
> > > > >  
> > > > > -	get_sha1(buf, len, tmp);
> > > > > +	sha1_from_buffer(buf, len, tmp);
> > > > >  	if (memcmp((char *)tmp, (char *)sha1, SHA1_DIGEST_SIZE) != 0) {
> > > > >  		fprintf(stderr, "failed, %s != %s\n", sha1_to_hex(sha1),
> > > > >  			sha1_to_hex(tmp));
> > > > > diff --git a/include/sha1.h b/include/sha1.h
> > > > > index dd1b4f4..a778aea 100644
> > > > > --- a/include/sha1.h
> > > > > +++ b/include/sha1.h
> > > > > @@ -27,5 +27,6 @@ void sha1_init(void *ctx);
> > > > >  void sha1_update(void *ctx, const uint8_t *data, unsigned int len);
> > > > >  void sha1_final(void *ctx, uint8_t *out);
> > > > >  const char *sha1_to_hex(const unsigned char *sha1);
> > > > > +void sha1_from_buffer(const void *buf, size_t size, unsigned char *sha1);
> > > > >  
> > > > >  #endif
> > > > > diff --git a/lib/sha1.c b/lib/sha1.c
> > > > > index c1ada09..34d29b8 100644
> > > > > --- a/lib/sha1.c
> > > > > +++ b/lib/sha1.c
> > > > > @@ -19,6 +19,7 @@
> > > > >   */
> > > > >  #include <arpa/inet.h>
> > > > >  #include "sha1.h"
> > > > > +#include "util.h"
> > > > >  
> > > > >  #define SHA1_DIGEST_SIZE	20
> > > > >  #define SHA1_HMAC_BLOCK_SIZE	64
> > > > > @@ -129,13 +130,13 @@ static void sha1_transform(uint32_t *state, const uint8_t *in)
> > > > >  void sha1_init(void *ctx)
> > > > >  {
> > > > >  	struct sha1_ctx *sctx = ctx;
> > > > > -	static const struct sha1_ctx initstate = {
> > > > > +	static const struct sha1_ctx init_state = {
> > > > >  	  0,
> > > > >  	  { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 },
> > > > >  	  { 0, }
> > > > >  	};
> > > > >  
> > > > > -	*sctx = initstate;
> > > > > +	*sctx = init_state;
> > > > >  }
> > > > >  
> > > > >  void sha1_update(void *ctx, const uint8_t *data, unsigned int len)
> > > > > @@ -212,3 +213,26 @@ const char *sha1_to_hex(const unsigned char *sha1)
> > > > >  	}
> > > > >  	return buffer;
> > > > >  }
> > > > > +
> > > > > +/*
> > > > > + * Calculate a sha1 message digest based on the content of 'buf'
> > > > > + *
> > > > > + * This calculates a unique sha1 digest faster than the naive calculation when
> > > > > + * the content of 'buf' is sparse.  The result will be set in 'sha1'.
> > > > > + */
> > > > > +void sha1_from_buffer(const void *buf, size_t size, unsigned char *sha1)
> > > > > +{
> > > > > +	struct sha1_ctx c;
> > > > > +	uint64_t offset = 0;
> > > > > +	uint32_t length = size;
> > > > > +
> > > > > +	sha1_init(&c);
> > > > > +	sha1_update(&c, (uint8_t *)&length, sizeof(length));
> > > > > +
> > > > > +	find_zero_blocks(buf, &offset, &length);
> > > > > +
> > > > > +	sha1_update(&c, (uint8_t *)&length, sizeof(length));
> > > > > +	sha1_update(&c, (uint8_t *)&offset, sizeof(offset));
> > > > > +	sha1_update(&c, buf, length);
> > > > > +	sha1_final(&c, sha1);
> > > > > +}
> > > > 
> > > > Why this is faster? sha1_update(&c, buf, length) will try to hash the original
> > > > buf besides double hashing of length & offset, no?
> > > 
> > > Oops, the line should be
> > > 
> > >   sha1_update(&c, buf + offset, length);
> > > 
> > > I'll send the updated version.
> > 
> > Any reason to sha1 length twice?
> 
> The first lenth is the original buffer length, and the second one is
> the trimmed buffer length.  The previous code doesn't use the original
> buffer length, and the following example leads to the same sha1 value.
> 
>  A:
>  [before trim]
>   buf: "0x00001100"
>  [after trim]
>   offset: 2, length: 1, buf: "0x11"
> 
>  B:
>  [before trim]
>   buf: "0x00001100000000"
>  [after trim]
>   offset: 2, length: 1, buf: "0x11"
> 
> 
> We can uniquely generate the original buffer from
>  - the trimmed buffer
>  - the orignal buffer length
>  - the trimmed buffer length
>  - the trimmed buffer offset
> so, generating hash values based on these values is necessary and
> sufficient.

I think you should include this in the source to testify the correctness of the
unique hash of your approach. It would help anyone later to judge it.

Thanks
Yuan