[sheepdog] [PATCH v2 2/2] sha1: add x86 hardware acceleration for sha1

Liu Yuan namei.unix at gmail.com
Mon Sep 9 14:11:07 CEST 2013


This doesn't change the SHA-1 algorithm itself. The patch uses Intel's
SSSE3 and AVX instruction set extensions to accelerate SHA-1: the public
entry points become function pointers, and a constructor selects the
fastest implementation the host CPU supports at load time.
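
The caller-visible API is unchanged: sha1_init/sha1_update/sha1_final
keep their signatures but are now function pointers filled in before
main() runs. A minimal usage sketch (a plain 20-byte digest buffer is
assumed):

    struct sha1_ctx ctx;
    uint8_t digest[20];        /* 20-byte SHA-1 digest */

    sha1_init(&ctx);
    sha1_update(&ctx, (const uint8_t *)"abc", 3);
    sha1_final(&ctx, digest);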

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 configure.ac     |    2 +
 include/sha1.h   |   11 +-
 lib/Makefile.am  |    4 +
 lib/sha1.c       |  165 ++++++++++++++--
 lib/sha1_ssse3.S |  564 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 732 insertions(+), 14 deletions(-)
 create mode 100644 lib/sha1_ssse3.S
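
One quick way to check that the accelerated path still produces
standard digests is to run a FIPS 180-1 test vector through the public
helpers, e.g. (a sketch only; it assumes sha1.h is on the include path
and that sha1_to_hex() emits lowercase hex):

    #include <string.h>
    #include "sha1.h"

    int main(void)
    {
        unsigned char digest[20];

        /* SHA1("abc") from FIPS 180-1 */
        sha1_from_buffer("abc", 3, digest);
        return strcmp(sha1_to_hex(digest),
                      "a9993e364706816aba3e25717850c26c9cd0d89d") ? 1 : 0;
    }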

diff --git a/configure.ac b/configure.ac
index 2e37277..b169ae7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -195,6 +195,8 @@ PACKAGE_FEATURES=""
 LINT_FLAGS="-weak -unrecog +posixlib +ignoresigns -fcnuse \
 	-badflag -D__gnuc_va_list=va_list -D__attribute\(x\)="
 
+AM_CONDITIONAL(BUILD_SHA1_HW, [case $host in *x86_64*) true;; *) false;; esac])
+
 AC_ARG_ENABLE([fatal-warnings],
 	[  --enable-fatal-warnings : enable fatal warnings. ],
 	[ default="no" ])
diff --git a/include/sha1.h b/include/sha1.h
index a778aea..9ae70b5 100644
--- a/include/sha1.h
+++ b/include/sha1.h
@@ -23,9 +23,14 @@ struct sha1_ctx {
 	uint8_t buffer[SHA1_BLOCK_SIZE];
 };
 
-void sha1_init(void *ctx);
-void sha1_update(void *ctx, const uint8_t *data, unsigned int len);
-void sha1_final(void *ctx, uint8_t *out);
+typedef void (*sha1_init_func_t)(void *);
+typedef void (*sha1_update_func_t)(void *, const uint8_t *, unsigned int);
+typedef void (*sha1_final_func_t)(void *, uint8_t *);
+
+extern sha1_init_func_t sha1_init;
+extern sha1_update_func_t sha1_update;
+extern sha1_final_func_t sha1_final;
+
 const char *sha1_to_hex(const unsigned char *sha1);
 void sha1_from_buffer(const void *buf, size_t size, unsigned char *sha1);
 
diff --git a/lib/Makefile.am b/lib/Makefile.am
index 22edf6e..496c588 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -7,6 +7,10 @@ noinst_LIBRARIES	= libsheepdog.a
 libsheepdog_a_SOURCES	= event.c logger.c net.c util.c rbtree.c strbuf.c \
 			  sha1.c option.c work.c sockfd_cache.c
 
+if BUILD_SHA1_HW
+libsheepdog_a_SOURCES	+= sha1_ssse3.S
+endif
+
 # support for GNU Flymake
 check-syntax:
 	$(COMPILE) -fsyntax-only $(CHK_SOURCES)
diff --git a/lib/sha1.c b/lib/sha1.c
index 85db278..9da0518 100644
--- a/lib/sha1.c
+++ b/lib/sha1.c
@@ -11,6 +11,8 @@
  * Copyright (c) Andrew McDonald <andrew at mcdonald.org.uk>
  * Copyright (c) Jean-Francois Dive <jef at linuxbe.org>
  *
+ * Add x86 hardware acceleration by Liu Yuan <namei.unix at gmail.com>
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
  * Software Foundation; either version 2 of the License, or (at your option)
@@ -21,6 +23,16 @@
 #include "sha1.h"
 #include "util.h"
 
+#define SHA1_H0		0x67452301UL
+#define SHA1_H1		0xefcdab89UL
+#define SHA1_H2		0x98badcfeUL
+#define SHA1_H3		0x10325476UL
+#define SHA1_H4		0xc3d2e1f0UL
+
+sha1_init_func_t sha1_init;
+sha1_update_func_t sha1_update;
+sha1_final_func_t sha1_final;
+
 static __always_inline uint32_t rol(uint32_t value, uint32_t bits)
 {
 	return (value << bits) | (value >> (32 - bits));
@@ -124,19 +136,17 @@ static void sha1_transform(uint32_t *state, const uint8_t *in)
 	memset(block32, 0x00, sizeof block32);
 }
 
-void sha1_init(void *ctx)
+static void generic_sha1_init(void *ctx)
 {
 	struct sha1_ctx *sctx = ctx;
-	static const struct sha1_ctx init_state = {
-	  0,
-	  { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 },
-	  { 0, }
-	};
 
-	*sctx = init_state;
+	*sctx = (struct sha1_ctx){
+		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+	};
 }
 
-void sha1_update(void *ctx, const uint8_t *data, unsigned int len)
+static void generic_sha1_update(void *ctx, const uint8_t *data,
+				unsigned int len)
 {
 	struct sha1_ctx *sctx = ctx;
 	unsigned int i, j;
@@ -157,7 +167,7 @@ void sha1_update(void *ctx, const uint8_t *data, unsigned int len)
 
 
 /* Add padding and return the message digest. */
-void sha1_final(void *ctx, uint8_t *out)
+static void generic_sha1_final(void *ctx, uint8_t *out)
 {
 	struct sha1_ctx *sctx = ctx;
 	uint32_t i, j, idx, padlen;
@@ -178,10 +188,10 @@ void sha1_final(void *ctx, uint8_t *out)
 	/* Pad out to 56 mod 64 */
 	idx = (sctx->count >> 3) & 0x3f;
 	padlen = (idx < 56) ? (56 - idx) : ((64+56) - idx);
-	sha1_update(sctx, padding, padlen);
+	generic_sha1_update(sctx, padding, padlen);
 
 	/* Append length */
-	sha1_update(sctx, bits, sizeof bits);
+	generic_sha1_update(sctx, bits, sizeof bits);
 
 	/* Store state in digest */
 	for (i = j = 0; i < 5; i++, j += 4) {
@@ -196,6 +206,119 @@ void sha1_final(void *ctx, uint8_t *out)
 	memset(sctx, 0, sizeof *sctx);
 }
 
+#ifdef __x86_64__
+
+static asmlinkage void
+(*sha1_transform_asm)(uint32_t *, const uint8_t *, unsigned int);
+
+asmlinkage void sha1_transform_ssse3(uint32_t *, const uint8_t *, unsigned int);
+asmlinkage void sha1_transform_avx(uint32_t *, const uint8_t *, unsigned int);
+
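+/*
+ * Feed the assembly transform whole 64-byte blocks: first top up a
+ * previously buffered partial block, then hand all remaining complete
+ * blocks to sha1_transform_asm() in a single call, and buffer the tail
+ * for the next update.
+ */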
+static void do_ssse3_sha1_update(struct sha1_ctx *ctx, const uint8_t *data,
+				unsigned int len, unsigned int partial)
+{
+	struct sha1_ctx *sctx = ctx;
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA1_BLOCK_SIZE - partial;
+		memcpy(sctx->buffer + partial, data, done);
+		sha1_transform_asm(sctx->state, sctx->buffer, 1);
+	}
+
+	if (len - done >= SHA1_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
+
+		sha1_transform_asm(sctx->state, data + done, rounds);
+		done += rounds * SHA1_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buffer, data + done, len - done);
+
+	return;
+}
+
+static void ssse3_sha1_update(void *ctx, const uint8_t *data,
+			     unsigned int len)
+{
+	struct sha1_ctx *sctx = ctx;
+	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA1_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buffer + partial, data, len);
+
+		return;
+	}
+
+	do_ssse3_sha1_update(ctx, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static void ssse3_sha1_final(void *ctx, uint8_t *out)
+{
+	struct sha1_ctx *sctx = ctx;
+	unsigned int i, j, idx, padlen;
+	uint64_t t;
+	uint8_t bits[8] = { 0, };
+	static const uint8_t padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+	t = sctx->count << 3;
+	bits[7] = 0xff & t; t >>= 8;
+	bits[6] = 0xff & t; t >>= 8;
+	bits[5] = 0xff & t; t >>= 8;
+	bits[4] = 0xff & t; t >>= 8;
+	bits[3] = 0xff & t; t >>= 8;
+	bits[2] = 0xff & t; t >>= 8;
+	bits[1] = 0xff & t; t >>= 8;
+	bits[0] = 0xff & t;
+
+	/* Pad out to 56 mod 64 and append length */
+	idx = sctx->count % SHA1_BLOCK_SIZE;
+	padlen = (idx < 56) ? (56 - idx) : ((SHA1_BLOCK_SIZE+56) - idx);
+	/* We need to fill a whole block for do_ssse3_sha1_update() */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buffer + idx, padding, padlen);
+	} else {
+		do_ssse3_sha1_update(ctx, padding, padlen, idx);
+	}
+	do_ssse3_sha1_update(ctx, (const uint8_t *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = j = 0; i < 5; i++, j += 4) {
+		uint32_t t2 = sctx->state[i];
+		out[j+3] = t2 & 0xff; t2 >>= 8;
+		out[j+2] = t2 & 0xff; t2 >>= 8;
+		out[j+1] = t2 & 0xff; t2 >>= 8;
+		out[j] = t2 & 0xff;
+	}
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return;
+}
+
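+/*
+ * AVX needs OS support as well as CPU support: XGETBV reads XCR0, and
+ * both the SSE and YMM state bits must be set there, i.e. the kernel
+ * saves/restores the full ymm register state on context switch.
+ */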
+static bool avx_usable(void)
+{
+	uint64_t xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave)
+		return false;
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM))
+		return false;
+
+	return true;
+}
+
+#endif
+
 const char *sha1_to_hex(const unsigned char *sha1)
 {
 	static __thread char buffer[50];
@@ -239,3 +362,23 @@ void sha1_from_buffer(const void *buf, size_t size, unsigned char *sha1)
 	sha1_update(&c, buf, length);
 	sha1_final(&c, sha1);
 }
+
+static void __attribute__((constructor)) __sha1_init(void)
+{
+	sha1_init = generic_sha1_init;
+	sha1_update = generic_sha1_update;
+	sha1_final = generic_sha1_final;
+
+#ifdef __x86_64__
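+	/*
+	 * State setup needs no SIMD, so sha1_init stays generic; only
+	 * update/final are switched to the accelerated versions below.
+	 */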
+	if (cpu_has_ssse3)
+		sha1_transform_asm = sha1_transform_ssse3;
+	else
+		return;
+
+	if (avx_usable())
+		sha1_transform_asm = sha1_transform_avx;
+
+	sha1_update = ssse3_sha1_update;
+	sha1_final = ssse3_sha1_final;
+#endif
+}
diff --git a/lib/sha1_ssse3.S b/lib/sha1_ssse3.S
new file mode 100644
index 0000000..22ce597
--- /dev/null
+++ b/lib/sha1_ssse3.S
@@ -0,0 +1,564 @@
+/*
+ * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
+ * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
+ * processors. CPUs supporting Intel(R) AVX extensions will get an additional
+ * boost.
+ *
+ * This work was inspired by the vectorized implementation of Dean Gaudet.
+ * Additional information on it can be found at:
+ *    http://www.arctic.org/~dean/crypto/sha1.html
+ *
+ * It was improved upon with more efficient vectorization of the message
+ * scheduling. This implementation has also been optimized for all current and
+ * several future generations of Intel CPUs.
+ *
+ * See this article for more information about the implementation details:
+ *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Copyright (C) 2010, Intel Corp.
+ *   Authors: Maxim Locktyukhin <maxim.locktyukhin at intel.com>
+ *            Ronen Zohar <ronen.zohar at intel.com>
+ *
+ * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
+ *   Author: Mathias Krause <minipli at googlemail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#define _ALIGN_TEXT .align 16, 0x90
+#define ENTRY(x) \
+        .text; _ALIGN_TEXT; .globl x; .type x, @function; x:
+
+#define END(name) \
+  .size name, .-name
+
+#define ENDPROC(name) \
+  .type name, @function; \
+  END(name)
+
+#define CTX	%rdi	// arg1
+#define BUF	%rsi	// arg2
+#define CNT	%rdx	// arg3
+
+#define REG_A	%ecx
+#define REG_B	%esi
+#define REG_C	%edi
+#define REG_D	%ebp
+#define REG_E	%edx
+
+#define REG_T1	%eax
+#define REG_T2	%ebx
+
+#define K_BASE		%r8
+#define HASH_PTR	%r9
+#define BUFFER_PTR	%r10
+#define BUFFER_END	%r11
+
+#define W_TMP1	%xmm0
+#define W_TMP2	%xmm9
+
+#define W0	%xmm1
+#define W4	%xmm2
+#define W8	%xmm3
+#define W12	%xmm4
+#define W16	%xmm5
+#define W20	%xmm6
+#define W24	%xmm7
+#define W28	%xmm8
+
+#define XMM_SHUFB_BSWAP	%xmm10
+
+/* we keep a circular window of 16 pre-calculated w[i]+K values (64 bytes) */
+#define WK(t)	(((t) & 15) * 4)(%rsp)
+#define W_PRECALC_AHEAD	16
+
+/*
+ * This macro implements the SHA-1 function's body for single 64-byte block
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM  name
+	ENTRY(\name)
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+
+	mov	%rsp, %r12
+	sub	$64, %rsp		# allocate workspace
+	and	$~15, %rsp		# align stack
+
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+
+	shl	$6, CNT			# multiply by 64
+	add	BUF, CNT
+	mov	CNT, BUFFER_END
+
+	lea	K_XMM_AR(%rip), K_BASE
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	# cleanup workspace
+	mov	$8, %ecx
+	mov	%rsp, %rdi
+	xor	%rax, %rax
+	rep stosq
+
+	mov	%r12, %rsp		# deallocate workspace
+
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+
+	ENDPROC(\name)
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for one 64-byte block
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+	INIT_REGALLOC
+
+	mov	  (HASH_PTR), A
+	mov	 4(HASH_PTR), B
+	mov	 8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+  .set i, 0
+  .rept W_PRECALC_AHEAD
+	W_PRECALC i
+    .set i, (i+1)
+  .endr
+
+.align 4
+1:
+	RR F1,A,B,C,D,E,0
+	RR F1,D,E,A,B,C,2
+	RR F1,B,C,D,E,A,4
+	RR F1,E,A,B,C,D,6
+	RR F1,C,D,E,A,B,8
+
+	RR F1,A,B,C,D,E,10
+	RR F1,D,E,A,B,C,12
+	RR F1,B,C,D,E,A,14
+	RR F1,E,A,B,C,D,16
+	RR F1,C,D,E,A,B,18
+
+	RR F2,A,B,C,D,E,20
+	RR F2,D,E,A,B,C,22
+	RR F2,B,C,D,E,A,24
+	RR F2,E,A,B,C,D,26
+	RR F2,C,D,E,A,B,28
+
+	RR F2,A,B,C,D,E,30
+	RR F2,D,E,A,B,C,32
+	RR F2,B,C,D,E,A,34
+	RR F2,E,A,B,C,D,36
+	RR F2,C,D,E,A,B,38
+
+	RR F3,A,B,C,D,E,40
+	RR F3,D,E,A,B,C,42
+	RR F3,B,C,D,E,A,44
+	RR F3,E,A,B,C,D,46
+	RR F3,C,D,E,A,B,48
+
+	RR F3,A,B,C,D,E,50
+	RR F3,D,E,A,B,C,52
+	RR F3,B,C,D,E,A,54
+	RR F3,E,A,B,C,D,56
+	RR F3,C,D,E,A,B,58
+
+	add	$64, BUFFER_PTR		# move to the next 64-byte block
+	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
+	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
+
+	RR F4,A,B,C,D,E,60
+	RR F4,D,E,A,B,C,62
+	RR F4,B,C,D,E,A,64
+	RR F4,E,A,B,C,D,66
+	RR F4,C,D,E,A,B,68
+
+	RR F4,A,B,C,D,E,70
+	RR F4,D,E,A,B,C,72
+	RR F4,B,C,D,E,A,74
+	RR F4,E,A,B,C,D,76
+	RR F4,C,D,E,A,B,78
+
+	UPDATE_HASH   (HASH_PTR), A
+	UPDATE_HASH  4(HASH_PTR), B
+	UPDATE_HASH  8(HASH_PTR), C
+	UPDATE_HASH 12(HASH_PTR), D
+	UPDATE_HASH 16(HASH_PTR), E
+
+	RESTORE_RENAMED_REGS
+	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
+	jne	1b
+.endm
+
+.macro INIT_REGALLOC
+  .set A, REG_A
+  .set B, REG_B
+  .set C, REG_C
+  .set D, REG_D
+  .set E, REG_E
+  .set T1, REG_T1
+  .set T2, REG_T2
+.endm
+
+.macro RESTORE_RENAMED_REGS
+	# order is important (REG_C is where it should be)
+	mov	B, REG_B
+	mov	D, REG_D
+	mov	A, REG_A
+	mov	E, REG_E
+.endm
+
+.macro SWAP_REG_NAMES  a, b
+  .set _T, \a
+  .set \a, \b
+  .set \b, _T
+.endm
+
+.macro F1  b, c, d
+	mov	\c, T1
+	SWAP_REG_NAMES \c, T1
+	xor	\d, T1
+	and	\b, T1
+	xor	\d, T1
+.endm
+
+.macro F2  b, c, d
+	mov	\d, T1
+	SWAP_REG_NAMES \d, T1
+	xor	\c, T1
+	xor	\b, T1
+.endm
+
+.macro F3  b, c, d
+	mov	\c, T1
+	SWAP_REG_NAMES \c, T1
+	mov	\b, T2
+	or	\b, T1
+	and	\c, T2
+	and	\d, T1
+	or	T2, T1
+.endm
+
+.macro F4  b, c, d
+	F2 \b, \c, \d
+.endm
+
+.macro UPDATE_HASH  hash, val
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+/*
+ * RR does two rounds of SHA-1 back to back with W[] pre-calc
+ *   t1 = F(b, c, d);   e += w(i)
+ *   e += t1;           b <<= 30;   d  += w(i+1);
+ *   t1 = F(a, b, c);
+ *   d += t1;           a <<= 5;
+ *   e += a;
+ *   t1 = e;            a >>= 7;
+ *   t1 <<= 5;
+ *   d += t1;
+ */
+.macro RR  F, a, b, c, d, e, round
+	add	WK(\round), \e
+	\F   \b, \c, \d		# t1 = F(b, c, d);
+	W_PRECALC (\round + W_PRECALC_AHEAD)
+	rol	$30, \b
+	add	T1, \e
+	add	WK(\round + 1), \d
+
+	\F   \a, \b, \c
+	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
+	rol	$5, \a
+	add	\a, \e
+	add	T1, \d
+	ror	$7, \a		# ((a <<r 5) >>r 7) => (a <<r 30)
+
+	mov	\e, T1
+	SWAP_REG_NAMES \e, T1
+
+	rol	$5, T1
+	add	T1, \d
+
+	# write:  \a, \b
+	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
+.endm
+
+.macro W_PRECALC  r
+  .set i, \r
+
+  .if (i < 20)
+    .set K_XMM, 0
+  .elseif (i < 40)
+    .set K_XMM, 16
+  .elseif (i < 60)
+    .set K_XMM, 32
+  .elseif (i < 80)
+    .set K_XMM, 48
+  .endif
+
+  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
+    .set i, ((\r) % 80)	    # pre-compute for the next iteration
+    .if (i == 0)
+	W_PRECALC_RESET
+    .endif
+	W_PRECALC_00_15
+  .elseif (i<32)
+	W_PRECALC_16_31
+  .elseif (i < 80)   // rounds 32-79
+	W_PRECALC_32_79
+  .endif
+.endm
+
+.macro W_PRECALC_RESET
+  .set W,          W0
+  .set W_minus_04, W4
+  .set W_minus_08, W8
+  .set W_minus_12, W12
+  .set W_minus_16, W16
+  .set W_minus_20, W20
+  .set W_minus_24, W24
+  .set W_minus_28, W28
+  .set W_minus_32, W
+.endm
+
+.macro W_PRECALC_ROTATE
+  .set W_minus_32, W_minus_28
+  .set W_minus_28, W_minus_24
+  .set W_minus_24, W_minus_20
+  .set W_minus_20, W_minus_16
+  .set W_minus_16, W_minus_12
+  .set W_minus_12, W_minus_08
+  .set W_minus_08, W_minus_04
+  .set W_minus_04, W
+  .set W,          W_minus_32
+.endm
+
+.macro W_PRECALC_SSSE3
+
+.macro W_PRECALC_00_15
+	W_PRECALC_00_15_SSSE3
+.endm
+.macro W_PRECALC_16_31
+	W_PRECALC_16_31_SSSE3
+.endm
+.macro W_PRECALC_32_79
+	W_PRECALC_32_79_SSSE3
+.endm
+
+/* message scheduling pre-compute for rounds 0-15 */
+.macro W_PRECALC_00_15_SSSE3
+  .if ((i & 3) == 0)
+	movdqu	(i*4)(BUFFER_PTR), W_TMP1
+  .elseif ((i & 3) == 1)
+	pshufb	XMM_SHUFB_BSWAP, W_TMP1
+	movdqa	W_TMP1, W
+  .elseif ((i & 3) == 2)
+	paddd	(K_BASE), W_TMP1
+  .elseif ((i & 3) == 3)
+	movdqa  W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+/* message scheduling pre-compute for rounds 16-31
+ *
+ * - calculating last 32 w[i] values in 8 XMM registers
+ * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
+ *   instruction
+ *
+ * vectorization for rounds 16-31 needs some "heavy lifting" due to the
+ * w[i]->w[i-3] dependency, but the same scheme pays off for rounds 32-79
+ */
+.macro W_PRECALC_16_31_SSSE3
+  # blended scheduling of vector and scalar instruction streams, one 4-wide
+  # vector iteration / 4 scalar rounds
+  .if ((i & 3) == 0)
+	movdqa	W_minus_12, W
+	palignr	$8, W_minus_16, W	# w[i-14]
+	movdqa	W_minus_04, W_TMP1
+	psrldq	$4, W_TMP1		# w[i-3]
+	pxor	W_minus_08, W
+  .elseif ((i & 3) == 1)
+	pxor	W_minus_16, W_TMP1
+	pxor	W_TMP1, W
+	movdqa	W, W_TMP2
+	movdqa	W, W_TMP1
+	pslldq	$12, W_TMP2
+  .elseif ((i & 3) == 2)
+	psrld	$31, W
+	pslld	$1, W_TMP1
+	por	W, W_TMP1
+	movdqa	W_TMP2, W
+	psrld	$30, W_TMP2
+	pslld	$2, W
+  .elseif ((i & 3) == 3)
+	pxor	W, W_TMP1
+	pxor	W_TMP2, W_TMP1
+	movdqa	W_TMP1, W
+	paddd	K_XMM(K_BASE), W_TMP1
+	movdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+/* message scheduling pre-compute for rounds 32-79
+ *
+ * the SHA-1 specification has: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ * here we use the equivalent:  w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2,
+ * which allows more efficient vectorization since the w[i]=>w[i-3] dependency is broken
+ */
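+/*
+ * Derivation sketch: rol distributes over xor, so expanding each of
+ * w[i-3], w[i-8], w[i-14], w[i-16] once more with the original recurrence
+ * makes every cross term (w[i-11], w[i-17], w[i-19], w[i-22], w[i-24],
+ * w[i-30]) appear exactly twice; the pairs cancel under xor, leaving
+ * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 for i >= 32.
+ */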
+.macro W_PRECALC_32_79_SSSE3
+  .if ((i & 3) == 0)
+	movdqa	W_minus_04, W_TMP1
+	pxor	W_minus_28, W		# W is W_minus_32 before xor
+	palignr	$8, W_minus_08, W_TMP1
+  .elseif ((i & 3) == 1)
+	pxor	W_minus_16, W
+	pxor	W_TMP1, W
+	movdqa	W, W_TMP1
+  .elseif ((i & 3) == 2)
+	psrld	$30, W
+	pslld	$2, W_TMP1
+	por	W, W_TMP1
+  .elseif ((i & 3) == 3)
+	movdqa	W_TMP1, W
+	paddd	K_XMM(K_BASE), W_TMP1
+	movdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.endm		// W_PRECALC_SSSE3
+
+
+#define K1	0x5a827999
+#define K2	0x6ed9eba1
+#define K3	0x8f1bbcdc
+#define K4	0xca62c1d6
+
+.section .rodata
+.align 16
+
+K_XMM_AR:
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+
+
+.section .text
+
+W_PRECALC_SSSE3
+.macro xmm_mov a, b
+	movdqu	\a,\b
+.endm
+
+/*
+ * SSSE3 optimized implementation:
+ *  extern "C" void sha1_transform_ssse3(uint32_t *digest, const uint8_t *data,
+ *                                       unsigned int rounds);
+ */
+SHA1_VECTOR_ASM     sha1_transform_ssse3
+
+.macro W_PRECALC_AVX
+
+.purgem W_PRECALC_00_15
+.macro  W_PRECALC_00_15
+    W_PRECALC_00_15_AVX
+.endm
+.purgem W_PRECALC_16_31
+.macro  W_PRECALC_16_31
+    W_PRECALC_16_31_AVX
+.endm
+.purgem W_PRECALC_32_79
+.macro  W_PRECALC_32_79
+    W_PRECALC_32_79_AVX
+.endm
+
+.macro W_PRECALC_00_15_AVX
+  .if ((i & 3) == 0)
+	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
+  .elseif ((i & 3) == 1)
+	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
+  .elseif ((i & 3) == 2)
+	vpaddd	(K_BASE), W, W_TMP1
+  .elseif ((i & 3) == 3)
+	vmovdqa	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.macro W_PRECALC_16_31_AVX
+  .if ((i & 3) == 0)
+	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
+	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
+	vpxor	W_minus_08, W, W
+	vpxor	W_minus_16, W_TMP1, W_TMP1
+  .elseif ((i & 3) == 1)
+	vpxor	W_TMP1, W, W
+	vpslldq	$12, W, W_TMP2
+	vpslld	$1, W, W_TMP1
+  .elseif ((i & 3) == 2)
+	vpsrld	$31, W, W
+	vpor	W, W_TMP1, W_TMP1
+	vpslld	$2, W_TMP2, W
+	vpsrld	$30, W_TMP2, W_TMP2
+  .elseif ((i & 3) == 3)
+	vpxor	W, W_TMP1, W_TMP1
+	vpxor	W_TMP2, W_TMP1, W
+	vpaddd	K_XMM(K_BASE), W, W_TMP1
+	vmovdqu	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.macro W_PRECALC_32_79_AVX
+  .if ((i & 3) == 0)
+	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
+	vpxor	W_minus_28, W, W		# W is W_minus_32 before xor
+  .elseif ((i & 3) == 1)
+	vpxor	W_minus_16, W_TMP1, W_TMP1
+	vpxor	W_TMP1, W, W
+  .elseif ((i & 3) == 2)
+	vpslld	$2, W, W_TMP1
+	vpsrld	$30, W, W
+	vpor	W, W_TMP1, W
+  .elseif ((i & 3) == 3)
+	vpaddd	K_XMM(K_BASE), W, W_TMP1
+	vmovdqu	W_TMP1, WK(i&~3)
+	W_PRECALC_ROTATE
+  .endif
+.endm
+
+.endm    // W_PRECALC_AVX
+
+W_PRECALC_AVX
+.purgem xmm_mov
+.macro xmm_mov a, b
+	vmovdqu	\a,\b
+.endm
+
+/*
+ * AVX optimized implementation:
+ *  extern "C" void sha1_transform_avx(uint32_t *digest, const uint8_t *data,
+ *                                     unsigned int rounds);
+ */
+SHA1_VECTOR_ASM     sha1_transform_avx
-- 
1.7.9.5
