/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release; see
 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 * for background on the approach.
 *
 * void sha1_transform_avx2(struct sha1_block_state *state,
 *                          const u8 *data, size_t nblocks);
 */

#include <linux/linkage.h>

#define CTX %rdi /* arg1 */
#define BUF %rsi /* arg2 */
#define CNT %rdx /* arg3 */

#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
#define REG_D %eax
#define REG_E %edx
#define REG_TB %ebx
#define REG_TA %r12d
#define REG_RA %rcx
#define REG_RB %rsi
#define REG_RC %rdi
#define REG_RD %rax
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3

.macro REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set TB, REG_TB
        .set TA, REG_TA

        .set RA, REG_RA
        .set RB, REG_RB
        .set RC, REG_RC
        .set RD, REG_RD
        .set RE, REG_RE

        .set RTA, REG_RTA
        .set RTB, REG_RTB

        .set T1, REG_T1
.endm

#define HASH_PTR %r9
#define BLOCKS_CTR %r8
#define BUFFER_PTR %r10
#define BUFFER_PTR2 %r13

#define PRECALC_BUF %r14
#define WK_BUF %r15
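
/*
 * Roles of the registers above (as set up in SHA1_VECTOR_ASM below):
 * HASH_PTR    - pointer to the five SHA-1 state words
 * BLOCKS_CTR  - number of 64-byte blocks still to be processed
 * BUFFER_PTR  - input pointer for the first block of the current pair
 * BUFFER_PTR2 - input pointer for the second block of the current pair
 * PRECALC_BUF - scratch buffer being filled with W+K for the next pair
 * WK_BUF      - scratch buffer holding W+K for the pair being hashed
 * The two scratch buffers are swapped (xchg) after every pair of blocks.
 */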

#define W_TMP %xmm0
#define WY_TMP %ymm0
#define WY_TMP2 %ymm9

# AVX2 variables
#define WY0 %ymm3
#define WY4 %ymm5
#define WY08 %ymm7
#define WY12 %ymm8
#define WY16 %ymm12
#define WY20 %ymm13
#define WY24 %ymm14
#define WY28 %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep the W+K values for 2 iterations resident at a time (the pair of
 * blocks being hashed and the pair being precalculated):
 * 80 DWORDs per block * 2 blocks * 2 buffers, plus slack.
 */
#define W_SIZE (80*2*2 +16)

#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
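
/*
 * Layout of the precalculated W+K area, as implied by the address
 * arithmetic above: each group of 4 consecutive rounds owns a 32-byte
 * slot; the low 16 bytes hold the values for the first block of the
 * pair (t < 80) and the high 16 bytes those for the second (t >= 80).
 * PRECALC_WK(t) is the store address used by the vector code, WK(t)
 * the per-round load address used by the scalar round macros.
 */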


.macro UPDATE_HASH hash, val
        add \hash, \val    /* add the saved state word to the working value */
        mov \val, \hash    /* store it back; \val keeps the updated word and
                            * seeds the next block's state */
.endm

.macro PRECALC_RESET_WY
        .set WY_00, WY0
        .set WY_04, WY4
        .set WY_08, WY08
        .set WY_12, WY12
        .set WY_16, WY16
        .set WY_20, WY20
        .set WY_24, WY24
        .set WY_28, WY28
        .set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
        /* Rotate macros */
        .set WY_32, WY_28
        .set WY_28, WY_24
        .set WY_24, WY_20
        .set WY_20, WY_16
        .set WY_16, WY_12
        .set WY_12, WY_08
        .set WY_08, WY_04
        .set WY_04, WY_00
        .set WY_00, WY_32

        /* Define register aliases */
        .set WY, WY_00
        .set WY_minus_04, WY_04
        .set WY_minus_08, WY_08
        .set WY_minus_12, WY_12
        .set WY_minus_16, WY_16
        .set WY_minus_20, WY_20
        .set WY_minus_24, WY_24
        .set WY_minus_28, WY_28
        .set WY_minus_32, WY
.endm

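/*
 * The precalc macros below schedule the message for two blocks at once:
 * the low 128-bit lane of each YMM register carries the dwords of the
 * block at BUFFER_PTR, and the high lane (filled by vinsertf128) those
 * of the block at BUFFER_PTR2.
 */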
.macro PRECALC_00_15
        .if (i == 0) # Initialize and rotate registers
                PRECALC_RESET_WY
                PRECALC_ROTATE_WY
        .endif

        /* message scheduling pre-compute for rounds 0-15 */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
                vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
                         WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_16_31
        /*
         * message scheduling pre-compute for rounds 16-31
         * calculating last 32 w[i] values in 8 YMM registers
         * pre-calculate K+w[i] values and store to mem
         * for later load by ALU add instruction
         *
         * "brute force" vectorization for rounds 16-31 only
         * due to w[i]->w[i-3] dependency
         */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                /* w[i-14] */
                vpalignr $8, WY_minus_16, WY_minus_12, WY
                vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
        .elseif ((i & 7) == 1)
                vpxor WY_minus_08, WY, WY
                vpxor WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpxor WY_TMP, WY, WY
                vpslldq $12, WY, WY_TMP2
        .elseif ((i & 7) == 3)
                vpslld $1, WY, WY_TMP
                vpsrld $31, WY, WY
        .elseif ((i & 7) == 4)
                vpor WY, WY_TMP, WY_TMP
                vpslld $2, WY_TMP2, WY
        .elseif ((i & 7) == 5)
                vpsrld $30, WY_TMP2, WY_TMP2
                vpxor WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor WY_TMP2, WY_TMP, WY
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_32_79
        /*
         * in the SHA-1 specification:
         *      w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
         * instead we compute the equivalent:
         *      w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         * which allows more efficient vectorization,
         * since the w[i] -> w[i-3] dependency is broken
         */
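        /*
         * (The rol-2 form needs 32 earlier w values to exist, so it is
         * only used from round 32 onward; rounds 16-31 use the original
         * recurrence in PRECALC_16_31 above.)
         */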

        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
        .elseif ((i & 7) == 1)
                /* W is W_minus_32 before xor */
                vpxor WY_minus_28, WY, WY
        .elseif ((i & 7) == 2)
                vpxor WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 3)
                vpxor WY_TMP, WY, WY
        .elseif ((i & 7) == 4)
                vpslld $2, WY, WY_TMP
        .elseif ((i & 7) == 5)
                vpsrld $30, WY, WY
                vpor WY, WY_TMP, WY
        .elseif ((i & 7) == 7)
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm


.macro PRECALC r, s
        .set i, \r

        .if (i < 40)
                .set K_XMM, 32*0
        .elseif (i < 80)
                .set K_XMM, 32*1
        .elseif (i < 120)
                .set K_XMM, 32*2
        .else
                .set K_XMM, 32*3
        .endif

        .if (i<32)
                PRECALC_00_15 \s
        .elseif (i<64)
                PRECALC_16_31 \s
        .elseif (i < 160)
                PRECALC_32_79 \s
        .endif
.endm

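/*
 * The working variables are "rotated" by renaming the assembler symbols
 * A..E (and their 64-bit aliases) rather than by moving data between
 * registers, so the rotation itself costs no instructions.
 */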
.macro ROTATE_STATE
        .set T_REG, E
        .set E, D
        .set D, C
        .set C, B
        .set B, TB
        .set TB, A
        .set A, T_REG

        .set T_REG, RE
        .set RE, RD
        .set RD, RC
        .set RC, RB
        .set RB, RTB
        .set RTB, RA
        .set RA, T_REG
.endm

/* RND_FUN dispatches on the ROUND_FUNC value saved by the RR macro */

.macro RND_FUN f, r
        .if (\f == RND_F1)
                ROUND_F1 \r
        .elseif (\f == RND_F2)
                ROUND_F2 \r
        .elseif (\f == RND_F3)
                ROUND_F3 \r
        .endif
.endm

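/*
 * RR expands two SHA-1 rounds, \r and \r+1. ROUND_FUNC is switched two
 * rounds before each 20-round boundary (at 18/38/58) because every
 * ROUND_Fx body also precomputes F for the round that follows it.
 */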
.macro RR r
        .set round_id, (\r % 80)

        .if (round_id == 0)        /* Precalculate F for first round */
                .set ROUND_FUNC, RND_F1
                mov B, TB

                rorx $(32-30), B, B    /* b>>>2 */
                andn D, TB, T1
                and C, TB
                xor T1, TB
        .endif

        RND_FUN ROUND_FUNC, \r
        ROTATE_STATE

        .if (round_id == 18)
                .set ROUND_FUNC, RND_F2
        .elseif (round_id == 38)
                .set ROUND_FUNC, RND_F3
        .elseif (round_id == 58)
                .set ROUND_FUNC, RND_F2
        .endif

        .set round_id, ( (\r+1) % 80)

        RND_FUN ROUND_FUNC, (\r+1)
        ROTATE_STATE
.endm

.macro ROUND_F1 r
        add WK(\r), E

        andn C, A, T1              /* ~b&d */
        lea (RE,RTB), E            /* Add F from the previous round */

        rorx $(32-5), A, TA        /* T2 = A >>> 5 */
        rorx $(32-30), A, TB       /* b>>>2 for next round */

        PRECALC (\r)               /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round:
         * F1 = (b & c) ^ (~b & d)
         */
        and B, A                   /* b&c */
        xor T1, A                  /* F1 = (b&c) ^ (~b&d) */

        lea (RE,RTA), E            /* E += A >>> 5 */
.endm

.macro ROUND_F2 r
        add WK(\r), E
        lea (RE,RTB), E            /* Add F from the previous round */

        /* Calculate F for the next round */
        rorx $(32-5), A, TA        /* T2 = A >>> 5 */
        .if ((round_id) < 79)
                rorx $(32-30), A, TB   /* b>>>2 for next round */
        .endif
        PRECALC (\r)               /* msg scheduling for next 2 blocks */

        .if ((round_id) < 79)
                xor B, A
        .endif

        add TA, E                  /* E += A >>> 5 */

        .if ((round_id) < 79)
                xor C, A
        .endif
.endm

.macro ROUND_F3 r
        add WK(\r), E
        PRECALC (\r)               /* msg scheduling for next 2 blocks */

        lea (RE,RTB), E            /* Add F from the previous round */

        mov B, T1
        or A, T1

        rorx $(32-5), A, TA        /* T2 = A >>> 5 */
        rorx $(32-30), A, TB       /* b>>>2 for next round */

        /*
         * Calculate F for the next round:
         * (b and c) or (d and (b or c))
         */
        and C, T1
        and B, A
        or T1, A

        add TA, E                  /* E += A >>> 5 */

.endm

/*
 * Add the constant \d to \a only when \b >= \c (uses RTA as a temp):
 *
 *      \a = (\b >= \c) ? \a + \d : \a
 */
.macro ADD_IF_GE a, b, c, d
        mov \a, RTA
        add $\d, RTA
        cmp $\c, \b
        cmovge RTA, \a
.endm
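
/*
 * ADD_IF_GE is what advances BUFFER_PTR/BUFFER_PTR2 between block
 * pairs: the pointers move forward (by 64 or 128 bytes) only while
 * enough blocks remain, so near the end of the data the precalc code
 * simply re-reads the last block and its results are never consumed.
 */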

/*
 * This macro implements the 80 SHA-1 rounds for multiple blocks with
 * software pipelining.
 */
.macro SHA1_PIPELINED_MAIN_BODY

        REGALLOC

        mov (HASH_PTR), A
        mov 4(HASH_PTR), B
        mov 8(HASH_PTR), C
        mov 12(HASH_PTR), D
        mov 16(HASH_PTR), E

        mov %rsp, PRECALC_BUF
        lea (2*4*80+32)(%rsp), WK_BUF

        # Precalc WK for first 2 blocks
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
        .set i, 0
        .rept 160
                PRECALC i
                .set i, i + 1
        .endr

        /* Go to next block if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg WK_BUF, PRECALC_BUF

        .align 32
.L_loop:
        /*
         * The loop processes two blocks per iteration. BLOCKS_CTR holds
         * the number of blocks still to be hashed and reaches zero once
         * the last block has been processed, which ends the loop here.
         */
        test BLOCKS_CTR, BLOCKS_CTR
        jnz .L_begin
        .align 32
        jmp .L_end
        .align 32
.L_begin:

        /*
         * Do first block
         * rounds: 0,2,4,6,8
         */
        .set j, 0
        .rept 5
                RR j
                .set j, j+2
        .endr

        /*
         * rounds:
         * 10,12,14,16,18
         * 20,22,24,26,28
         * 30,32,34,36,38
         * 40,42,44,46,48
         * 50,52,54,56,58
         */
        .rept 25
                RR j
                .set j, j+2
        .endr

        /* Update Counter */
        sub $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
        /*
         * rounds
         * 60,62,64,66,68
         * 70,72,74,76,78
         */
        .rept 10
                RR j
                .set j, j+2
        .endr

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        test BLOCKS_CTR, BLOCKS_CTR
        jz .L_loop

        mov TB, B

        /* Process second block */
        /*
         * rounds
         * 0+80, 2+80, 4+80, 6+80, 8+80
         * 10+80,12+80,14+80,16+80,18+80
         */

        .set j, 0
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /*
         * rounds
         * 20+80,22+80,24+80,26+80,28+80
         * 30+80,32+80,34+80,36+80,38+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /*
         * rounds
         * 40+80,42+80,44+80,46+80,48+80
         * 50+80,52+80,54+80,56+80,58+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /* update counter */
        sub $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

        /*
         * rounds
         * 60+80,62+80,64+80,66+80,68+80
         * 70+80,72+80,74+80,76+80,78+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        /* Reset state for AVX2 reg permutation */
        mov A, TA
        mov TB, A
        mov C, TB
        mov E, C
        mov D, B
        mov TA, D

        REGALLOC

        xchg WK_BUF, PRECALC_BUF

        jmp .L_loop

        .align 32
.L_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
        SYM_FUNC_START(\name)

        push %rbx
        push %r12
        push %r13
        push %r14
        push %r15

        RESERVE_STACK = (W_SIZE*4 + 8+24)
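        /*
         * The reserved area holds the two W+K scratch buffers that
         * PRECALC_BUF and WK_BUF ping-pong between (2*4*80+32 bytes
         * each), padded so the 32-byte stack alignment established
         * below is preserved.
         */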

        /* Align stack */
        push %rbp
        mov %rsp, %rbp
        and $~(0x20-1), %rsp
        sub $RESERVE_STACK, %rsp

        avx2_zeroupper

        /* Setup initial values */
        mov CTX, HASH_PTR
        mov BUF, BUFFER_PTR

        mov BUF, BUFFER_PTR2
        mov CNT, BLOCKS_CTR

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        avx2_zeroupper

        mov %rbp, %rsp
        pop %rbp

        pop %r15
        pop %r14
        pop %r13
        pop %r12
        pop %rbx

        RET

        SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

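/*
 * Round constants: each K value is replicated across a full 32-byte
 * YMM word, so the K_XMM offset (0, 32, 64 or 96) selects the constant
 * for the current 20-round group in the vpaddd instructions above.
 */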
.align 128
K_XMM_AR:
        .long K1, K1, K1, K1
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4
        .long K4, K4, K4, K4

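/*
 * vpshufb control mask: reverses the byte order within each 32-bit
 * word, converting the big-endian message words to host order in both
 * 128-bit lanes.
 */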
BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f

.text

SHA1_VECTOR_ASM sha1_transform_avx2