/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release; see
 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 * for background on the approach.
 *
 * void sha1_transform_avx2(struct sha1_block_state *state,
 *                          const u8 *data, size_t nblocks);
 */

#include <linux/linkage.h>

#define CTX %rdi /* arg1 */
#define BUF %rsi /* arg2 */
#define CNT %rdx /* arg3 */

#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
#define REG_D %eax
#define REG_E %edx
#define REG_TB %ebx
#define REG_TA %r12d
#define REG_RA %rcx
#define REG_RB %rsi
#define REG_RC %rdi
#define REG_RD %rax
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
#define RND_F2 2
#define RND_F3 3

.macro REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set TB, REG_TB
        .set TA, REG_TA

        .set RA, REG_RA
        .set RB, REG_RB
        .set RC, REG_RC
        .set RD, REG_RD
        .set RE, REG_RE

        .set RTA, REG_RTA
        .set RTB, REG_RTB

        .set T1, REG_T1
.endm

#define HASH_PTR %r9
#define BLOCKS_CTR %r8
#define BUFFER_PTR %r10
#define BUFFER_PTR2 %r13

#define PRECALC_BUF %r14
#define WK_BUF %r15
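
/*
 * Roles of the registers above (as set up in SHA1_VECTOR_ASM below):
 * HASH_PTR    - pointer to the five SHA-1 state words
 * BLOCKS_CTR  - number of 64-byte blocks still to be processed
 * BUFFER_PTR  - input pointer for the first block of the current pair
 * BUFFER_PTR2 - input pointer for the second block of the current pair
 * PRECALC_BUF - scratch buffer being filled with W+K for the next pair
 * WK_BUF      - scratch buffer holding W+K for the pair being hashed
 * The two scratch buffers are swapped (xchg) after every pair of blocks.
 */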

#define W_TMP %xmm0
#define WY_TMP %ymm0
#define WY_TMP2 %ymm9

# AVX2 variables
#define WY0 %ymm3
#define WY4 %ymm5
#define WY08 %ymm7
#define WY12 %ymm8
#define WY16 %ymm12
#define WY20 %ymm13
#define WY24 %ymm14
#define WY28 %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep the W+K values for 2 iterations resident at a time (the pair of
 * blocks being hashed and the pair being precalculated):
 * 80 DWORDs per block * 2 blocks * 2 buffers, plus slack.
 */
#define W_SIZE (80*2*2 +16)

#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
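
/*
 * Layout of the precalculated W+K area, as implied by the address
 * arithmetic above: each group of 4 consecutive rounds owns a 32-byte
 * slot; the low 16 bytes hold the values for the first block of the
 * pair (t < 80) and the high 16 bytes those for the second (t >= 80).
 * PRECALC_WK(t) is the store address used by the vector code, WK(t)
 * the per-round load address used by the scalar round macros.
 */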


.macro UPDATE_HASH hash, val
        add \hash, \val    /* add the saved state word to the working value */
        mov \val, \hash    /* store it back; \val keeps the updated word and
                            * seeds the next block's state */
.endm

.macro PRECALC_RESET_WY
        .set WY_00, WY0
        .set WY_04, WY4
        .set WY_08, WY08
        .set WY_12, WY12
        .set WY_16, WY16
        .set WY_20, WY20
        .set WY_24, WY24
        .set WY_28, WY28
        .set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
        /* Rotate macros */
        .set WY_32, WY_28
        .set WY_28, WY_24
        .set WY_24, WY_20
        .set WY_20, WY_16
        .set WY_16, WY_12
        .set WY_12, WY_08
        .set WY_08, WY_04
        .set WY_04, WY_00
        .set WY_00, WY_32

        /* Define register aliases */
        .set WY, WY_00
        .set WY_minus_04, WY_04
        .set WY_minus_08, WY_08
        .set WY_minus_12, WY_12
        .set WY_minus_16, WY_16
        .set WY_minus_20, WY_20
        .set WY_minus_24, WY_24
        .set WY_minus_28, WY_28
        .set WY_minus_32, WY
.endm

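/*
 * The precalc macros below schedule the message for two blocks at once:
 * the low 128-bit lane of each YMM register carries the dwords of the
 * block at BUFFER_PTR, and the high lane (filled by vinsertf128) those
 * of the block at BUFFER_PTR2.
 */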
.macro PRECALC_00_15
        .if (i == 0) # Initialize and rotate registers
                PRECALC_RESET_WY
                PRECALC_ROTATE_WY
        .endif

        /* message scheduling pre-compute for rounds 0-15 */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
                vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
                         WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_16_31
        /*
         * message scheduling pre-compute for rounds 16-31
         * calculating last 32 w[i] values in 8 YMM registers
         * pre-calculate K+w[i] values and store to mem
         * for later load by ALU add instruction
         *
         * "brute force" vectorization for rounds 16-31 only
         * due to w[i]->w[i-3] dependency
         */
        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                /* w[i-14] */
                vpalignr $8, WY_minus_16, WY_minus_12, WY
                vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
        .elseif ((i & 7) == 1)
                vpxor WY_minus_08, WY, WY
                vpxor WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpxor WY_TMP, WY, WY
                vpslldq $12, WY, WY_TMP2
        .elseif ((i & 7) == 3)
                vpslld $1, WY, WY_TMP
                vpsrld $31, WY, WY
        .elseif ((i & 7) == 4)
                vpor WY, WY_TMP, WY_TMP
                vpslld $2, WY_TMP2, WY
        .elseif ((i & 7) == 5)
                vpsrld $30, WY_TMP2, WY_TMP2
                vpxor WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor WY_TMP2, WY_TMP, WY
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_32_79
        /*
         * in the SHA-1 specification:
         *      w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
         * instead we compute the equivalent:
         *      w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         * which allows more efficient vectorization,
         * since the w[i] -> w[i-3] dependency is broken
         */
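        /*
         * (The rol-2 form needs 32 earlier w values to exist, so it is
         * only used from round 32 onward; rounds 16-31 use the original
         * recurrence in PRECALC_16_31 above.)
         */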

        .if ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
        .elseif ((i & 7) == 1)
                /* W is W_minus_32 before xor */
                vpxor WY_minus_28, WY, WY
        .elseif ((i & 7) == 2)
                vpxor WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 3)
                vpxor WY_TMP, WY, WY
        .elseif ((i & 7) == 4)
                vpslld $2, WY, WY_TMP
        .elseif ((i & 7) == 5)
                vpsrld $30, WY, WY
                vpor WY, WY_TMP, WY
        .elseif ((i & 7) == 7)
                vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm


.macro PRECALC r, s
        .set i, \r

        .if (i < 40)
                .set K_XMM, 32*0
        .elseif (i < 80)
                .set K_XMM, 32*1
        .elseif (i < 120)
                .set K_XMM, 32*2
        .else
                .set K_XMM, 32*3
        .endif

        .if (i<32)
                PRECALC_00_15 \s
        .elseif (i<64)
                PRECALC_16_31 \s
        .elseif (i < 160)
                PRECALC_32_79 \s
        .endif
.endm

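/*
 * The working variables are "rotated" by renaming the assembler symbols
 * A..E (and their 64-bit aliases) rather than by moving data between
 * registers, so the rotation itself costs no instructions.
 */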
.macro ROTATE_STATE
        .set T_REG, E
        .set E, D
        .set D, C
        .set C, B
        .set B, TB
        .set TB, A
        .set A, T_REG

        .set T_REG, RE
        .set RE, RD
        .set RD, RC
        .set RC, RB
        .set RB, RTB
        .set RTB, RA
        .set RA, T_REG
.endm

/* RND_FUN dispatches on the ROUND_FUNC value saved by the RR macro */

.macro RND_FUN f, r
        .if (\f == RND_F1)
                ROUND_F1 \r
        .elseif (\f == RND_F2)
                ROUND_F2 \r
        .elseif (\f == RND_F3)
                ROUND_F3 \r
        .endif
.endm

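/*
 * RR expands two SHA-1 rounds, \r and \r+1. ROUND_FUNC is switched two
 * rounds before each 20-round boundary (at 18/38/58) because every
 * ROUND_Fx body also precomputes F for the round that follows it.
 */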
.macro RR r
        .set round_id, (\r % 80)

        .if (round_id == 0)        /* Precalculate F for first round */
                .set ROUND_FUNC, RND_F1
                mov B, TB

                rorx $(32-30), B, B    /* b>>>2 */
                andn D, TB, T1
                and C, TB
                xor T1, TB
        .endif

        RND_FUN ROUND_FUNC, \r
        ROTATE_STATE

        .if (round_id == 18)
                .set ROUND_FUNC, RND_F2
        .elseif (round_id == 38)
                .set ROUND_FUNC, RND_F3
        .elseif (round_id == 58)
                .set ROUND_FUNC, RND_F2
        .endif

        .set round_id, ( (\r+1) % 80)

        RND_FUN ROUND_FUNC, (\r+1)
        ROTATE_STATE
.endm

.macro ROUND_F1 r
        add WK(\r), E

        andn C, A, T1              /* ~b&d */
        lea (RE,RTB), E            /* Add F from the previous round */

        rorx $(32-5), A, TA        /* T2 = A >>> 5 */
        rorx $(32-30), A, TB       /* b>>>2 for next round */

        PRECALC (\r)               /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round:
         * F1 = (b & c) ^ (~b & d)
         */
        and B, A                   /* b&c */
        xor T1, A                  /* F1 = (b&c) ^ (~b&d) */

        lea (RE,RTA), E            /* E += A >>> 5 */
.endm

.macro ROUND_F2 r
        add WK(\r), E
        lea (RE,RTB), E            /* Add F from the previous round */

        /* Calculate F for the next round */
        rorx $(32-5), A, TA        /* T2 = A >>> 5 */
        .if ((round_id) < 79)
                rorx $(32-30), A, TB   /* b>>>2 for next round */
        .endif
        PRECALC (\r)               /* msg scheduling for next 2 blocks */

        .if ((round_id) < 79)
                xor B, A
        .endif

        add TA, E                  /* E += A >>> 5 */

        .if ((round_id) < 79)
                xor C, A
        .endif
.endm

.macro ROUND_F3 r
        add WK(\r), E
        PRECALC (\r)               /* msg scheduling for next 2 blocks */

        lea (RE,RTB), E            /* Add F from the previous round */

        mov B, T1
        or A, T1

        rorx $(32-5), A, TA        /* T2 = A >>> 5 */
        rorx $(32-30), A, TB       /* b>>>2 for next round */

        /*
         * Calculate F for the next round:
         * (b and c) or (d and (b or c))
         */
        and C, T1
        and B, A
        or T1, A

        add TA, E                  /* E += A >>> 5 */

.endm

/*
 * Add the constant \d to \a only when \b >= \c (uses RTA as a temp):
 *
 *      \a = (\b >= \c) ? \a + \d : \a
 */
.macro ADD_IF_GE a, b, c, d
        mov \a, RTA
        add $\d, RTA
        cmp $\c, \b
        cmovge RTA, \a
.endm
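
/*
 * ADD_IF_GE is what advances BUFFER_PTR/BUFFER_PTR2 between block
 * pairs: the pointers move forward (by 64 or 128 bytes) only while
 * enough blocks remain, so near the end of the data the precalc code
 * simply re-reads the last block and its results are never consumed.
 */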

/*
 * This macro implements the 80 SHA-1 rounds for multiple blocks with
 * software pipelining.
 */
.macro SHA1_PIPELINED_MAIN_BODY

        REGALLOC

        mov (HASH_PTR), A
        mov 4(HASH_PTR), B
        mov 8(HASH_PTR), C
        mov 12(HASH_PTR), D
        mov 16(HASH_PTR), E

        mov %rsp, PRECALC_BUF
        lea (2*4*80+32)(%rsp), WK_BUF

        # Precalc WK for first 2 blocks
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
        .set i, 0
        .rept 160
                PRECALC i
                .set i, i + 1
        .endr

        /* Go to next block if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg WK_BUF, PRECALC_BUF

        .align 32
.L_loop:
        /*
         * The loop processes two blocks per iteration. BLOCKS_CTR holds
         * the number of blocks still to be hashed and reaches zero once
         * the last block has been processed, which ends the loop here.
         */
        test BLOCKS_CTR, BLOCKS_CTR
        jnz .L_begin
        .align 32
        jmp .L_end
        .align 32
.L_begin:

        /*
         * Do first block
         * rounds: 0,2,4,6,8
         */
        .set j, 0
        .rept 5
                RR j
                .set j, j+2
        .endr

        /*
         * rounds:
         * 10,12,14,16,18
         * 20,22,24,26,28
         * 30,32,34,36,38
         * 40,42,44,46,48
         * 50,52,54,56,58
         */
        .rept 25
                RR j
                .set j, j+2
        .endr

        /* Update Counter */
        sub $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
        /*
         * rounds
         * 60,62,64,66,68
         * 70,72,74,76,78
         */
        .rept 10
                RR j
                .set j, j+2
        .endr

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        test BLOCKS_CTR, BLOCKS_CTR
        jz .L_loop

        mov TB, B

        /* Process second block */
        /*
         * rounds
         * 0+80, 2+80, 4+80, 6+80, 8+80
         * 10+80,12+80,14+80,16+80,18+80
         */

        .set j, 0
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /*
         * rounds
         * 20+80,22+80,24+80,26+80,28+80
         * 30+80,32+80,34+80,36+80,38+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /*
         * rounds
         * 40+80,42+80,44+80,46+80,48+80
         * 50+80,52+80,54+80,56+80,58+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        /* update counter */
        sub $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

        /*
         * rounds
         * 60+80,62+80,64+80,66+80,68+80
         * 70+80,72+80,74+80,76+80,78+80
         */
        .rept 10
                RR j+80
                .set j, j+2
        .endr

        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        /* Reset state for AVX2 reg permutation */
        mov A, TA
        mov TB, A
        mov C, TB
        mov E, C
        mov D, B
        mov TA, D

        REGALLOC

        xchg WK_BUF, PRECALC_BUF

        jmp .L_loop

        .align 32
.L_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
        SYM_FUNC_START(\name)

        push %rbx
        push %r12
        push %r13
        push %r14
        push %r15

        RESERVE_STACK = (W_SIZE*4 + 8+24)
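        /*
         * The reserved area holds the two W+K scratch buffers that
         * PRECALC_BUF and WK_BUF ping-pong between (2*4*80+32 bytes
         * each), padded so the 32-byte stack alignment established
         * below is preserved.
         */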

        /* Align stack */
        push %rbp
        mov %rsp, %rbp
        and $~(0x20-1), %rsp
        sub $RESERVE_STACK, %rsp

        avx2_zeroupper

        /* Setup initial values */
        mov CTX, HASH_PTR
        mov BUF, BUFFER_PTR

        mov BUF, BUFFER_PTR2
        mov CNT, BLOCKS_CTR

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        avx2_zeroupper

        mov %rbp, %rsp
        pop %rbp

        pop %r15
        pop %r14
        pop %r13
        pop %r12
        pop %rbx

        RET

        SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

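/*
 * Round constants: each K value is replicated across a full 32-byte
 * YMM word, so the K_XMM offset (0, 32, 64 or 96) selects the constant
 * for the current 20-round group in the vpaddd instructions above.
 */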
.align 128
K_XMM_AR:
        .long K1, K1, K1, K1
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4
        .long K4, K4, K4, K4

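/*
 * vpshufb control mask: reverses the byte order within each 32-bit
 * word, converting the big-endian message words to host order in both
 * 128-bit lanes.
 */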
BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f

.text

SHA1_VECTOR_ASM sha1_transform_avx2