/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/*
 * BLAKE2s initialization vector: the eight 32-bit IV words, stored
 * little-endian, packed as two 128-bit constants (IV[0..3], IV[4..7]).
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb mask rotating each 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
/* pshufb mask rotating each 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * Message-word schedule for the SSSE3 path: one 16-byte row per round,
 * giving the message index for each of the 16 G-function inputs.  Note
 * the rows are pre-swizzled for this implementation's column/diagonal
 * vector layout, so they are not in the reference BLAKE2s sigma order.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
/*
 * Schedule for the AVX-512 path, expressed as vpermi2d lane selectors
 * over the two ymm registers holding the 16 message words.  The row
 * ordering differs from SIGMA because that path permutes the message
 * registers in place each round rather than re-gathering from memory.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160
.align 64
SIGMA2:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.byte 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.byte 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.byte  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.byte  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.byte  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.byte  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.byte 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.byte  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9

.text
/*
 * void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *block,
 *				size_t nblocks, u32 inc)
 *
 * ABI: SysV AMD64.  rdi = state, rsi = message blocks, rdx = number of
 * 64-byte blocks, rcx = counter increment applied per block.
 *
 * The state is accessed unaligned: 32 bytes of h[] at (%rdi), and 16 bytes
 * at 0x20(%rdi) which are treated as the 64-bit t counter plus the f words
 * (NOTE(review): field layout assumed to match struct blake2s_state —
 * confirm against the C definition).
 *
 * Register roles inside the loop:
 *   xmm0/xmm1   rows 0-1 of the G-function matrix (h[0..3], h[4..7])
 *   xmm2/xmm3   rows 2-3 (IV[0..3], (t,f) ^ IV[4..7])
 *   xmm10/xmm11 copy of incoming h for the final feed-forward
 *   xmm12/xmm13 pshufb masks for the >>>16 and >>>8 rotates
 *   xmm14/xmm15 t/f words and the 64-bit counter increment
 *   rcx/r8      current row / one-past-end of the SIGMA table
 *   rax         scratch for message-word indices
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop		/* nothing to do for nblocks == 0 */
	movdqu		(%rdi),%xmm0		/* h[0..3] */
	movdqu		0x10(%rdi),%xmm1	/* h[4..7] */
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14	/* t and f words */
	movq		%rcx,%xmm15		/* counter increment */
	leaq		SIGMA+0xa0(%rip),%r8	/* end of the 10-round table */
	jmp		.Lbeginofloop
	.align 32
.Lbeginofloop:
	/* Save h for the feed-forward, bump the counter, rebuild rows 2-3. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14		/* t += inc (64-bit lane) */
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3	/* (t,f) ^ IV[4..7] */
	leaq		SIGMA(%rip),%rcx
.Lroundloop:
	/* Gather message words m[sigma[0..3]] into xmm4 (column step). */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* G first half: a += m + b; d = (d ^ a) >>> 16; c += d; b = (b ^ c) >>> 12 */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1		/* b >>> 12 via shift/shift/or */
	/* Gather m[sigma[4..7]] into xmm5. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* G second half: a += m + b; d = (d ^ a) >>> 8; c += d; b = (b ^ c) >>> 7 */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Diagonalize rows for the diagonal step. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Gather m[sigma[8..11]] into xmm6. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* Diagonal G, first half (rotates 16 then 12). */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Gather m[sigma[12..15]] into xmm7. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* Diagonal G, second half (rotates 8 then 7). */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undiagonalize and advance to the next SIGMA row. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: h ^= row0 ^ row2, row1 ^ row3. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi		/* next 64-byte block */
	decq		%rdx
	jnz		.Lbeginofloop
	/* Write back h and the updated t/f words. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

/*
 * void blake2s_compress_avx512(struct blake2s_state *state, const u8 *block,
 *				size_t nblocks, u32 inc)
 *
 * AVX-512VL variant: same contract and state layout as
 * blake2s_compress_ssse3 above, but rotates use vprord and the message
 * schedule is applied with vpermi2d instead of scalar gathers.
 *
 * NOTE(review): unlike the SSSE3 path there is no nblocks == 0 check —
 * the decq/jne loop shape means at least one block is always processed,
 * so callers must guarantee nblocks >= 1.
 *
 * Register roles: xmm0/xmm1 = h rows, xmm2/xmm3 = IV rows, xmm4 = t/f
 * words, xmm5 = counter increment, ymm6/ymm7 = the 16 message words,
 * ymm8/ymm9 = per-round permuted words, xmm10/xmm11 = saved h,
 * xmm14/xmm15 = cached IV halves, rax = SIGMA2 cursor, cl = round count.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0		/* h[0..3] */
	vmovdqu		0x10(%rdi),%xmm1	/* h[4..7] */
	vmovdqu		0x20(%rdi),%xmm4	/* t and f words */
	vmovq		%rcx,%xmm5		/* counter increment */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Save h, bump the 64-bit counter, rebuild rows 2-3, load message. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4	/* t += inc */
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3	/* (t,f) ^ IV[4..7] */
	vmovdqu		(%rsi),%ymm6		/* m[0..7] */
	vmovdqu		0x20(%rsi),%ymm7	/* m[8..15] */
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* 10 rounds */
.Lblake2s_compress_avx512_roundloop:
	/* Expand this round's 16 byte-indices and permute the message words. */
	vpmovzxbd	(%rax),%ymm8
	vpmovzxbd	0x8(%rax),%ymm9
	addq		$0x10,%rax
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6		/* permuted words become next */
	vmovdqa		%ymm9,%ymm7		/* round's source registers   */
	/* Column G, first half: rotates 16 then 12 via vprord. */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Column G, second half with the high lane of ymm8 (rot 8, 7). */
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Diagonalize. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* Diagonal G, first half. */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	/* Diagonal G, second half with the high lane of ymm9. */
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undiagonalize. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward into h. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	/* Write back h and the updated t/f words. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper				/* leave AVX state clean for SSE callers */
	RET
SYM_FUNC_END(blake2s_compress_avx512)
