########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#	James Guilford <james.guilford@intel.com>
#	Kirk Yap <kirk.s.yap@intel.com>
#	Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
#  - Redistributions of source code must retain the above
#    copyright notice, this list of conditions and the following
#    disclaimer.
#
#  - Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials
#    provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
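#
# For reference, the per-instruction tags below (S0, S1, CH, MAJ, s0,
# s1) follow the FIPS 180-4 SHA-256 functions:
#
#	S0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
#	S1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
#	CH(e,f,g)  = (e&f) ^ (~e&g)         = ((f^g)&e) ^ g
#	MAJ(a,b,c) = (a&b) ^ (a&c) ^ (b&c)  = ((a|c)&b) | (a&c)
#	s0(w) = (w ror 7)  ^ (w ror 18) ^ (w >> 3)
#	s1(w) = (w ror 17) ^ (w ror 19) ^ (w >> 10)
#
# and each scheduled dword is
#	W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16],  t = 16..63
########################################################################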

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
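# For example, "addm (4*0)(CTX), a" (as used below to fold the working
# variables back into the digest) leaves a + state[0] in both the
# register and the memory location.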

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END + _INP_END_SIZE
_CTX		= _INP + _INP_SIZE
STACK_SIZE	= _CTX + _CTX_SIZE
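
# Resulting frame layout, relative to the 32-byte-aligned %rsp:
#	[%rsp + _XFER   ]  512 bytes: W + K values, 2 blocks x 64 rounds
#	[%rsp + _INP_END]    8 bytes: pointer to the last input block
#	[%rsp + _INP    ]    8 bytes: saved input pointer
#	[%rsp + _CTX    ]    8 bytes: saved state pointer
# (_XMM_SAVE is zero-sized and unused in this version.)
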
127
128# rotate_Xs
129# Rotate values of symbols X0...X3
130.macro rotate_Xs
131 X_ = X0
132 X0 = X1
133 X1 = X2
134 X2 = X3
135 X3 = X_
136.endm
137
138# ROTATE_ARGS
139# Rotate values of symbols a...h
140.macro ROTATE_ARGS
141 old_h = h
142 TMP_ = h
143 h = g
144 g = f
145 f = e
146 e = d
147 d = c
148 c = b
149 b = a
150 a = TMP_
151.endm
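
# Note: rotate_Xs and ROTATE_ARGS are expanded at assembly time; they
# only rebind symbol names to registers, so rotating the state emits
# no instructions.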
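# FOUR_ROUNDS_AND_SCHED disp
# Do 4 rounds of the compression function, reading the W + K values
# previously stored at disp(%rsp, SRND), interleaved with the vector
# computation of the next 4 schedule dwords for each block:
#	X0 = s1(W[-2]) + W[-7] + s0(W[-15]) + W[-16]
# Interleaving the scalar rounds with the SIMD schedule keeps both the
# general-purpose and vector units busy.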
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a			# MAJA
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f			# CH
	rorx	$13, a, T1	# T1 = a >> 13			# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	xor	g, y2		# y2 = f^g			# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1

	and	e, y2		# y2 = (f^g)&e			# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	add	h, d		# d = k + w + h + d		# --

	and	b, y3		# y3 = (a|c)&b			# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	c, T1		# T1 = a&c			# MAJB

	add	y0, y2		# y2 = S1 + CH			# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	add	y3, h		# h = t1 + S0 + MAJ		# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a			# MAJA
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f			# CH
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	xor	g, y2		# y2 = f^g			# CH


	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	and	e, y2		# y2 = (f^g)&e			# CH
	add	h, d		# d = k + w + h + d		# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b			# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	add	y3, h		# h = t1 + S0 + MAJ		# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
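
	# Note: XTMP2 holds each W[-2] dword duplicated within a 64-bit
	# lane ({BBAA}), so the 64-bit shifts vpsrlq $17/$19 below leave
	# W[-2] ror 17 / ror 19 in the low dword of each lane.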


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a			# MAJA
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	or	c, y3		# y3 = a|c			# MAJA
	mov	f, y2		# y2 = f			# CH
	xor	g, y2		# y2 = f^g			# CH

	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e			# CH

	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d		# --
	and	b, y3		# y3 = (a|c)&b			# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --

	add	y3, h		# h = t1 + S0 + MAJ		# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a			# MAJA
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f			# CH
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	xor	g, y2		# y2 = f^g			# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH
	add	h, d		# d = k + w + h + d		# --
	and	b, y3		# y3 = (a|c)&b			# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	add	y0, y2		# y2 = S1 + CH			# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	c, T1		# T1 = a&c			# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ

	add	y1, h		# h = k + w + h + S0		# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	add	y3, h		# h = t1 + S0 + MAJ		# --

	ROTATE_ARGS
	rotate_Xs
.endm

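# DO_4ROUNDS disp
# Do 4 rounds of the compression function using the W + K values
# already stored at disp(%rsp, SRND); no message scheduling is done.
# The final two additions into h are deferred to the start of the next
# round (old_h still names the previous round's h after ROTATE_ARGS),
# which shortens the per-round dependency chain.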
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f			# CH
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	xor	g, y2		# y2 = f^g			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	mov	a, y3		# y3 = a			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	b, y3		# y3 = (a|c)&b			# MAJA
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --


	add	h, d		# d = k + w + h + d		# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	mov	f, y2		# y2 = f			# CH
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	xor	g, y2		# y2 = f^g			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH
	add	y3, old_h	# h = t1 + S0 + MAJ		# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	mov	a, y3		# y3 = a			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	b, y3		# y3 = (a|c)&b			# MAJA
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --


	add	h, d		# d = k + w + h + d		# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	mov	f, y2		# y2 = f			# CH
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	xor	g, y2		# y2 = f^g			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH
	add	y3, old_h	# h = t1 + S0 + MAJ		# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	mov	a, y3		# y3 = a			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	b, y3		# y3 = (a|c)&b			# MAJA
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --


	add	h, d		# d = k + w + h + d		# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0	# --
	mov	f, y2		# y2 = f			# CH
	rorx	$25, e, y0	# y0 = e >> 25			# S1A
	rorx	$11, e, y1	# y1 = e >> 11			# S1B
	xor	g, y2		# y2 = f^g			# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)	# S1
	rorx	$6, e, y1	# y1 = (e >> 6)			# S1
	and	e, y2		# y2 = (f^g)&e			# CH
	add	y3, old_h	# h = t1 + S0 + MAJ		# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13			# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g		# CH
	rorx	$22, a, y1	# y1 = a >> 22			# S0A
	mov	a, y3		# y3 = a			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)	# S0
	rorx	$2, a, T1	# T1 = (a >> 2)			# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c			# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a			# MAJB
	and	b, y3		# y3 = (a|c)&b			# MAJA
	and	c, T1		# T1 = a&c			# MAJB
	add	y0, y2		# y2 = S1 + CH			# --


	add	h, d		# d = k + w + h + d		# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)	# MAJ
	add	y1, h		# h = k + w + h + S0		# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0	# --

	add	y3, h		# h = t1 + S0 + MAJ		# --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_block_state *state,
##			      const u8 *data, size_t nblocks);
########################################################################
.text
SYM_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS	# convert to bytes
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP), XTMP0
	VMOVDQ	1*32(INP), XTMP1
	VMOVDQ	2*32(INP), XTMP2
	VMOVDQ	3*32(INP), XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
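
	## X0..X3 now hold dwords 0-3, 4-7, 8-11 and 12-15 of both
	## blocks: block 1 in the low 128-bit lanes, block 2 in the
	## high lanes.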

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
.Lloop1:
	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 1*32)

	leaq	K256+2*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 2*32)

	leaq	K256+3*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 3*32)

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	leaq	K256+0*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 1*32)
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash

	#### Do second block using previously scheduled results
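	## The second block's W + K values live in the upper 128-bit
	## lane of each 32-byte XFER slot, hence the +16 byte offset.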
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	(_XFER + 0*32 + 16)
	DO_4ROUNDS	(_XFER + 1*32 + 16)
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	jb	.Lloop0
	ja	.Ldone_hash

.Ldo_last_block:
	VMOVDQ	0*16(INP), XWORD0
	VMOVDQ	1*16(INP), XWORD1
	VMOVDQ	2*16(INP), XWORD2
	VMOVDQ	3*16(INP), XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX), a
	mov	(4*1)(CTX), b
	mov	(4*2)(CTX), c
	mov	(4*3)(CTX), d
	mov	(4*4)(CTX), e
	mov	(4*5)(CTX), f
	mov	(4*6)(CTX), g
	mov	(4*7)(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)

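# Each row of four K256 constants is stored twice, so the vpaddd in the
# main loop feeds the same round constants to both 128-bit lanes
# (block 1 and block 2).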
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

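# byte swap each 32-bit dword: little-endian -> big-endian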
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
