/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	Sean Gulley <sean.m.gulley@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>

#define STATE_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

#define SHA256CONSTANTS	%rax

#define MSG		%xmm0	/* sha256rnds2 implicit operand */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define TMP		%xmm7

#define SHUF_MASK	%xmm8

#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10

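/*
 * Do 4 rounds of SHA-256 (rounds \i through \i+3).  \m0 holds the current
 * group of 4 message schedule words; for \i < 16 it is first loaded from
 * DATA_PTR and byte-swapped.  The sha256msg1, palignr/paddd, and sha256msg2
 * steps incrementally compute the upcoming message schedule groups in
 * \m1 and \m3, which is why the caller rotates the roles of \m0-\m3 from
 * one invocation to the next.
 */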
.macro do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	movdqu		\i*4(DATA_PTR), \m0
	pshufb		SHUF_MASK, \m0
.endif
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
	paddd		\m0, MSG
	sha256rnds2	STATE0, STATE1
.if \i >= 12 && \i < 60
	movdqa		\m0, TMP
	palignr		$4, \m3, TMP
	paddd		TMP, \m1
	sha256msg2	\m0, \m1
.endif
	punpckhqdq	MSG, MSG	/* move the upper 2 words down for rounds \i+2, \i+3 */
	sha256rnds2	STATE1, STATE0
.if \i >= 4 && \i < 52
	sha256msg1	\m0, \m3
.endif
.endm

/*
 * Intel SHA Extensions optimized implementation of a SHA-256 block function
 *
 * This function takes a pointer to the current SHA-256 state, a pointer to the
 * input data, and the number of 64-byte blocks to process.  Once all blocks
 * have been processed, the state is updated in place.  This function only
 * processes complete blocks.  State initialization, buffering of partial
 * blocks, and digest finalization are expected to be handled elsewhere.
 *
 * void sha256_ni_transform(struct sha256_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
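/*
 * A minimal sketch of the expected caller-side usage, assuming the usual
 * kernel FPU rules for SIMD code (the real glue code lives elsewhere):
 *
 *	kernel_fpu_begin();
 *	sha256_ni_transform(state, data, nblocks);
 *	kernel_fpu_end();
 */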
.text
SYM_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/* convert to bytes */
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * Load the initial state and reorder it from its in-memory layout
	 * into the register layout that sha256rnds2 expects:
	 * DCBA, HGFE -> ABEF, CDGH
	 */
	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */

	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* FEBA */
	punpckhqdq	TMP, STATE1			/* DCHG */
	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
	pshufd		$0xB1, STATE1, STATE1		/* CDGH */

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	/* Bias the constants pointer by 32*4 so that the (\i-32)*4
	 * displacements in do_4rounds fit in a signed byte. */
	lea		K256+32*4(%rip), SHA256CONSTANTS

.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

.irp i, 0, 16, 32, 48
	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order */
	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* GHEF */
	punpckhqdq	TMP, STATE1			/* ABCD */
	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
	pshufd		$0x1B, STATE1, STATE1		/* DCBA */

	movdqu		STATE1, 0*16(STATE_PTR)
	movdqu		STATE0, 1*16(STATE_PTR)

	RET
SYM_FUNC_END(sha256_ni_transform)

#undef STATE_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE

// parameters for sha256_ni_finup2x()
#define CTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

#define OFFSETOF_STATE		0	// offsetof(struct __sha256_ctx, state)
#define OFFSETOF_BYTECOUNT	32	// offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF		40	// offsetof(struct __sha256_ctx, buf)
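
// The offsets above assume a C-side layout along these lines (a sketch
// inferred from how this file uses the fields, not the authoritative
// definition, which lives in the C glue code):
//
//	struct __sha256_ctx {
//		u32 state[8];		// offset 0, 32 bytes
//		u64 bytecount;		// offset 32
//		u8 buf[64];		// offset 40, partial-block buffer
//	};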

// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A
	paddd		\m0_b, TMP_B
.if \i < 48
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	pshufd		$0x0E, TMP_A, MSG	// move the upper 2 words down for rounds \i+2, \i+3
	sha256rnds2	STATE1_A, STATE0_A
	pshufd		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm

//
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
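// Conceptually, this computes the same result as finishing two copies of the
// context independently (a sketch; the helper names here are illustrative,
// not APIs defined in this file):
//
//	struct __sha256_ctx ctx1 = *ctx, ctx2 = *ctx;
//	sha256_update(&ctx1, data1, len); sha256_final(&ctx1, out1);
//	sha256_update(&ctx2, data2, len); sha256_final(&ctx2, out2);
//
// except that the two hash computations are interleaved instruction by
// instruction, and the function must run between kernel_fpu_begin() and
// kernel_fpu_end().
//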
SYM_FUNC_START(sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from ctx->state.
	movdqu		OFFSETOF_STATE+0*16(CTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(CTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
	// bytes that are buffered in ctx->buf.  Also save the total message
	// length, bytecount + LEN, in COUNT for the padding later.
	mov		LEN, LEN	// zero-extend LEN (a 32-bit mov clears the upper half)
	mov		OFFSETOF_BYTECOUNT(CTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.

	movdqu		OFFSETOF_BUF+0*16(CTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(CTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(CTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(CTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
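	// For example, if LEN = 5 then %rbx below is 59: the last 64 data
	// bytes are copied to sp[0..63], which leaves the 5 unprocessed bytes
	// at sp[59..63], and the 64-byte load from &sp[59] picks up those 5
	// bytes followed by the 0x80-and-zeroes padding written at sp[64...].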
	mov		$64, %ebx
	sub		LEN, %ebx	// ebx = 64 - LEN
	sub		%rbx, DATA1	// DATA1 -= 64 - LEN
	sub		%rbx, DATA2	// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	shl		$3, COUNT	// convert the byte count to a bit count
	bswap		COUNT		// store it as __be64
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{ 0,   0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	mov		$0x80000000, %ebx	// the 0x80 byte, pre-swapped
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	// The bit count is COUNT << 3 with its two 32-bit halves swapped (to
	// match the post-pshufb word order); both steps combine into ror $29.
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A
	movdqa		MSG0_A, MSG0_B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(sha256_ni_finup2x)

.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
// pshufb mask that reverses the bytes within each 32-bit word
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203