sha256-ni-asm.S source code [Linux/lib/crypto/x86/sha256-ni-asm.S]

1	/*
2	* Intel SHA Extensions optimized implementation of a SHA-256 update function
3	*
4	* This file is provided under a dual BSD/GPLv2 license. When using or
5	* redistributing this file, you may do so under either license.
6	*
7	* GPL LICENSE SUMMARY
8	*
9	* Copyright(c) 2015 Intel Corporation.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of version 2 of the GNU General Public License as
13	* published by the Free Software Foundation.
14	*
15	* This program is distributed in the hope that it will be useful, but
16	* WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18	* General Public License for more details.
19	*
20	* Contact Information:
21	* Sean Gulley <sean.m.gulley@intel.com>
22	* Tim Chen <tim.c.chen@linux.intel.com>
23	*
24	* BSD LICENSE
25	*
26	* Copyright(c) 2015 Intel Corporation.
27	*
28	* Redistribution and use in source and binary forms, with or without
29	* modification, are permitted provided that the following conditions
30	* are met:
31	*
32	* * Redistributions of source code must retain the above copyright
33	* notice, this list of conditions and the following disclaimer.
34	* * Redistributions in binary form must reproduce the above copyright
35	* notice, this list of conditions and the following disclaimer in
36	* the documentation and/or other materials provided with the
37	* distribution.
38	* * Neither the name of Intel Corporation nor the names of its
39	* contributors may be used to endorse or promote products derived
40	* from this software without specific prior written permission.
41	*
42	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53	*
54	*/
55
56	#include <linux/linkage.h>
57
/*
 * Register aliases used by sha256_ni_transform() and the do_4rounds macro.
 */

/* Function arguments */
#define STATE_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

/* Base register for the round-constant loads done by do_4rounds */
#define SHA256CONSTANTS	%rax

#define MSG		%xmm0	/* sha256rnds2 implicit operand */
#define STATE0		%xmm1	/* holds ABEF while hashing */
#define STATE1		%xmm2	/* holds CDGH while hashing */
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define TMP		%xmm7

/* Byte-shuffle mask loaded from PSHUFFLE_BYTE_FLIP_MASK */
#define SHUF_MASK	%xmm8

/* Copies of STATE0/STATE1 taken at the start of each block */
#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10
77
/*
 * Do four rounds of SHA-256, starting at round \i.  The callers in this file
 * pass \i = 0, 4, 8, ..., 60.
 *
 * \m0 supplies the four message schedule words consumed by these rounds; for
 * \i < 16 it is first loaded from the data block and byte-swapped.  The caller
 * rotates \m0-\m3 by one position on every invocation, so \m1 is the \m0 of
 * the next invocation and \m3 was the \m0 of the previous one.  \m2 is not
 * referenced by the macro body.
 *
 * SHA256CONSTANTS must point 32 words (128 bytes) past the start of K256,
 * which is why the constant load is displaced by (\i-32)*4.
 *
 * Clobbers MSG always, and TMP when 12 <= \i < 60.
 */
.macro	do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	/* Load 16 message bytes and byte-swap each 32-bit word. */
	movdqu		\i*4(DATA_PTR), \m0
	pshufb		SHUF_MASK, \m0
.endif
	/* MSG = round constants + message schedule words */
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
	paddd		\m0, MSG
	sha256rnds2	STATE0, STATE1
.if \i >= 12 && \i < 60
	/* Finish computing the schedule words that go into \m1. */
	movdqa		\m0, TMP
	palignr		$4, \m3, TMP
	paddd		TMP, \m1
	sha256msg2	\m0, \m1
.endif
	/* Move the upper two dwords of MSG down for the second round pair. */
	punpckhqdq	MSG, MSG
	sha256rnds2	STATE1, STATE0
.if \i >= 4 && \i < 52
	/* Start computing the schedule words that will replace \m3. */
	sha256msg1	\m0, \m3
.endif
.endm
98
/*
 * Intel SHA Extensions optimized implementation of a SHA-256 block function
 *
 * This function takes a pointer to the current SHA-256 state, a pointer to the
 * input data, and the number of 64-byte blocks to process.  Once all blocks
 * have been processed, the state is updated with the new state.  This function
 * only processes complete blocks.  State initialization, buffering of partial
 * blocks, and digest finalization is expected to be handled elsewhere.
 *
 * nblocks must be nonzero: the loop below processes one block before it
 * compares the data pointer against the end pointer.
 *
 * void sha256_ni_transform(struct sha256_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
.text
SYM_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/* convert to bytes */
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * load initial hash values
	 * Need to reorder these appropriately
	 * DCBA, HGFE -> ABEF, CDGH
	 */
	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */

	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* FEBA */
	punpckhqdq	TMP, STATE1			/* DCHG */
	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
	pshufd		$0xB1, STATE1, STATE1		/* CDGH */

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	/* Point 32 words into K256; do_4rounds indexes with (i-32)*4. */
	lea		K256+32*4(%rip), SHA256CONSTANTS

.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

	/* 64 rounds; the message registers rotate on each invocation. */
.irp i, 0, 16, 32, 48
	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order */
	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* GHEF */
	punpckhqdq	TMP, STATE1			/* ABCD */
	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
	pshufd		$0x1B, STATE1, STATE1		/* DCBA */

	movdqu		STATE1, 0*16(STATE_PTR)
	movdqu		STATE0, 1*16(STATE_PTR)

	RET
SYM_FUNC_END(sha256_ni_transform)
167
/*
 * Drop the register aliases of sha256_ni_transform().  Several of these names
 * (SHA256CONSTANTS, MSG, SHUF_MASK) are defined again below with the register
 * assignments used by sha256_ni_finup2x().
 */
#undef STATE_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE
183
// parameters for sha256_ni_finup2x()
#define CTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

// vector registers; the _A and _B suffixes select the first and second message
#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

#define OFFSETOF_STATE		0  // offsetof(struct __sha256_ctx, state)
#define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
222
// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for _b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for _b, so the caller must
// cycle through the registers accordingly.
//
// \i is the index of the first of the 4 rounds.  SHA256CONSTANTS must point 32
// words past the start of K256, hence the (\i-32)*4 displacement.  Clobbers
// MSG, TMP_A and TMP_B.
.macro	do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a,  m0_b, m1_b, m2_b, m3_b
	// TMP_A/TMP_B = round constants + current message schedule words
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A
	paddd		\m0_b, TMP_B
.if \i < 48
	// Begin computing the next group of schedule words in m0.
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	// First two rounds of each message.
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	// Last two rounds: 0x0E moves the upper two dwords into the low half.
	pshufd		$0x0E, TMP_A, MSG
	sha256rnds2	STATE1_A, STATE0_A
	pshufd		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	// Finish computing the next group of schedule words in m0.
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm
263
//
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from ctx->state.
	movdqu		OFFSETOF_STATE+0*16(CTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(CTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
	// bytes that are buffered in ctx->buf.  Also save it in a register with
	// LEN added to it.
	mov		LEN, LEN	// 32-bit write zero-extends into LEN64
	mov		OFFSETOF_BYTECOUNT(CTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.

	movdqu		OFFSETOF_BUF+0*16(CTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(CTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(CTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(CTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN
	// Both messages start from the same state.
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
	mov		$64, %ebx
	sub		LEN, %ebx		// ebx = 64 - LEN
	sub		%rbx, DATA1		// DATA1 -= 64 - LEN
	sub		%rbx, DATA2		// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	// Store the bit count (COUNT * 8) as a big endian 64-bit value.
	shl		$3, COUNT
	bswap		COUNT
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	mov		$0x80000000, %ebx
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	// Rotating right by 29 multiplies COUNT by 8 (rotate left by 3) and
	// swaps its two 32-bit halves (rotate by 32) in a single instruction.
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A
	movdqa		MSG0_A, MSG0_B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	// Release the stack space and restore the saved registers.
	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(sha256_ni_finup2x)
535
// The 64 SHA-256 round constants, stored as 32-bit words in a mergeable
// 256-byte constant section.  The code above addresses this table through a
// pointer to K256+32*4.
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
555
// pshufb control mask that reverses the byte order within each 32-bit word
// (byte indices 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12).  It is 16-byte
// aligned because it is loaded with movdqa.
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203
560

Browse the source code of Linux/lib/crypto/x86/sha256-ni-asm.S