/*
 *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * void sha1_transform_avx2(struct sha1_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
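/*
 * For reference, each of SHA-1's 80 rounds computes (illustrative C
 * sketch of the standard FIPS 180 round, not code from this file):
 *
 *	tmp = rol32(a, 5) + f(b, c, d) + e + K + w[t];
 *	e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
 *
 * where f() is Ch (F1) for rounds 0-19, Parity (F2) for rounds 20-39
 * and 60-79, and Maj (F3) for rounds 40-59. The code below precomputes
 * K + w[t] with AVX2 and evaluates the rest with scalar ALU
 * instructions.
 */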

#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%r11d
#define	xmm_mov	vmovups
#define	avx2_zeroupper	vzeroupper
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

.macro REGALLOC
.set A, REG_A
.set B, REG_B
.set C, REG_C
.set D, REG_D
.set E, REG_E
.set TB, REG_TB
.set TA, REG_TA

.set RA, REG_RA
.set RB, REG_RB
.set RC, REG_RC
.set RD, REG_RD
.set RE, REG_RE

.set RTA, REG_RTA
.set RTB, REG_RTB

.set T1, REG_T1
.endm

#define HASH_PTR	%r9
#define BLOCKS_CTR	%r8
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
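/*
 * Layout note (as implied by the macros above): each group of four
 * rounds occupies 32 bytes, with the low 16 bytes holding K+w[] for
 * the first block of a pair and the high 16 bytes for the second, so
 * ((t) % 80) / 4 selects the group, (t) % 4 the dword within it, and
 * (t) / 80 the 16-byte lane for rounds 80..159 (the second block).
 */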
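/* UPDATE_HASH: \val += \hash, then \hash = \val (AT&T operand order) */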
.macro UPDATE_HASH  hash, val
add	\hash, \val
mov	\val, \hash
.endm

.macro PRECALC_RESET_WY
.set WY_00, WY0
.set WY_04, WY4
.set WY_08, WY08
.set WY_12, WY12
.set WY_16, WY16
.set WY_20, WY20
.set WY_24, WY24
.set WY_28, WY28
.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
/* Rotate macros */
.set WY_32, WY_28
.set WY_28, WY_24
.set WY_24, WY_20
.set WY_20, WY_16
.set WY_16, WY_12
.set WY_12, WY_08
.set WY_08, WY_04
.set WY_04, WY_00
.set WY_00, WY_32

/* Define register aliases */
.set WY, WY_00
.set WY_minus_04, WY_04
.set WY_minus_08, WY_08
.set WY_minus_12, WY_12
.set WY_minus_16, WY_16
.set WY_minus_20, WY_20
.set WY_minus_24, WY_24
.set WY_minus_28, WY_28
.set WY_minus_32, WY
.endm

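/*
 * Two message schedules are kept in flight per ymm register: the low
 * 128-bit lane is loaded from BUFFER_PTR (first block of the pair) and
 * the high lane from BUFFER_PTR2 (second block) via vinsertf128.
 */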
.macro PRECALC_00_15
.if (i == 0) # Initialize and rotate registers
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif

/* message scheduling pre-compute for rounds 0-15 */
.if   ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
vmovdqu (i * 2)(BUFFER_PTR), W_TMP
.elseif ((i & 7) == 1)
vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
	 WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
.elseif ((i & 7) == 4)
vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
vmovdqu  WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC_16_31
/*
 * message scheduling pre-compute for rounds 16-31
 * calculating last 32 w[i] values in 8 XMM registers
 * pre-calculate K+w[i] values and store to mem
 * for later load by ALU add instruction
 *
 * "brute force" vectorization for rounds 16-31 only
 * due to w[i]->w[i-3] dependency
 */
.if   ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
/* w[i-14] */
vpalignr	$8, WY_minus_16, WY_minus_12, WY
vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
.elseif ((i & 7) == 1)
vpxor	WY_minus_08, WY, WY
vpxor	WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
vpxor	WY_TMP, WY, WY
vpslldq	$12, WY, WY_TMP2
.elseif ((i & 7) == 3)
vpslld	$1, WY, WY_TMP
vpsrld	$31, WY, WY
.elseif ((i & 7) == 4)
vpor	WY, WY_TMP, WY_TMP
vpslld	$2, WY_TMP2, WY
.elseif ((i & 7) == 5)
vpsrld	$30, WY_TMP2, WY_TMP2
vpxor	WY, WY_TMP, WY_TMP
.elseif ((i & 7) == 7)
vpxor	WY_TMP2, WY_TMP, WY
vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu	WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC_32_79
/*
 * in the SHA-1 specification:
 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * instead we compute the equivalent:
 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization,
 * since the w[i]->w[i-3] dependency is broken
 */
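/*
 * The equivalence (valid for i >= 32) follows from expanding each term
 * of the standard recurrence one more level: the cross terms cancel in
 * pairs under xor and the two rol 1 steps combine into rol 2. The
 * smallest remaining lag is 6 instead of 3, so the four dwords in each
 * 128-bit lane can be produced independently.
 */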

.if   ((i & 7) == 0)
/*
 * blended AVX2 and ALU instruction scheduling
 * 1 vector iteration per 8 rounds
 */
vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
.elseif ((i & 7) == 1)
/* W is W_minus_32 before xor */
vpxor	WY_minus_28, WY, WY
.elseif ((i & 7) == 2)
vpxor	WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 3)
vpxor	WY_TMP, WY, WY
.elseif ((i & 7) == 4)
vpslld	$2, WY, WY_TMP
.elseif ((i & 7) == 5)
vpsrld	$30, WY, WY
vpor	WY, WY_TMP, WY
.elseif ((i & 7) == 7)
vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu	WY_TMP, PRECALC_WK(i&~7)

PRECALC_ROTATE_WY
.endif
.endm

.macro PRECALC r, s
.set i, \r

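/*
 * i counts precalc steps, two per round (both blocks of a pair are
 * scheduled together), so i < 40 covers rounds 0-19 (K1), i < 80
 * rounds 20-39 (K2), i < 120 rounds 40-59 (K3), and the rest K4.
 * K_XMM selects the matching 32-byte constant within K_XMM_AR.
 */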
.if (i < 40)
.set K_XMM, 32*0
.elseif (i < 80)
.set K_XMM, 32*1
.elseif (i < 120)
.set K_XMM, 32*2
.else
.set K_XMM, 32*3
.endif

.if (i<32)
PRECALC_00_15	\s
.elseif (i<64)
PRECALC_16_31	\s
.elseif (i < 160)
PRECALC_32_79	\s
.endif
.endm

.macro ROTATE_STATE
.set T_REG, E
.set E, D
.set D, C
.set C, B
.set B, TB
.set TB, A
.set A, T_REG

.set T_REG, RE
.set RE, RD
.set RD, RC
.set RC, RB
.set RB, RTB
.set RTB, RA
.set RA, T_REG
.endm

/* Macro relies on saved ROUND_Fx */

.macro RND_FUN f, r
.if (\f == RND_F1)
ROUND_F1	\r
.elseif (\f == RND_F2)
ROUND_F2	\r
.elseif (\f == RND_F3)
ROUND_F3	\r
.endif
.endm

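/*
 * RR executes two consecutive rounds, \r and \r+1. Each ROUND_Fx adds
 * the F value precomputed by the previous round and computes F for the
 * round after it, which is why ROUND_FUNC switches at round_id
 * 18/38/58: round 19 already has to compute F2 for round 20, and so on.
 */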
.macro RR r
.set round_id, (\r % 80)

.if (round_id == 0)        /* Precalculate F for first round */
.set ROUND_FUNC, RND_F1
mov	B, TB

rorx	$(32-30), B, B    /* b>>>2 */
andn	D, TB, T1
and	C, TB
xor	T1, TB
.endif

RND_FUN ROUND_FUNC, \r
ROTATE_STATE

.if   (round_id == 18)
.set ROUND_FUNC, RND_F2
.elseif (round_id == 38)
.set ROUND_FUNC, RND_F3
.elseif (round_id == 58)
.set ROUND_FUNC, RND_F2
.endif

.set round_id, ( (\r+1) % 80)

RND_FUN ROUND_FUNC, (\r+1)
ROTATE_STATE
.endm

.macro ROUND_F1 r
add	WK(\r), E

andn	C, A, T1			/* ~b&d */
lea	(RE,RTB), E		/* Add F from the previous round */

rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
rorx	$(32-30),A, TB		/* b>>>2 for next round */

PRECALC	(\r)			/* msg scheduling for next 2 blocks */

/*
 * Calculate F for the next round
 * (b & c) ^ andn[b, d]
 */
and	B, A			/* b&c */
xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

lea	(RE,RTA), E		/* E += A >>> 5 */
.endm

.macro ROUND_F2 r
add	WK(\r), E
lea	(RE,RTB), E		/* Add F from the previous round */

/* Calculate F for the next round */
rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
.if ((round_id) < 79)
rorx	$(32-30), A, TB	/* b>>>2 for next round */
.endif
PRECALC	(\r)			/* msg scheduling for next 2 blocks */

.if ((round_id) < 79)
xor	B, A
.endif

add	TA, E			/* E += A >>> 5 */

.if ((round_id) < 79)
xor	C, A
.endif
.endm

.macro ROUND_F3 r
add	WK(\r), E
PRECALC	(\r)			/* msg scheduling for next 2 blocks */

lea	(RE,RTB), E		/* Add F from the previous round */

mov	B, T1
or	A, T1

rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
rorx	$(32-30), A, TB		/* b>>>2 for next round */

/* Calculate F for the next round
 * (b and c) or (d and (b or c))
 */
and	C, T1
and	B, A
or	T1, A

add	TA, E			/* E += A >>> 5 */

.endm

/*
 * Advance a buffer pointer only when enough blocks remain; uses RTA
 * as a temporary:
 *	\a += (\b >= \c) ? \d : 0
 */
.macro ADD_IF_GE a, b, c, d
mov     \a, RTA
add     $\d, RTA
cmp     $\c, \b
cmovge  RTA, \a
.endm
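/*
 * When fewer blocks remain than the pipeline expects, ADD_IF_GE leaves
 * the pointer unchanged, so the speculative precalc for the "next"
 * block re-reads data that was already processed instead of reading
 * past the end of the input; its results are simply never consumed.
 */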

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

REGALLOC

mov	(HASH_PTR), A
mov	4(HASH_PTR), B
mov	8(HASH_PTR), C
mov	12(HASH_PTR), D
mov	16(HASH_PTR), E

mov	%rsp, PRECALC_BUF
lea	(2*4*80+32)(%rsp), WK_BUF

# Precalc WK for first 2 blocks
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept    160
PRECALC i
.set i, i + 1
.endr

/* Go to next block if needed */
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
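/*
 * WK_BUF and PRECALC_BUF name the two halves of the stack area: the
 * rounds read WK(t) from one half while PRECALC stores the schedule
 * for the next two blocks into the other; they swap between passes.
 */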
xchg	WK_BUF, PRECALC_BUF

.align 32
.L_loop:
/*
 * code loops through more than one block
 * BLOCKS_CTR reaching zero signals that the last block has already
 * been processed
 */
test BLOCKS_CTR, BLOCKS_CTR
jnz .L_begin
.align 32
jmp	.L_end
.align 32
.L_begin:

/*
 * Do first block
 * rounds: 0,2,4,6,8
 */
.set j, 0
.rept 5
RR	j
.set j, j+2
.endr

/*
 * rounds:
 * 10,12,14,16,18
 * 20,22,24,26,28
 * 30,32,34,36,38
 * 40,42,44,46,48
 * 50,52,54,56,58
 */
.rept 25
RR	j
.set j, j+2
.endr

/* Update Counter */
sub $1, BLOCKS_CTR
/* Move to the next block only if needed */
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
/*
 * rounds
 * 60,62,64,66,68
 * 70,72,74,76,78
 */
.rept 10
RR	j
.set j, j+2
.endr

UPDATE_HASH	(HASH_PTR), A
UPDATE_HASH	4(HASH_PTR), TB
UPDATE_HASH	8(HASH_PTR), C
UPDATE_HASH	12(HASH_PTR), D
UPDATE_HASH	16(HASH_PTR), E

test	BLOCKS_CTR, BLOCKS_CTR
jz	.L_loop

mov	TB, B

/* Process second block */
/*
 * rounds
 *  0+80, 2+80, 4+80, 6+80, 8+80
 * 10+80,12+80,14+80,16+80,18+80
 */

.set j, 0
.rept 10
RR	j+80
.set j, j+2
.endr

/*
 * rounds
 * 20+80,22+80,24+80,26+80,28+80
 * 30+80,32+80,34+80,36+80,38+80
 */
.rept 10
RR	j+80
.set j, j+2
.endr

/*
 * rounds
 * 40+80,42+80,44+80,46+80,48+80
 * 50+80,52+80,54+80,56+80,58+80
 */
.rept 10
RR	j+80
.set j, j+2
.endr

/* update counter */
sub     $1, BLOCKS_CTR
/* Move to the next block only if needed */
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

/*
 * rounds
 * 60+80,62+80,64+80,66+80,68+80
 * 70+80,72+80,74+80,76+80,78+80
 */
.rept 10
RR	j+80
.set j, j+2
.endr

UPDATE_HASH	(HASH_PTR), A
UPDATE_HASH	4(HASH_PTR), TB
UPDATE_HASH	8(HASH_PTR), C
UPDATE_HASH	12(HASH_PTR), D
UPDATE_HASH	16(HASH_PTR), E

/* Reset state for AVX2 reg permutation */
mov	A, TA
mov	TB, A
mov	C, TB
mov	E, C
mov	D, B
mov	TA, D

REGALLOC

xchg	WK_BUF, PRECALC_BUF

jmp	.L_loop

.align 32
.L_end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
SYM_FUNC_START(\name)

push	%rbx
push	%r12
push	%r13
push	%r14
push	%r15

RESERVE_STACK  = (W_SIZE*4 + 8+24)

/* Align stack */
push	%rbp
mov	%rsp, %rbp
and	$~(0x20-1), %rsp
sub	$RESERVE_STACK, %rsp

avx2_zeroupper

/* Setup initial values */
mov	CTX, HASH_PTR
mov	BUF, BUFFER_PTR

mov	BUF, BUFFER_PTR2
mov	CNT, BLOCKS_CTR

xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

SHA1_PIPELINED_MAIN_BODY

avx2_zeroupper

mov	%rbp, %rsp
pop	%rbp

pop	%r15
pop	%r14
pop	%r13
pop	%r12
pop	%rbx

RET

SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
.long K1, K1, K1, K1
.long K1, K1, K1, K1
.long K2, K2, K2, K2
.long K2, K2, K2, K2
.long K3, K3, K3, K3
.long K3, K3, K3, K3
.long K4, K4, K4, K4
.long K4, K4, K4, K4

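/*
 * vpshufb control: byte-swaps each dword (the message is big-endian),
 * replicated across both 128-bit lanes.
 */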
BSWAP_SHUFB_CTL:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM     sha1_transform_avx2