/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>

#include <linux/linkage.h>
#include <linux/objtool.h>

// Offsets within the generated constants table
.set OFFSETOF_BSWAP_MASK,			-5*16	// msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS,	-4*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS,	-3*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS,	-2*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS,	-1*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS,	0*16	// must be 0
.set OFFSETOF_SHUF_TABLE,			1*16
.set OFFSETOF_BARRETT_REDUCTION_CONSTS,		4*16

|---|
// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
// corresponding non-VEX instruction plus any needed moves.  The supported
// instruction formats are:
//
//     - Two-arg [src, dst], where the non-VEX format is the same.
//     - Three-arg [src1, src2, dst] where the non-VEX format is
//	 [src1, src2_and_dst].  If src2 != dst, then src1 must != dst too.
//
// \insn gives the instruction without a "v" prefix and including any immediate
// argument if needed to make the instruction follow one of the above formats.
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
// it first; this is needed when \arg1 is an unaligned mem operand.
.macro	_cond_vex	insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
.if AVX_LEVEL == 0
	// VEX not allowed.  Emulate it.
	.ifnb \arg3 // Three-arg [src1, src2, dst]
		.ifc "\arg2", "\arg3" // src2 == dst?
			.ifnb \unaligned_mem_tmp
				movdqu		\arg1, \unaligned_mem_tmp
				\insn		\unaligned_mem_tmp, \arg3
			.else
				\insn		\arg1, \arg3
			.endif
		.else // src2 != dst
			.ifc "\arg1", "\arg3"
				.error "Can't have src1 == dst when src2 != dst"
			.endif
			.ifnb \unaligned_mem_tmp
				movdqu		\arg1, \unaligned_mem_tmp
				movdqa		\arg2, \arg3
				\insn		\unaligned_mem_tmp, \arg3
			.else
				movdqa		\arg2, \arg3
				\insn		\arg1, \arg3
			.endif
		.endif
	.else // Two-arg [src, dst]
		.ifnb \unaligned_mem_tmp
			movdqu		\arg1, \unaligned_mem_tmp
			\insn		\unaligned_mem_tmp, \arg2
		.else
			\insn		\arg1, \arg2
		.endif
	.endif
.else
	// VEX is allowed.  Emit the desired instruction directly.
	.ifnb \arg3
		v\insn		\arg1, \arg2, \arg3
	.else
		v\insn		\arg1, \arg2
	.endif
.endif
.endm
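//
// For example, "_cond_vex pxor, %xmm1, %xmm0, %xmm0" (three-arg form with
// src2 == dst, as used later in this file) assembles to just
// "pxor %xmm1, %xmm0" when AVX_LEVEL == 0, and to "vpxor %xmm1, %xmm0, %xmm0"
// when VEX is allowed.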

// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// register of length VL.
.macro	_vbroadcast	src, dst
.if VL == 16
	_cond_vex movdqa,	\src, \dst
.elseif VL == 32
	vbroadcasti128		\src, \dst
.else
	vbroadcasti32x4		\src, \dst
.endif
.endm

// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
.macro	_load_data	vl, src, bswap_mask, dst
.if \vl < 64
	_cond_vex movdqu,	"\src", \dst
.else
	vmovdqu8		\src, \dst
.endif
.if !LSB_CRC
	_cond_vex pshufb,	\bswap_mask, \dst, \dst
.endif
.endm

.macro	_prepare_v0	vl, v0, v1, bswap_mask
.if LSB_CRC
	.if \vl < 64
		_cond_vex pxor,		(BUF), \v0, \v0, unaligned_mem_tmp=\v1
	.else
		vpxorq			(BUF), \v0, \v0
	.endif
.else
	_load_data		\vl, (BUF), \bswap_mask, \v1
	.if \vl < 64
		_cond_vex pxor,		\v1, \v0, \v0
	.else
		vpxorq			\v1, \v0, \v0
	.endif
.endif
.endm

// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
#define LO64_TERMS 0

// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
#define HI64_TERMS 1

// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
.macro	_pclmulqdq	src1, src1_terms, src2, src2_terms, dst
	_cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
		  \src1, \src2, \dst
.endm
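//
// For example, with LSB_CRC == 1 the invocation
// "_pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp" (as used in
// _fold_vec below) uses the immediate 0x00, since both operands' x^64..x^127
// terms are in the physically low qwords; with LSB_CRC == 0 the same
// invocation uses the immediate 0x11.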

// Fold \acc into \data and store the result back into \acc.  \data can be an
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register.  \consts
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register.  All arguments must be the same length.
.macro	_fold_vec	acc, data, consts, tmp
	_pclmulqdq	\consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
	_pclmulqdq	\consts, LO64_TERMS, \acc, LO64_TERMS, \acc
.if AVX_LEVEL <= 2
	_cond_vex pxor,	\data, \tmp, \tmp
	_cond_vex pxor,	\tmp, \acc, \acc
.else
	vpternlogq	$0x96, \data, \tmp, \acc
.endif
.endm
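//
// (Note: 0x96 is the truth table of a three-way XOR, so the single vpternlogq
// above computes \acc := \acc ^ \tmp ^ \data, matching the two-pxor fallback.)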

// Fold \acc into \data and store the result back into \acc.  \data is an
// unaligned mem operand, \consts is a vector register containing the needed
// fold constants, \bswap_mask is a vector register containing the
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// temporary vector registers.  All arguments must have length \vl.
.macro	_fold_vec_mem	vl, acc, data, consts, bswap_mask, tmp1, tmp2
.if AVX_LEVEL == 0 || !LSB_CRC
	_load_data	\vl, \data, \bswap_mask, \tmp1
	_fold_vec	\acc, \tmp1, \consts, \tmp2
.else
	_fold_vec	\acc, \data, \consts, \tmp1
.endif
.endm

// Load the constants for folding across 2**i vectors of length VL at a time
// into all 128-bit lanes of the vector register CONSTS.
.macro	_load_vec_folding_consts	i
	_vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
		    CONSTS
.endm
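//
// For example, with VL == 32 (LOG2_VL == 5) and \i == 2, the offset works out
// to (4-5-2)*16 = OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, i.e. the constants
// for folding across 2**2 vectors * 32 bytes = 1024 bits.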

// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
// the result back into \v0.  If the remaining length mod \vl is nonzero, also
// fold \vl data bytes from BUF.  For both operations the fold distance is \vl.
// \consts must be a register of length \vl containing the fold constants.
.macro	_fold_vec_final	vl, v0, v1, consts, bswap_mask, tmp1, tmp2
	_fold_vec	\v0, \v1, \consts, \tmp1
	test		$\vl, LEN8
	jz		.Lfold_vec_final_done\@
	_fold_vec_mem	\vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
	add		$\vl, BUF
.Lfold_vec_final_done\@:
.endm

// This macro generates the body of a CRC function with the following prototype:
//
// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
//
// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
// |buf| is the data to checksum.  |len| is the data length in bytes, which must
// be at least 16.  |consts| is a pointer to the fold_across_128_bits_consts
// field of the constants struct that was generated for the chosen CRC variant.
//
// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
// 32 for a CRC-32.  Currently the supported values are 8, 16, 32, and 64.  If
// the file is compiled in i386 mode, then the maximum supported value is 32.
//
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0.  \lsb_crc is 0
// if the CRC processes the most significant bit of each byte first, i.e. maps
// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1.  (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
// If \vl == 64 && \avx_level == 512, the generated code requires:
// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
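//
// For example, invoking this macro with \n == 32, \lsb_crc == 1, \vl == 64,
// and \avx_level == 512 generates the body of an AVX512 implementation of a
// bit-reflected CRC-32 that uses 64-byte vectors.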
.macro	_crc_pclmul	n, lsb_crc, vl, avx_level
	.set	LSB_CRC,	\lsb_crc
	.set	VL,		\vl
	.set	AVX_LEVEL,	\avx_level

	// Define aliases for the xmm, ymm, or zmm registers according to VL.
.irp i, 0,1,2,3,4,5,6,7
	.if VL == 16
		.set	V\i,		%xmm\i
		.set	LOG2_VL,	4
	.elseif VL == 32
		.set	V\i,		%ymm\i
		.set	LOG2_VL,	5
	.elseif VL == 64
		.set	V\i,		%zmm\i
		.set	LOG2_VL,	6
	.else
		.error "Unsupported vector length"
	.endif
.endr
	// Define aliases for the function parameters.
	// Note: when crc_t is shorter than u32, zero-extension to 32 bits is
	// guaranteed by the ABI.  Zero-extension to 64 bits is *not* guaranteed
	// when crc_t is shorter than u64.
#ifdef __x86_64__
	.if \n <= 32
		.set	CRC,		%edi
	.else
		.set	CRC,		%rdi
	.endif
	.set	BUF,		%rsi
	.set	LEN,		%rdx
	.set	LEN32,		%edx
	.set	LEN8,		%dl
	.set	CONSTS_PTR,	%rcx
#else
	// 32-bit support, assuming -mregparm=3 and not including support for
	// CRC-64 (which would use both eax and edx to pass the crc parameter).
	.set	CRC,		%eax
	.set	BUF,		%edx
	.set	LEN,		%ecx
	.set	LEN32,		%ecx
	.set	LEN8,		%cl
	.set	CONSTS_PTR,	%ebx	// Passed on stack
#endif

	// Define aliases for some local variables.  V0-V5 are used without
	// aliases (for accumulators, data, temporary values, etc).  Staying
	// within the first 8 vector registers keeps the code 32-bit SSE
	// compatible and reduces the size of 64-bit SSE code slightly.
	.set	BSWAP_MASK,	V6
	.set	BSWAP_MASK_YMM,	%ymm6
	.set	BSWAP_MASK_XMM,	%xmm6
	.set	CONSTS,		V7
	.set	CONSTS_YMM,	%ymm7
	.set	CONSTS_XMM,	%xmm7

	// Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
	// functions generated by this macro are called only by static_call.
	ANNOTATE_NOENDBR

#ifdef __i386__
	push		CONSTS_PTR
	mov		8(%esp), CONSTS_PTR
#endif

	// Create a 128-bit vector that contains the initial CRC in the end
	// representing the high-order polynomial coefficients, and the rest 0.
	// If the CRC is msb-first, also load the byte-reflection table.
.if \n <= 32
	_cond_vex movd,	CRC, %xmm0
.else
	_cond_vex movq,	CRC, %xmm0
.endif
.if !LSB_CRC
	_cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
	_vbroadcast	OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
.endif

	// Load the first vector of data and XOR the initial CRC into the
	// appropriate end of the first 128-bit lane of data.  If LEN < VL, then
	// use a short vector and jump ahead to the final reduction.  (LEN >= 16
	// is guaranteed here but not necessarily LEN >= VL.)
.if VL >= 32
	cmp		$VL, LEN
	jae		.Lat_least_1vec\@
	.if VL == 64
		cmp		$32, LEN32
		jb		.Lless_than_32bytes\@
		_prepare_v0	32, %ymm0, %ymm1, BSWAP_MASK_YMM
		add		$32, BUF
		jmp		.Lreduce_256bits_to_128bits\@
.Lless_than_32bytes\@:
	.endif
	_prepare_v0	16, %xmm0, %xmm1, BSWAP_MASK_XMM
	add		$16, BUF
	vmovdqa		OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
	jmp		.Lcheck_for_partial_block\@
.Lat_least_1vec\@:
.endif
	_prepare_v0	VL, V0, V1, BSWAP_MASK

	// Handle VL <= LEN < 4*VL.
	cmp		$4*VL-1, LEN
	ja		.Lat_least_4vecs\@
	add		$VL, BUF
	// If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
	// If VL==16 then load fold_across_128_bits_consts first, as the final
	// reduction depends on it and it won't be loaded anywhere else.
	cmp		$2*VL-1, LEN32
.if VL == 16
	_cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
.endif
	jbe		.Lreduce_1vec_to_128bits\@
	// Otherwise 2*VL <= LEN < 4*VL.  Load one more vector and jump ahead to
	// the reduction from 2 vectors.
	_load_data	VL, (BUF), BSWAP_MASK, V1
	add		$VL, BUF
	jmp		.Lreduce_2vecs_to_1\@

.Lat_least_4vecs\@:
	// Load 3 more vectors of data.
	_load_data	VL, 1*VL(BUF), BSWAP_MASK, V1
	_load_data	VL, 2*VL(BUF), BSWAP_MASK, V2
	_load_data	VL, 3*VL(BUF), BSWAP_MASK, V3
	sub		$-4*VL, BUF	// Shorter than 'add 4*VL' when VL=32
	add		$-4*VL, LEN	// Shorter than 'sub 4*VL' when VL=32

	// Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
	// 4 vectors of data and write the result back to V0-V3.
	cmp		$4*VL-1, LEN	// Shorter than 'cmp 4*VL' when VL=32
	jbe		.Lreduce_4vecs_to_2\@
	_load_vec_folding_consts	2
.Lfold_4vecs_loop\@:
	_fold_vec_mem	VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	sub		$-4*VL, BUF
	add		$-4*VL, LEN
	cmp		$4*VL-1, LEN
	ja		.Lfold_4vecs_loop\@

	// Fold V0,V1 into V2,V3 and write the result back to V0,V1.  Then fold
	// two more vectors of data from BUF, if at least that much remains.
.Lreduce_4vecs_to_2\@:
	_load_vec_folding_consts	1
	_fold_vec	V0, V2, CONSTS, V4
	_fold_vec	V1, V3, CONSTS, V4
	test		$2*VL, LEN8
	jz		.Lreduce_2vecs_to_1\@
	_fold_vec_mem	VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	sub		$-2*VL, BUF

	// Fold V0 into V1 and write the result back to V0.  Then fold one more
	// vector of data from BUF, if at least that much remains.
.Lreduce_2vecs_to_1\@:
	_load_vec_folding_consts	0
	_fold_vec_final	VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5

.Lreduce_1vec_to_128bits\@:
.if VL == 64
	// Reduce 512-bit %zmm0 to 256-bit %ymm0.  Then fold 256 more bits of
	// data from BUF, if at least that much remains.
	vbroadcasti128	OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
	vextracti64x4	$1, %zmm0, %ymm1
	_fold_vec_final	32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
.Lreduce_256bits_to_128bits\@:
.endif
.if VL >= 32
	// Reduce 256-bit %ymm0 to 128-bit %xmm0.  Then fold 128 more bits of
	// data from BUF, if at least that much remains.
	vmovdqa		OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
	vextracti128	$1, %ymm0, %xmm1
	_fold_vec_final	16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
.Lcheck_for_partial_block\@:
.endif
	and		$15, LEN32
	jz		.Lreduce_128bits_to_crc\@

	// 1 <= LEN <= 15 data bytes remain in BUF.  The polynomial is now
	// A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
	// and B is the polynomial of the remaining LEN data bytes.  To reduce
	// this to 128 bits without needing fold constants for each possible
	// LEN, rearrange this expression into C1*(x^128) + C2, where
	// C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
	// Then fold C1 into C2, which is just another fold across 128 bits.
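	// For example, if LEN == 3, then C1 is the highest 24 coefficients
	// (3 bytes) of A and C2 = (A*x^24 mod x^128) + B, so that
	// C1*(x^128) + C2 = A*x^24 + B as required.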

.if !LSB_CRC || AVX_LEVEL == 0
	// Load the last 16 data bytes.  Note that originally LEN was >= 16.
	_load_data	16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
.endif // Else will use vpblendvb mem operand later.
.if !LSB_CRC
	neg		LEN	// Needed for indexing shuf_table
.endif

	// tmp = A*x^(8*LEN) mod x^128
	// lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
	//	i.e. right-shift by LEN bytes.
	// msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
	//	i.e. left-shift by LEN bytes.
	_cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
	_cond_vex pshufb,	%xmm3, %xmm0, %xmm1

	// C1 = floor(A / x^(128 - 8*LEN))
	// lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
	//	i.e. left-shift by 16-LEN bytes.
	// msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
	//	i.e. right-shift by 16-LEN bytes.
	_cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
			  %xmm0, %xmm0, unaligned_mem_tmp=%xmm4

	// C2 = tmp + B.  This is just a blend of tmp with the last 16 data
	// bytes (reflected if msb-first).  The blend mask is the shuffle table
	// that was used to create tmp: 0 selects tmp, and 1 selects
	// last16databytes.
.if AVX_LEVEL == 0
	movdqa		%xmm0, %xmm4
	movdqa		%xmm3, %xmm0
	pblendvb	%xmm2, %xmm1	// uses %xmm0 as implicit operand
	movdqa		%xmm4, %xmm0
.elseif LSB_CRC
	vpblendvb	%xmm3, -16(BUF,LEN), %xmm1, %xmm1
.else
	vpblendvb	%xmm3, %xmm2, %xmm1, %xmm1
.endif

	// Fold C1 into C2 and store the 128-bit result in %xmm0.
	_fold_vec	%xmm0, %xmm1, CONSTS_XMM, %xmm4

.Lreduce_128bits_to_crc\@:
	// Compute the CRC as %xmm0 * x^n mod G.  Here %xmm0 means the 128-bit
	// polynomial stored in %xmm0 (using either lsb-first or msb-first bit
	// order according to LSB_CRC), and G is the CRC's generator polynomial.

	// First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
	//
	//	t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
	//	      x^n * (%xmm0 mod x^64)
	//
	// Store t0 * x^(64-n) in %xmm0.  I.e., actually do:
	//
	//	%xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
	//		 x^64 * (%xmm0 mod x^64)
	//
	// The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
	// to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
	// select it.  The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
	// msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
	// (considering the extra factor of x that gets implicitly introduced by
	// each pclmulqdq when using lsb-first order), is identical to the
	// constant that was used earlier for folding the LO64_TERMS across 128
	// bits.  Thus it's already available in LO64_TERMS of CONSTS_XMM.
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if LSB_CRC
	_cond_vex psrldq,	$8, %xmm0, %xmm0  // x^64 * (%xmm0 mod x^64)
.else
	_cond_vex pslldq,	$8, %xmm0, %xmm0  // x^64 * (%xmm0 mod x^64)
.endif
	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
	// The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
	// The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).

	// First step of Barrett reduction: Compute floor(t0 / G).  This is the
	// polynomial by which G needs to be multiplied to cancel out the x^n
	// and higher terms of t0, i.e. to reduce t0 mod G.  First do:
	//
	//	t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
	//
	// Then the desired value floor(t0 / G) is floor(t1 / x^64).  The 63 in
	// x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
	// value that makes enough precision be carried through the calculation.
	//
	// The '* x' makes it so the result is floor(t1 / x^64) rather than
	// floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
	// can be extracted much more easily in the next step.  In the lsb-first
	// case the '* x' happens implicitly.  In the msb-first case it must be
	// done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
	// constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
	// the multiplication by the x^64 term is handled using a pxor.  The
	// pxor causes the low 64 terms of t1 to be wrong, but they are unused.
	_cond_vex movdqa,	OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
	_pclmulqdq		CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if !LSB_CRC
	_cond_vex pxor,		%xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
.endif
	// The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).

	// Second step of Barrett reduction: Cancel out the x^n and higher terms
	// of t0 by subtracting the needed multiple of G.  This gives the CRC:
	//
	//	crc := t0 - (G * floor(t0 / G))
	//
	// But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
	//
	//	crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
	//
	// Furthermore, since the resulting CRC is n-bit, if mod x^n is
	// explicitly applied to it then the x^n term of G makes no difference
	// in the result and can be omitted.  This helps keep the constant
	// multiplier in 64 bits in most cases.  This gives the following:
	//
	//	%xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
	//	crc := (%xmm0 / x^(64-n)) mod x^n
	//
	// In the lsb-first case, each pclmulqdq implicitly introduces
	// an extra factor of x, so in that case the constant that needs to be
	// passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
	// For lsb-first CRCs where n=64, the extra factor of x cannot be as
	// easily avoided.  In that case, instead pass '(G - x^n - x^0) / x' to
	// pclmulqdq and handle the x^0 term (i.e. 1) separately.  (All CRC
	// polynomials have nonzero x^n and x^0 terms.)  It works out as: the
	// CRC has to be XORed with the physically low qword of %xmm1,
	// representing floor(t0 / G).  The most efficient way to do that is to
	// move it to the physically high qword and use a ternlog to combine the
	// two XORs.
.if LSB_CRC && \n == 64
	_cond_vex punpcklqdq,	%xmm1, %xmm2, %xmm2
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
	.if AVX_LEVEL <= 2
		_cond_vex pxor,		%xmm2, %xmm0, %xmm0
		_cond_vex pxor,		%xmm1, %xmm0, %xmm0
	.else
		vpternlogq		$0x96, %xmm2, %xmm1, %xmm0
	.endif
	_cond_vex "pextrq $1,",	%xmm0, %rax  // (%xmm0 / x^0) mod x^64
.else
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
	.if \n == 8
		_cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
	.elseif \n == 16
		_cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
	.elseif \n == 32
		_cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
	.else // \n == 64 && !LSB_CRC
		_cond_vex movq,		%xmm0, %rax  // (%xmm0 / x^0) mod x^64
	.endif
.endif

.if VL > 16
	vzeroupper	// Needed when ymm or zmm registers may have been used.
.endif
#ifdef __i386__
	pop		CONSTS_PTR
#endif
	RET
.endm

#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
SYM_FUNC_START(prefix##_pclmul_sse);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
SYM_FUNC_END(prefix##_pclmul_sse);					\
									\
SYM_FUNC_START(prefix##_vpclmul_avx2);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=32, avx_level=2;	\
SYM_FUNC_END(prefix##_vpclmul_avx2);					\
									\
SYM_FUNC_START(prefix##_vpclmul_avx512);				\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=64, avx_level=512;	\
SYM_FUNC_END(prefix##_vpclmul_avx512);
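
// For example, an lsb-first CRC-32 user of this template could instantiate it
// with "DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)",
// which would define crc32_lsb_pclmul_sse(), crc32_lsb_vpclmul_avx2(), and
// crc32_lsb_vpclmul_avx512().  (The "crc32_lsb" prefix here is only an
// example; the including file chooses its own prefix.)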