/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>

#include <linux/linkage.h>
#include <linux/objtool.h>

// Offsets within the generated constants table
.set OFFSETOF_BSWAP_MASK,			-5*16	// msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS,	-4*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS,	-3*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS,	-2*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS,	-1*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS,	0*16	// must be 0
.set OFFSETOF_SHUF_TABLE,			1*16
.set OFFSETOF_BARRETT_REDUCTION_CONSTS,		4*16

|---|
// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
// corresponding non-VEX instruction plus any needed moves.  The supported
// instruction formats are:
//
//     - Two-arg [src, dst], where the non-VEX format is the same.
//     - Three-arg [src1, src2, dst] where the non-VEX format is
//	 [src1, src2_and_dst].  If src2 != dst, then src1 must != dst too.
//
// \insn gives the instruction without a "v" prefix and including any immediate
// argument if needed to make the instruction follow one of the above formats.
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
// it first; this is needed when \arg1 is an unaligned mem operand.
.macro	_cond_vex	insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
.if AVX_LEVEL == 0
	// VEX not allowed.  Emulate it.
	.ifnb \arg3 // Three-arg [src1, src2, dst]
		.ifc "\arg2", "\arg3" // src2 == dst?
			.ifnb \unaligned_mem_tmp
				movdqu		\arg1, \unaligned_mem_tmp
				\insn		\unaligned_mem_tmp, \arg3
			.else
				\insn		\arg1, \arg3
			.endif
		.else // src2 != dst
			.ifc "\arg1", "\arg3"
				.error "Can't have src1 == dst when src2 != dst"
			.endif
			.ifnb \unaligned_mem_tmp
				movdqu		\arg1, \unaligned_mem_tmp
				movdqa		\arg2, \arg3
				\insn		\unaligned_mem_tmp, \arg3
			.else
				movdqa		\arg2, \arg3
				\insn		\arg1, \arg3
			.endif
		.endif
	.else // Two-arg [src, dst]
		.ifnb \unaligned_mem_tmp
			movdqu		\arg1, \unaligned_mem_tmp
			\insn		\unaligned_mem_tmp, \arg2
		.else
			\insn		\arg1, \arg2
		.endif
	.endif
.else
	// VEX is allowed.  Emit the desired instruction directly.
	.ifnb \arg3
		v\insn		\arg1, \arg2, \arg3
	.else
		v\insn		\arg1, \arg2
	.endif
.endif
.endm
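//
// For example, "_cond_vex pxor, %xmm1, %xmm0, %xmm0" (three-arg form with
// src2 == dst, as used later in this file) assembles to just
// "pxor %xmm1, %xmm0" when AVX_LEVEL == 0, and to "vpxor %xmm1, %xmm0, %xmm0"
// when VEX is allowed.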

// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// register of length VL.
.macro	_vbroadcast	src, dst
.if VL == 16
	_cond_vex movdqa,	\src, \dst
.elseif VL == 32
	vbroadcasti128		\src, \dst
.else
	vbroadcasti32x4		\src, \dst
.endif
.endm

// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
.macro	_load_data	vl, src, bswap_mask, dst
.if \vl < 64
	_cond_vex movdqu,	"\src", \dst
.else
	vmovdqu8		\src, \dst
.endif
.if !LSB_CRC
	_cond_vex pshufb,	\bswap_mask, \dst, \dst
.endif
.endm

.macro	_prepare_v0	vl, v0, v1, bswap_mask
.if LSB_CRC
	.if \vl < 64
		_cond_vex pxor,		(BUF), \v0, \v0, unaligned_mem_tmp=\v1
	.else
		vpxorq			(BUF), \v0, \v0
	.endif
.else
	_load_data		\vl, (BUF), \bswap_mask, \v1
	.if \vl < 64
		_cond_vex pxor,		\v1, \v0, \v0
	.else
		vpxorq			\v1, \v0, \v0
	.endif
.endif
.endm

// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
#define LO64_TERMS 0

// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
#define HI64_TERMS 1

// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
.macro	_pclmulqdq	src1, src1_terms, src2, src2_terms, dst
	_cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
		  \src1, \src2, \dst
.endm
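//
// For example, with LSB_CRC == 1 the invocation
// "_pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp" (as used in
// _fold_vec below) uses the immediate 0x00, since both operands' x^64..x^127
// terms are in the physically low qwords; with LSB_CRC == 0 the same
// invocation uses the immediate 0x11.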

// Fold \acc into \data and store the result back into \acc.  \data can be an
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register.  \consts
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register.  All arguments must be the same length.
.macro	_fold_vec	acc, data, consts, tmp
	_pclmulqdq	\consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
	_pclmulqdq	\consts, LO64_TERMS, \acc, LO64_TERMS, \acc
.if AVX_LEVEL <= 2
	_cond_vex pxor,	\data, \tmp, \tmp
	_cond_vex pxor,	\tmp, \acc, \acc
.else
	vpternlogq	$0x96, \data, \tmp, \acc
.endif
.endm
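//
// (Note: 0x96 is the truth table of a three-way XOR, so the single vpternlogq
// above computes \acc := \acc ^ \tmp ^ \data, matching the two-pxor fallback.)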

// Fold \acc into \data and store the result back into \acc.  \data is an
// unaligned mem operand, \consts is a vector register containing the needed
// fold constants, \bswap_mask is a vector register containing the
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// temporary vector registers.  All arguments must have length \vl.
.macro	_fold_vec_mem	vl, acc, data, consts, bswap_mask, tmp1, tmp2
.if AVX_LEVEL == 0 || !LSB_CRC
	_load_data	\vl, \data, \bswap_mask, \tmp1
	_fold_vec	\acc, \tmp1, \consts, \tmp2
.else
	_fold_vec	\acc, \data, \consts, \tmp1
.endif
.endm

// Load the constants for folding across 2**i vectors of length VL at a time
// into all 128-bit lanes of the vector register CONSTS.
.macro	_load_vec_folding_consts	i
	_vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
		    CONSTS
.endm
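//
// For example, with VL == 32 (LOG2_VL == 5) and \i == 2, the offset works out
// to (4-5-2)*16 = OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, i.e. the constants
// for folding across 2**2 vectors * 32 bytes = 1024 bits.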

// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
// the result back into \v0.  If the remaining length mod \vl is nonzero, also
// fold \vl data bytes from BUF.  For both operations the fold distance is \vl.
// \consts must be a register of length \vl containing the fold constants.
.macro	_fold_vec_final	vl, v0, v1, consts, bswap_mask, tmp1, tmp2
	_fold_vec	\v0, \v1, \consts, \tmp1
	test		$\vl, LEN8
	jz		.Lfold_vec_final_done\@
	_fold_vec_mem	\vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
	add		$\vl, BUF
.Lfold_vec_final_done\@:
.endm

// This macro generates the body of a CRC function with the following prototype:
//
// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
//
// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
// |buf| is the data to checksum.  |len| is the data length in bytes, which must
// be at least 16.  |consts| is a pointer to the fold_across_128_bits_consts
// field of the constants struct that was generated for the chosen CRC variant.
//
// Moving on to the macro parameters, \n is the number of bits in the CRC, e.g.
// 32 for a CRC-32.  Currently the supported values are 8, 16, 32, and 64.  If
// the file is compiled in i386 mode, then the maximum supported value is 32.
//
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0.  \lsb_crc is 0
// if the CRC processes the most significant bit of each byte first, i.e. maps
// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1.  (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
// If \vl == 64 && \avx_level == 512, the generated code requires:
// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
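//
// For example, invoking this macro with \n == 32, \lsb_crc == 1, \vl == 64,
// and \avx_level == 512 generates the body of an AVX512 implementation of a
// bit-reflected CRC-32 that uses 64-byte vectors.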
.macro	_crc_pclmul	n, lsb_crc, vl, avx_level
	.set	LSB_CRC,	\lsb_crc
	.set	VL,		\vl
	.set	AVX_LEVEL,	\avx_level

	// Define aliases for the xmm, ymm, or zmm registers according to VL.
.irp i, 0,1,2,3,4,5,6,7
	.if VL == 16
		.set	V\i,		%xmm\i
		.set	LOG2_VL,	4
	.elseif VL == 32
		.set	V\i,		%ymm\i
		.set	LOG2_VL,	5
	.elseif VL == 64
		.set	V\i,		%zmm\i
		.set	LOG2_VL,	6
	.else
		.error "Unsupported vector length"
	.endif
.endr
	// Define aliases for the function parameters.
	// Note: when crc_t is shorter than u32, zero-extension to 32 bits is
	// guaranteed by the ABI.  Zero-extension to 64 bits is *not* guaranteed
	// when crc_t is shorter than u64.
#ifdef __x86_64__
	.if \n <= 32
		.set	CRC,		%edi
	.else
		.set	CRC,		%rdi
	.endif
	.set	BUF,		%rsi
	.set	LEN,		%rdx
	.set	LEN32,		%edx
	.set	LEN8,		%dl
	.set	CONSTS_PTR,	%rcx
#else
	// 32-bit support, assuming -mregparm=3 and not including support for
	// CRC-64 (which would use both eax and edx to pass the crc parameter).
	.set	CRC,		%eax
	.set	BUF,		%edx
	.set	LEN,		%ecx
	.set	LEN32,		%ecx
	.set	LEN8,		%cl
	.set	CONSTS_PTR,	%ebx	// Passed on stack
#endif

	// Define aliases for some local variables.  V0-V5 are used without
	// aliases (for accumulators, data, temporary values, etc).  Staying
	// within the first 8 vector registers keeps the code 32-bit SSE
	// compatible and reduces the size of 64-bit SSE code slightly.
	.set	BSWAP_MASK,	V6
	.set	BSWAP_MASK_YMM,	%ymm6
	.set	BSWAP_MASK_XMM,	%xmm6
	.set	CONSTS,		V7
	.set	CONSTS_YMM,	%ymm7
	.set	CONSTS_XMM,	%xmm7

	// Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
	// functions generated by this macro are called only by static_call.
	ANNOTATE_NOENDBR

#ifdef __i386__
	push		CONSTS_PTR
	mov		8(%esp), CONSTS_PTR
#endif

	// Create a 128-bit vector that contains the initial CRC in the end
	// representing the high-order polynomial coefficients, and the rest 0.
	// If the CRC is msb-first, also load the byte-reflection table.
.if \n <= 32
	_cond_vex movd,	CRC, %xmm0
.else
	_cond_vex movq,	CRC, %xmm0
.endif
.if !LSB_CRC
	_cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
	_vbroadcast	OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
.endif

	// Load the first vector of data and XOR the initial CRC into the
	// appropriate end of the first 128-bit lane of data.  If LEN < VL, then
	// use a short vector and jump ahead to the final reduction.  (LEN >= 16
	// is guaranteed here but not necessarily LEN >= VL.)
.if VL >= 32
	cmp		$VL, LEN
	jae		.Lat_least_1vec\@
	.if VL == 64
		cmp		$32, LEN32
		jb		.Lless_than_32bytes\@
		_prepare_v0	32, %ymm0, %ymm1, BSWAP_MASK_YMM
		add		$32, BUF
		jmp		.Lreduce_256bits_to_128bits\@
.Lless_than_32bytes\@:
	.endif
	_prepare_v0	16, %xmm0, %xmm1, BSWAP_MASK_XMM
	add		$16, BUF
	vmovdqa		OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
	jmp		.Lcheck_for_partial_block\@
.Lat_least_1vec\@:
.endif
	_prepare_v0	VL, V0, V1, BSWAP_MASK

	// Handle VL <= LEN < 4*VL.
	cmp		$4*VL-1, LEN
	ja		.Lat_least_4vecs\@
	add		$VL, BUF
	// If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
	// If VL==16 then load fold_across_128_bits_consts first, as the final
	// reduction depends on it and it won't be loaded anywhere else.
	cmp		$2*VL-1, LEN32
.if VL == 16
	_cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
.endif
	jbe		.Lreduce_1vec_to_128bits\@
	// Otherwise 2*VL <= LEN < 4*VL.  Load one more vector and jump ahead to
	// the reduction from 2 vectors.
	_load_data	VL, (BUF), BSWAP_MASK, V1
	add		$VL, BUF
	jmp		.Lreduce_2vecs_to_1\@

.Lat_least_4vecs\@:
	// Load 3 more vectors of data.
	_load_data	VL, 1*VL(BUF), BSWAP_MASK, V1
	_load_data	VL, 2*VL(BUF), BSWAP_MASK, V2
	_load_data	VL, 3*VL(BUF), BSWAP_MASK, V3
	sub		$-4*VL, BUF	// Shorter than 'add 4*VL' when VL=32
	add		$-4*VL, LEN	// Shorter than 'sub 4*VL' when VL=32

	// Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
	// 4 vectors of data and write the result back to V0-V3.
	cmp		$4*VL-1, LEN	// Shorter than 'cmp 4*VL' when VL=32
	jbe		.Lreduce_4vecs_to_2\@
	_load_vec_folding_consts	2
.Lfold_4vecs_loop\@:
	_fold_vec_mem	VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	sub		$-4*VL, BUF
	add		$-4*VL, LEN
	cmp		$4*VL-1, LEN
	ja		.Lfold_4vecs_loop\@

	// Fold V0,V1 into V2,V3 and write the result back to V0,V1.  Then fold
	// two more vectors of data from BUF, if at least that much remains.
.Lreduce_4vecs_to_2\@:
	_load_vec_folding_consts	1
	_fold_vec	V0, V2, CONSTS, V4
	_fold_vec	V1, V3, CONSTS, V4
	test		$2*VL, LEN8
	jz		.Lreduce_2vecs_to_1\@
	_fold_vec_mem	VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem	VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	sub		$-2*VL, BUF

	// Fold V0 into V1 and write the result back to V0.  Then fold one more
	// vector of data from BUF, if at least that much remains.
.Lreduce_2vecs_to_1\@:
	_load_vec_folding_consts	0
	_fold_vec_final	VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5

.Lreduce_1vec_to_128bits\@:
.if VL == 64
	// Reduce 512-bit %zmm0 to 256-bit %ymm0.  Then fold 256 more bits of
	// data from BUF, if at least that much remains.
	vbroadcasti128	OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
	vextracti64x4	$1, %zmm0, %ymm1
	_fold_vec_final	32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
.Lreduce_256bits_to_128bits\@:
.endif
.if VL >= 32
	// Reduce 256-bit %ymm0 to 128-bit %xmm0.  Then fold 128 more bits of
	// data from BUF, if at least that much remains.
	vmovdqa		OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
	vextracti128	$1, %ymm0, %xmm1
	_fold_vec_final	16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
.Lcheck_for_partial_block\@:
.endif
	and		$15, LEN32
	jz		.Lreduce_128bits_to_crc\@

	// 1 <= LEN <= 15 data bytes remain in BUF.  The polynomial is now
	// A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
	// and B is the polynomial of the remaining LEN data bytes.  To reduce
	// this to 128 bits without needing fold constants for each possible
	// LEN, rearrange this expression into C1*(x^128) + C2, where
	// C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
	// Then fold C1 into C2, which is just another fold across 128 bits.
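	// For example, if LEN == 3, then C1 is the highest 24 coefficients
	// (3 bytes) of A and C2 = (A*x^24 mod x^128) + B, so that
	// C1*(x^128) + C2 = A*x^24 + B as required.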

.if !LSB_CRC || AVX_LEVEL == 0
	// Load the last 16 data bytes.  Note that originally LEN was >= 16.
	_load_data	16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
.endif // Else will use vpblendvb mem operand later.
.if !LSB_CRC
	neg		LEN	// Needed for indexing shuf_table
.endif

	// tmp = A*x^(8*LEN) mod x^128
	// lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
	//	i.e. right-shift by LEN bytes.
	// msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
	//	i.e. left-shift by LEN bytes.
	_cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
	_cond_vex pshufb,	%xmm3, %xmm0, %xmm1

	// C1 = floor(A / x^(128 - 8*LEN))
	// lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
	//	i.e. left-shift by 16-LEN bytes.
	// msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
	//	i.e. right-shift by 16-LEN bytes.
	_cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
			  %xmm0, %xmm0, unaligned_mem_tmp=%xmm4

	// C2 = tmp + B.  This is just a blend of tmp with the last 16 data
	// bytes (reflected if msb-first).  The blend mask is the shuffle table
	// that was used to create tmp: 0 selects tmp, and 1 selects
	// last16databytes.
.if AVX_LEVEL == 0
	movdqa		%xmm0, %xmm4
	movdqa		%xmm3, %xmm0
	pblendvb	%xmm2, %xmm1	// uses %xmm0 as implicit operand
	movdqa		%xmm4, %xmm0
.elseif LSB_CRC
	vpblendvb	%xmm3, -16(BUF,LEN), %xmm1, %xmm1
.else
	vpblendvb	%xmm3, %xmm2, %xmm1, %xmm1
.endif

	// Fold C1 into C2 and store the 128-bit result in %xmm0.
	_fold_vec	%xmm0, %xmm1, CONSTS_XMM, %xmm4

.Lreduce_128bits_to_crc\@:
	// Compute the CRC as %xmm0 * x^n mod G.  Here %xmm0 means the 128-bit
	// polynomial stored in %xmm0 (using either lsb-first or msb-first bit
	// order according to LSB_CRC), and G is the CRC's generator polynomial.

	// First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
	//
	//	t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
	//	      x^n * (%xmm0 mod x^64)
	//
	// Store t0 * x^(64-n) in %xmm0.  I.e., actually do:
	//
	//	%xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
	//		 x^64 * (%xmm0 mod x^64)
	//
	// The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
	// to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
	// select it.  The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
	// msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
	// (considering the extra factor of x that gets implicitly introduced by
	// each pclmulqdq when using lsb-first order), is identical to the
	// constant that was used earlier for folding the LO64_TERMS across 128
	// bits.  Thus it's already available in LO64_TERMS of CONSTS_XMM.
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if LSB_CRC
	_cond_vex psrldq,	$8, %xmm0, %xmm0  // x^64 * (%xmm0 mod x^64)
.else
	_cond_vex pslldq,	$8, %xmm0, %xmm0  // x^64 * (%xmm0 mod x^64)
.endif
	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
	// The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
	// The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).

	// First step of Barrett reduction: Compute floor(t0 / G).  This is the
	// polynomial by which G needs to be multiplied to cancel out the x^n
	// and higher terms of t0, i.e. to reduce t0 mod G.  First do:
	//
	//	t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
	//
	// Then the desired value floor(t0 / G) is floor(t1 / x^64).  The 63 in
	// x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
	// value that makes enough precision be carried through the calculation.
	//
	// The '* x' makes it so the result is floor(t1 / x^64) rather than
	// floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
	// can be extracted much more easily in the next step.  In the lsb-first
	// case the '* x' happens implicitly.  In the msb-first case it must be
	// done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
	// constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
	// the multiplication by the x^64 term is handled using a pxor.  The
	// pxor causes the low 64 terms of t1 to be wrong, but they are unused.
	_cond_vex movdqa,	OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
	_pclmulqdq		CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if !LSB_CRC
	_cond_vex pxor,		%xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
.endif
	// The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).

	// Second step of Barrett reduction: Cancel out the x^n and higher terms
	// of t0 by subtracting the needed multiple of G.  This gives the CRC:
	//
	//	crc := t0 - (G * floor(t0 / G))
	//
	// But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
	//
	//	crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
	//
	// Furthermore, since the resulting CRC is n-bit, if mod x^n is
	// explicitly applied to it then the x^n term of G makes no difference
	// in the result and can be omitted.  This helps keep the constant
	// multiplier in 64 bits in most cases.  This gives the following:
	//
	//	%xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
	//	crc := (%xmm0 / x^(64-n)) mod x^n
	//
	// In the lsb-first case, each pclmulqdq implicitly introduces
	// an extra factor of x, so in that case the constant that needs to be
	// passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
	// For lsb-first CRCs where n=64, the extra factor of x cannot be as
	// easily avoided.  In that case, instead pass '(G - x^n - x^0) / x' to
	// pclmulqdq and handle the x^0 term (i.e. 1) separately.  (All CRC
	// polynomials have nonzero x^n and x^0 terms.)  It works out as: the
	// CRC has to be XORed with the physically low qword of %xmm1,
	// representing floor(t0 / G).  The most efficient way to do that is to
	// move it to the physically high qword and use a ternlog to combine the
	// two XORs.
.if LSB_CRC && \n == 64
	_cond_vex punpcklqdq,	%xmm1, %xmm2, %xmm2
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
	.if AVX_LEVEL <= 2
		_cond_vex pxor,		%xmm2, %xmm0, %xmm0
		_cond_vex pxor,		%xmm1, %xmm0, %xmm0
	.else
		vpternlogq		$0x96, %xmm2, %xmm1, %xmm0
	.endif
	_cond_vex "pextrq $1,",	%xmm0, %rax  // (%xmm0 / x^0) mod x^64
.else
	_pclmulqdq		CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
	_cond_vex pxor,		%xmm1, %xmm0, %xmm0
	.if \n == 8
		_cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8
	.elseif \n == 16
		_cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16
	.elseif \n == 32
		_cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32
	.else // \n == 64 && !LSB_CRC
		_cond_vex movq,		%xmm0, %rax  // (%xmm0 / x^0) mod x^64
	.endif
.endif

.if VL > 16
	vzeroupper	// Needed when ymm or zmm registers may have been used.
.endif
#ifdef __i386__
	pop		CONSTS_PTR
#endif
	RET
.endm

#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
SYM_FUNC_START(prefix##_pclmul_sse);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
SYM_FUNC_END(prefix##_pclmul_sse);					\
									\
SYM_FUNC_START(prefix##_vpclmul_avx2);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=32, avx_level=2;	\
SYM_FUNC_END(prefix##_vpclmul_avx2);					\
									\
SYM_FUNC_START(prefix##_vpclmul_avx512);				\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=64, avx_level=512;	\
SYM_FUNC_END(prefix##_vpclmul_avx512);
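
// For example, an lsb-first CRC-32 user of this template could instantiate it
// with "DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)",
// which would define crc32_lsb_pclmul_sse(), crc32_lsb_vpclmul_avx2(), and
// crc32_lsb_vpclmul_avx512().  (The "crc32_lsb" prefix here is only an
// example; the including file chooses its own prefix.)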