// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86-optimized CRC32 functions
 *
 * Copyright (C) 2008 Intel Corporation
 * Copyright 2012 Xyratex Technology Limited
 * Copyright 2024 Google LLC
 */

| 10 | #include "crc-pclmul-template.h" | 
|---|
| 11 |  | 
|---|
| 12 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32); | 
|---|
| 13 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); | 
|---|
| 14 | static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512); | 
|---|
| 15 |  | 
|---|
| 16 | DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); | 
|---|
| 17 |  | 
|---|
| 18 | static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) | 
|---|
| 19 | { | 
|---|
| 20 | CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, | 
|---|
| 21 | have_pclmulqdq); | 
|---|
| 22 | return crc32_le_base(crc, p, len); | 
|---|
| 23 | } | 
|---|
| 24 |  | 
|---|
| 25 | #ifdef CONFIG_X86_64 | 
|---|
| 26 | #define CRC32_INST "crc32q %1, %q0" | 
|---|
| 27 | #else | 
|---|
| 28 | #define CRC32_INST "crc32l %1, %0" | 
|---|
| 29 | #endif | 
|---|
| 30 |  | 
|---|
| 31 | /* | 
|---|
| 32 | * Use carryless multiply version of crc32c when buffer size is >= 512 to | 
|---|
| 33 | * account for FPU state save/restore overhead. | 
|---|
| 34 | */ | 
|---|
| 35 | #define CRC32C_PCLMUL_BREAKEVEN	512 | 
|---|
| 36 |  | 
|---|
| 37 | asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); | 
|---|
| 38 |  | 
|---|
| 39 | static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len) | 
|---|
| 40 | { | 
|---|
| 41 | size_t num_longs; | 
|---|
| 42 |  | 
|---|
| 43 | if (!static_branch_likely(&have_crc32)) | 
|---|
| 44 | return crc32c_base(crc, p, len); | 
|---|
| 45 |  | 
|---|
| 46 | if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN && | 
|---|
| 47 | static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) { | 
|---|
| 48 | /* | 
|---|
| 49 | * Long length, the vector registers are usable, and the CPU is | 
|---|
| 50 | * 64-bit and supports both CRC32 and PCLMULQDQ instructions. | 
|---|
| 51 | * It is worthwhile to divide the data into multiple streams, | 
|---|
| 52 | * CRC them independently, and combine them using PCLMULQDQ. | 
|---|
| 53 | * crc32c_x86_3way() does this using 3 streams, which is the | 
|---|
| 54 | * most that x86_64 CPUs have traditionally been capable of. | 
|---|
| 55 | * | 
|---|
| 56 | * However, due to improved VPCLMULQDQ performance on newer | 
|---|
| 57 | * CPUs, use crc32_lsb_vpclmul_avx512() instead of | 
|---|
| 58 | * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a | 
|---|
| 59 | * "good" implementation of AVX-512. | 
|---|
| 60 | * | 
|---|
| 61 | * Future work: the optimal strategy on Zen 3--5 is actually to | 
|---|
| 62 | * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately, | 
|---|
| 63 | * different numbers of streams and vector lengths are optimal | 
|---|
| 64 | * on each CPU microarchitecture, making it challenging to take | 
|---|
| 65 | * advantage of this.  (Zen 5 even supports 7 parallel crc32q, a | 
|---|
| 66 | * major upgrade.)  For now, just choose between | 
|---|
| 67 | * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The latter | 
|---|
| 68 | * is needed anyway for crc32_le(), so we just reuse it here. | 
|---|
| 69 | */ | 
|---|
| 70 | kernel_fpu_begin(); | 
|---|
| 71 | if (static_branch_likely(&have_vpclmul_avx512)) | 
|---|
| 72 | crc = crc32_lsb_vpclmul_avx512(crc, p, len, | 
|---|
| 73 | consts_ptr: crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts); | 
|---|
| 74 | else | 
|---|
| 75 | crc = crc32c_x86_3way(crc, buffer: p, len); | 
|---|
| 76 | kernel_fpu_end(); | 
|---|
| 77 | return crc; | 
|---|
| 78 | } | 
|---|
| 79 |  | 
|---|
| 80 | /* | 
|---|
| 81 | * Short length, XMM registers unusable, or the CPU is 32-bit; but the | 
|---|
| 82 | * CPU supports CRC32 instructions.  Just issue a single stream of CRC32 | 
|---|
| 83 | * instructions inline.  While this doesn't use the CPU's CRC32 | 
|---|
| 84 | * throughput very well, it avoids the need to combine streams.  Stream | 
|---|
| 85 | * combination would be inefficient here. | 
|---|
| 86 | */ | 
|---|
| 87 |  | 
|---|
| 88 | for (num_longs = len / sizeof(unsigned long); | 
|---|
| 89 | num_longs != 0; num_longs--, p += sizeof(unsigned long)) | 
|---|
| 90 | asm(CRC32_INST : "+r"(crc) : ASM_INPUT_RM (*(unsigned long *)p)); | 
|---|
| 91 |  | 
|---|
| 92 | if (sizeof(unsigned long) > 4 && (len & 4)) { | 
|---|
| 93 | asm( "crc32l %1, %0": "+r"(crc) : ASM_INPUT_RM (*(u32 *)p)); | 
|---|
| 94 | p += 4; | 
|---|
| 95 | } | 
|---|
| 96 | if (len & 2) { | 
|---|
| 97 | asm( "crc32w %1, %0": "+r"(crc) : ASM_INPUT_RM (*(u16 *)p)); | 
|---|
| 98 | p += 2; | 
|---|
| 99 | } | 
|---|
| 100 | if (len & 1) | 
|---|
| 101 | asm( "crc32b %1, %0": "+r"(crc) : ASM_INPUT_RM (*p)); | 
|---|
| 102 |  | 
|---|
| 103 | return crc; | 
|---|
| 104 | } | 
|---|
| 105 |  | 
|---|
/* The arch hook for the big-endian CRC32 is simply an alias of crc32_be_base. */
#define crc32_be_arch crc32_be_base /* not implemented on this arch */

| 108 | #define crc32_mod_init_arch crc32_mod_init_arch | 
|---|
| 109 | static void crc32_mod_init_arch(void) | 
|---|
| 110 | { | 
|---|
| 111 | if (boot_cpu_has(X86_FEATURE_XMM4_2)) | 
|---|
| 112 | static_branch_enable(&have_crc32); | 
|---|
| 113 | if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { | 
|---|
| 114 | static_branch_enable(&have_pclmulqdq); | 
|---|
| 115 | if (have_vpclmul()) { | 
|---|
| 116 | if (have_avx512()) { | 
|---|
| 117 | static_call_update(crc32_lsb_pclmul, | 
|---|
| 118 | crc32_lsb_vpclmul_avx512); | 
|---|
| 119 | static_branch_enable(&have_vpclmul_avx512); | 
|---|
| 120 | } else { | 
|---|
| 121 | static_call_update(crc32_lsb_pclmul, | 
|---|
| 122 | crc32_lsb_vpclmul_avx2); | 
|---|
| 123 | } | 
|---|
| 124 | } | 
|---|
| 125 | } | 
|---|
| 126 | } | 
|---|
| 127 |  | 
|---|
| 128 | static inline u32 crc32_optimizations_arch(void) | 
|---|
| 129 | { | 
|---|
| 130 | u32 optimizations = 0; | 
|---|
| 131 |  | 
|---|
| 132 | if (static_key_enabled(&have_crc32)) | 
|---|
| 133 | optimizations |= CRC32C_OPTIMIZATION; | 
|---|
| 134 | if (static_key_enabled(&have_pclmulqdq)) | 
|---|
| 135 | optimizations |= CRC32_LE_OPTIMIZATION; | 
|---|
| 136 | return optimizations; | 
|---|
| 137 | } | 
|---|
| 138 |  | 
|---|