| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 
|---|
| 2 | #ifndef _LINUX_RECIPROCAL_DIV_H | 
|---|
| 3 | #define _LINUX_RECIPROCAL_DIV_H | 
|---|
| 4 |  | 
|---|
| 5 | #include <linux/types.h> | 
|---|
| 6 |  | 
|---|
| 7 | /* | 
|---|
| 8 | * This algorithm is based on the paper "Division by Invariant | 
|---|
| 9 | * Integers Using Multiplication" by Torbjörn Granlund and Peter | 
|---|
| 10 | * L. Montgomery. | 
|---|
| 11 | * | 
|---|
| 12 | * The assembler implementation from Agner Fog, which this code is | 
|---|
| 13 | * based on, can be found here: | 
|---|
| 14 | * http://www.agner.org/optimize/asmlib.zip | 
|---|
| 15 | * | 
|---|
| 16 | * This optimization for A/B is helpful if the divisor B is mostly | 
|---|
| 17 | * runtime invariant. The reciprocal of B is calculated in the | 
|---|
| 18 | * slow-path with reciprocal_value(). The fast-path can then just use | 
|---|
| 19 | * a much faster multiplication operation with a variable dividend A | 
|---|
| 20 | * to calculate the division A/B. | 
|---|
| 21 | */ | 
|---|
| 22 |  | 
|---|
| 23 | struct reciprocal_value { | 
|---|
| 24 | u32 m; | 
|---|
| 25 | u8 sh1, sh2; | 
|---|
| 26 | }; | 
|---|
| 27 |  | 
|---|
| 28 | /* "reciprocal_value" and "reciprocal_divide" together implement the basic | 
|---|
| 29 | * version of the algorithm described in Figure 4.1 of the paper. | 
|---|
| 30 | */ | 
|---|
| 31 | struct reciprocal_value reciprocal_value(u32 d); | 
|---|
| 32 |  | 
|---|
| 33 | static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) | 
|---|
| 34 | { | 
|---|
| 35 | u32 t = (u32)(((u64)a * R.m) >> 32); | 
|---|
| 36 | return (t + ((a - t) >> R.sh1)) >> R.sh2; | 
|---|
| 37 | } | 
|---|
| 38 |  | 
|---|
| 39 | struct reciprocal_value_adv { | 
|---|
| 40 | u32 m; | 
|---|
| 41 | u8 sh, exp; | 
|---|
| 42 | bool is_wide_m; | 
|---|
| 43 | }; | 
|---|
| 44 |  | 
|---|
| 45 | /* "reciprocal_value_adv" implements the advanced version of the algorithm | 
|---|
| 46 | * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose | 
|---|
| 47 | * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The | 
|---|
| 48 | * exception case could be easily handled before calling "reciprocal_value_adv". | 
|---|
| 49 | * | 
|---|
| 50 | * The advanced version requires more complex calculation to get the reciprocal | 
|---|
| 51 | * multiplier and other control variables, but then could reduce the required | 
|---|
| 52 | * emulation operations. | 
|---|
| 53 | * | 
|---|
| 54 | * It makes no sense to use this advanced version for host divide emulation, | 
|---|
| 55 | * those extra complexities for calculating multiplier etc could completely | 
|---|
| 56 | * waive our saving on emulation operations. | 
|---|
| 57 | * | 
|---|
| 58 | * However, it makes sense to use it for JIT divide code generation for which | 
|---|
| 59 | * we are willing to trade performance of JITed code with that of host. As shown | 
|---|
| 60 | * by the following pseudo code, the required emulation operations could go down | 
|---|
| 61 | * from 6 (the basic version) to 3 or 4. | 
|---|
| 62 | * | 
|---|
| 63 | * To use the result of "reciprocal_value_adv", suppose we want to calculate | 
|---|
| 64 | * n/d, the pseudo C code will be: | 
|---|
| 65 | * | 
|---|
| 66 | *   struct reciprocal_value_adv rvalue; | 
|---|
| 67 | *   u8 pre_shift, exp; | 
|---|
| 68 | * | 
|---|
| 69 | *   // handle exception case. | 
|---|
| 70 | *   if (d >= (1U << 31)) { | 
|---|
| 71 | *     result = n >= d; | 
|---|
| 72 | *     return; | 
|---|
| 73 | *   } | 
|---|
| 74 | * | 
|---|
| 75 | *   rvalue = reciprocal_value_adv(d, 32) | 
|---|
| 76 | *   exp = rvalue.exp; | 
|---|
| 77 | *   if (rvalue.is_wide_m && !(d & 1)) { | 
|---|
| 78 | *     // floor(log2(d & (2^32 -d))) | 
|---|
| 79 | *     pre_shift = fls(d & -d) - 1; | 
|---|
| 80 | *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); | 
|---|
| 81 | *   } else { | 
|---|
| 82 | *     pre_shift = 0; | 
|---|
| 83 | *   } | 
|---|
| 84 | * | 
|---|
| 85 | *   // code generation starts. | 
|---|
| 86 | *   if (imm == 1U << exp) { | 
|---|
| 87 | *     result = n >> exp; | 
|---|
| 88 | *   } else if (rvalue.is_wide_m) { | 
|---|
| 89 | *     // pre_shift must be zero when reached here. | 
|---|
| 90 | *     t = (n * rvalue.m) >> 32; | 
|---|
| 91 | *     result = n - t; | 
|---|
| 92 | *     result >>= 1; | 
|---|
| 93 | *     result += t; | 
|---|
| 94 | *     result >>= rvalue.sh - 1; | 
|---|
| 95 | *   } else { | 
|---|
| 96 | *     if (pre_shift) | 
|---|
| 97 | *       result = n >> pre_shift; | 
|---|
| 98 | *     result = ((u64)result * rvalue.m) >> 32; | 
|---|
| 99 | *     result >>= rvalue.sh; | 
|---|
| 100 | *   } | 
|---|
| 101 | */ | 
|---|
| 102 | struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); | 
|---|
| 103 |  | 
|---|
| 104 | #endif /* _LINUX_RECIPROCAL_DIV_H */ | 
|---|
| 105 |  | 
|---|