| 1 | /* gf128mul.h - GF(2^128) multiplication functions | 
|---|
| 2 | * | 
|---|
| 3 | * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. | 
|---|
| 4 | * Copyright (c) 2006 Rik Snel <rsnel@cube.dyndns.org> | 
|---|
| 5 | * | 
|---|
| 6 | * Based on Dr Brian Gladman's (GPL'd) work published at | 
|---|
| 7 | * http://fp.gladman.plus.com/cryptography_technology/index.htm | 
|---|
| 8 | * See the original copyright notice below. | 
|---|
| 9 | * | 
|---|
| 10 | * This program is free software; you can redistribute it and/or modify it | 
|---|
| 11 | * under the terms of the GNU General Public License as published by the Free | 
|---|
| 12 | * Software Foundation; either version 2 of the License, or (at your option) | 
|---|
| 13 | * any later version. | 
|---|
| 14 | */ | 
|---|
| 15 | /* | 
|---|
| 16 | --------------------------------------------------------------------------- | 
|---|
| 17 | Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved. | 
|---|
| 18 |  | 
|---|
| 19 | LICENSE TERMS | 
|---|
| 20 |  | 
|---|
| 21 | The free distribution and use of this software in both source and binary | 
|---|
| 22 | form is allowed (with or without changes) provided that: | 
|---|
| 23 |  | 
|---|
| 24 | 1. distributions of this source code include the above copyright | 
|---|
| 25 | notice, this list of conditions and the following disclaimer; | 
|---|
| 26 |  | 
|---|
| 27 | 2. distributions in binary form include the above copyright | 
|---|
| 28 | notice, this list of conditions and the following disclaimer | 
|---|
| 29 | in the documentation and/or other associated materials; | 
|---|
| 30 |  | 
|---|
| 31 | 3. the copyright holder's name is not used to endorse products | 
|---|
| 32 | built using this software without specific written permission. | 
|---|
| 33 |  | 
|---|
| 34 | ALTERNATIVELY, provided that this notice is retained in full, this product | 
|---|
| 35 | may be distributed under the terms of the GNU General Public License (GPL), | 
|---|
| 36 | in which case the provisions of the GPL apply INSTEAD OF those given above. | 
|---|
| 37 |  | 
|---|
| 38 | DISCLAIMER | 
|---|
| 39 |  | 
|---|
| 40 | This software is provided 'as is' with no explicit or implied warranties | 
|---|
| 41 | in respect of its properties, including, but not limited to, correctness | 
|---|
| 42 | and/or fitness for purpose. | 
|---|
| 43 | --------------------------------------------------------------------------- | 
|---|
| 44 | Issue Date: 31/01/2006 | 
|---|
| 45 |  | 
|---|
| 46 | An implementation of field multiplication in Galois Field GF(2^128) | 
|---|
| 47 | */ | 
|---|
| 48 |  | 
|---|
| 49 | #ifndef _CRYPTO_GF128MUL_H | 
|---|
| 50 | #define _CRYPTO_GF128MUL_H | 
|---|
| 51 |  | 
|---|
| 52 | #include <asm/byteorder.h> | 
|---|
| 53 | #include <crypto/b128ops.h> | 
|---|
| 54 | #include <linux/slab.h> | 
|---|
| 55 |  | 
|---|
| 56 | /* Comment by Rik: | 
|---|
| 57 | * | 
|---|
| 58 | * For some background on GF(2^128) see for example: | 
|---|
| 59 | * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf | 
|---|
| 60 | * | 
|---|
| 61 | * The elements of GF(2^128) := GF(2)[X]/(X^128-X^7-X^2-X^1-1) can | 
|---|
| 62 | * be mapped to computer memory in a variety of ways. Let's examine | 
|---|
| 63 | * three common cases. | 
|---|
| 64 | * | 
|---|
| 65 | * Take a look at the 16 binary octets below in memory order. The msb's | 
|---|
| 66 | * are left and the lsb's are right. char b[16] is an array and b[0] is | 
|---|
| 67 | * the first octet. | 
|---|
| 68 | * | 
|---|
| 69 | * 10000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 | 
|---|
| 70 | *   b[0]     b[1]     b[2]     b[3]          b[13]    b[14]    b[15] | 
|---|
| 71 | * | 
|---|
| 72 | * Every bit is a coefficient of some power of X. We can store the bits | 
|---|
| 73 | * in every byte in little-endian order and the bytes themselves also in | 
|---|
| 74 | * little endian order. I will call this lle (little-little-endian). | 
|---|
| 75 | * The above buffer represents the polynomial 1, and X^7+X^2+X^1+1 looks | 
|---|
| 76 | * like 11100001 00000000 .... 00000000 = { 0xE1, 0x00, }. | 
|---|
| 77 | * This format was originally implemented in gf128mul and is used | 
|---|
| 78 | * in GCM (Galois/Counter mode) and in ABL (Arbitrary Block Length). | 
|---|
| 79 | * | 
|---|
| 80 | * Another convention says: store the bits in bigendian order and the | 
|---|
| 81 | * bytes also. This is bbe (big-big-endian). Now the buffer above | 
|---|
| 82 | * represents X^127. X^7+X^2+X^1+1 looks like 00000000 .... 10000111, | 
|---|
| 83 | * b[15] = 0x87 and the rest is 0. LRW uses this convention and bbe | 
|---|
| 84 | * is partly implemented. | 
|---|
| 85 | * | 
|---|
| 86 | * Both of the above formats are easy to implement on big-endian | 
|---|
| 87 | * machines. | 
|---|
| 88 | * | 
|---|
| 89 | * XTS and EME (the latter of which is patent encumbered) use the ble | 
|---|
| 90 | * format (bits are stored in big endian order and the bytes in little | 
|---|
| 91 | * endian). The above buffer represents X^7 in this case and the | 
|---|
| 92 | * primitive polynomial is b[0] = 0x87. | 
|---|
| 93 | * | 
|---|
| 94 | * The common machine word-size is smaller than 128 bits, so to make | 
|---|
| 95 | * an efficient implementation we must split into machine word sizes. | 
|---|
| 96 | * This implementation uses 64-bit words for the moment. Machine | 
|---|
| 97 | * endianness comes into play. The lle format in relation to machine | 
|---|
| 98 | * endianness is discussed below by the original author of gf128mul Dr | 
|---|
| 99 | * Brian Gladman. | 
|---|
| 100 | * | 
|---|
| 101 | * Let's look at the bbe and ble format on a little endian machine. | 
|---|
| 102 | * | 
|---|
| 103 | * bbe on a little endian machine u32 x[4]: | 
|---|
| 104 | * | 
|---|
| 105 | *  MS            x[0]           LS  MS            x[1]		  LS | 
|---|
| 106 | *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls | 
|---|
| 107 | *  103..96 111.104 119.112 127.120  71...64 79...72 87...80 95...88 | 
|---|
| 108 | * | 
|---|
| 109 | *  MS            x[2]           LS  MS            x[3]		  LS | 
|---|
| 110 | *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls | 
|---|
| 111 | *  39...32 47...40 55...48 63...56  07...00 15...08 23...16 31...24 | 
|---|
| 112 | * | 
|---|
| 113 | * ble on a little endian machine | 
|---|
| 114 | * | 
|---|
| 115 | *  MS            x[0]           LS  MS            x[1]		  LS | 
|---|
| 116 | *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls | 
|---|
| 117 | *  31...24 23...16 15...08 07...00  63...56 55...48 47...40 39...32 | 
|---|
| 118 | * | 
|---|
| 119 | *  MS            x[2]           LS  MS            x[3]		  LS | 
|---|
| 120 | *  ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls | 
|---|
| 121 | *  95...88 87...80 79...72 71...64  127.120 199.112 111.104 103..96 | 
|---|
| 122 | * | 
|---|
| 123 | * Multiplications in GF(2^128) are mostly bit-shifts, so you see why | 
|---|
| 124 | * ble (and lbe also) are easier to implement on a little-endian | 
|---|
| 125 | * machine than on a big-endian machine. The converse holds for bbe | 
|---|
| 126 | * and lle. | 
|---|
| 127 | * | 
|---|
| 128 | * Note: to have good alignment, it seems to me that it is sufficient | 
|---|
| 129 | * to keep elements of GF(2^128) in type u64[2]. On 32-bit wordsize | 
|---|
| 130 | * machines this will automatically aligned to wordsize and on a 64-bit | 
|---|
| 131 | * machine also. | 
|---|
| 132 | */ | 
|---|
| 133 | /*	Multiply a GF(2^128) field element by x. Field elements are | 
|---|
| 134 | held in arrays of bytes in which field bits 8n..8n + 7 are held in | 
|---|
| 135 | byte[n], with lower indexed bits placed in the more numerically | 
|---|
| 136 | significant bit positions within bytes. | 
|---|
| 137 |  | 
|---|
| 138 | On little endian machines the bit indexes translate into the bit | 
|---|
| 139 | positions within four 32-bit words in the following way | 
|---|
| 140 |  | 
|---|
| 141 | MS            x[0]           LS  MS            x[1]		  LS | 
|---|
| 142 | ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls | 
|---|
| 143 | 24...31 16...23 08...15 00...07  56...63 48...55 40...47 32...39 | 
|---|
| 144 |  | 
|---|
| 145 | MS            x[2]           LS  MS            x[3]		  LS | 
|---|
| 146 | ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls | 
|---|
| 147 | 88...95 80...87 72...79 64...71  120.127 112.119 104.111 96..103 | 
|---|
| 148 |  | 
|---|
| 149 | On big endian machines the bit indexes translate into the bit | 
|---|
| 150 | positions within four 32-bit words in the following way | 
|---|
| 151 |  | 
|---|
| 152 | MS            x[0]           LS  MS            x[1]		  LS | 
|---|
| 153 | ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls | 
|---|
| 154 | 00...07 08...15 16...23 24...31  32...39 40...47 48...55 56...63 | 
|---|
| 155 |  | 
|---|
| 156 | MS            x[2]           LS  MS            x[3]		  LS | 
|---|
| 157 | ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls | 
|---|
| 158 | 64...71 72...79 80...87 88...95  96..103 104.111 112.119 120.127 | 
|---|
| 159 | */ | 
|---|
| 160 |  | 
|---|
| 161 | /*	A slow generic version of gf_mul, implemented for lle | 
|---|
| 162 | * 	It multiplies a and b and puts the result in a */ | 
|---|
| 163 | void gf128mul_lle(be128 *a, const be128 *b); | 
|---|
| 164 |  | 
|---|
| 165 | /* | 
|---|
| 166 | * The following functions multiply a field element by x in | 
|---|
| 167 | * the polynomial field representation.  They use 64-bit word operations | 
|---|
| 168 | * to gain speed but compensate for machine endianness and hence work | 
|---|
| 169 | * correctly on both styles of machine. | 
|---|
| 170 | * | 
|---|
| 171 | * They are defined here for performance. | 
|---|
| 172 | */ | 
|---|
| 173 |  | 
|---|
| 174 | static inline u64 gf128mul_mask_from_bit(u64 x, int which) | 
|---|
| 175 | { | 
|---|
| 176 | /* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */ | 
|---|
| 177 | return ((s64)(x << (63 - which)) >> 63); | 
|---|
| 178 | } | 
|---|
| 179 |  | 
|---|
| 180 | static inline void gf128mul_x_lle(be128 *r, const be128 *x) | 
|---|
| 181 | { | 
|---|
| 182 | u64 a = be64_to_cpu(x->a); | 
|---|
| 183 | u64 b = be64_to_cpu(x->b); | 
|---|
| 184 |  | 
|---|
| 185 | /* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48 | 
|---|
| 186 | * (see crypto/gf128mul.c): */ | 
|---|
| 187 | u64 _tt = gf128mul_mask_from_bit(x: b, which: 0) & ((u64)0xe1 << 56); | 
|---|
| 188 |  | 
|---|
| 189 | r->b = cpu_to_be64((b >> 1) | (a << 63)); | 
|---|
| 190 | r->a = cpu_to_be64((a >> 1) ^ _tt); | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | static inline void gf128mul_x_bbe(be128 *r, const be128 *x) | 
|---|
| 194 | { | 
|---|
| 195 | u64 a = be64_to_cpu(x->a); | 
|---|
| 196 | u64 b = be64_to_cpu(x->b); | 
|---|
| 197 |  | 
|---|
| 198 | /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */ | 
|---|
| 199 | u64 _tt = gf128mul_mask_from_bit(x: a, which: 63) & 0x87; | 
|---|
| 200 |  | 
|---|
| 201 | r->a = cpu_to_be64((a << 1) | (b >> 63)); | 
|---|
| 202 | r->b = cpu_to_be64((b << 1) ^ _tt); | 
|---|
| 203 | } | 
|---|
| 204 |  | 
|---|
| 205 | /* needed by XTS */ | 
|---|
| 206 | static inline void gf128mul_x_ble(le128 *r, const le128 *x) | 
|---|
| 207 | { | 
|---|
| 208 | u64 a = le64_to_cpu(x->a); | 
|---|
| 209 | u64 b = le64_to_cpu(x->b); | 
|---|
| 210 |  | 
|---|
| 211 | /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */ | 
|---|
| 212 | u64 _tt = gf128mul_mask_from_bit(x: a, which: 63) & 0x87; | 
|---|
| 213 |  | 
|---|
| 214 | r->a = cpu_to_le64((a << 1) | (b >> 63)); | 
|---|
| 215 | r->b = cpu_to_le64((b << 1) ^ _tt); | 
|---|
| 216 | } | 
|---|
| 217 |  | 
|---|
| 218 | /* 4k table optimization */ | 
|---|
| 219 |  | 
|---|
| 220 | struct gf128mul_4k { | 
|---|
| 221 | be128 t[256]; | 
|---|
| 222 | }; | 
|---|
| 223 |  | 
|---|
| 224 | struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g); | 
|---|
| 225 | void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t); | 
|---|
| 226 | void gf128mul_x8_ble(le128 *r, const le128 *x); | 
|---|
| 227 | static inline void gf128mul_free_4k(struct gf128mul_4k *t) | 
|---|
| 228 | { | 
|---|
| 229 | kfree_sensitive(objp: t); | 
|---|
| 230 | } | 
|---|
| 231 |  | 
|---|
| 232 |  | 
|---|
| 233 | /* 64k table optimization, implemented for bbe */ | 
|---|
| 234 |  | 
|---|
| 235 | struct gf128mul_64k { | 
|---|
| 236 | struct gf128mul_4k *t[16]; | 
|---|
| 237 | }; | 
|---|
| 238 |  | 
|---|
| 239 | /* First initialize with the constant factor with which you | 
|---|
| 240 | * want to multiply and then call gf128mul_64k_bbe with the other | 
|---|
| 241 | * factor in the first argument, and the table in the second. | 
|---|
| 242 | * Afterwards, the result is stored in *a. | 
|---|
| 243 | */ | 
|---|
| 244 | struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g); | 
|---|
| 245 | void gf128mul_free_64k(struct gf128mul_64k *t); | 
|---|
| 246 | void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t); | 
|---|
| 247 |  | 
|---|
| 248 | #endif /* _CRYPTO_GF128MUL_H */ | 
|---|
| 249 |  | 
|---|