f489225d011a4a08876a73d2fa0367e3c2f37392
[libreriscv.git] / openpower / sv / bitmanip.mdwn
1 [[!tag standards]]
2
3 # bitmanipulation
4
5 **DRAFT STATUS**
6
7 this extension amalgamates bitnanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX. Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
8 Vectorisation Context is provided by [[openpower/sv]].
9
10
11 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD". It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs. Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
12
13 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science. however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added. The expectation is that these operations would all be covered by the same pipeline.
14
15 note that there are brownfield spaces below that could incorporate some of the set-before-first and other operations.
16
17 # summary
18
19 minor opcode allocation
20
21 | 28.30 |31| name |
22 | ------ |--| --------- |
23 | 00 |Rc| ternaryi |
24 | 001 |Rc| ternary |
25 | 010 |Rc| bitmask |
26 | 011 |Rc| gf* |
27 | 101 |1 | ternaryv |
28 | 101 |0 | ternarycr |
29 | 110 |Rc| 1/2-op |
30 | 111 |Rc| 3-op |
31
32 1-op and variants
33
34 | dest | src1 | subop | op |
35 | ---- | ---- | ----- | -------- |
36 | RT | RA | .. | bmatflip |
37
38 2-op and variants
39
40 | dest | src1 | src2 | subop | op |
41 | ---- | ---- | ---- | ----- | -------- |
42 | RT | RA | RB | or | bmatflip |
43 | RT | RA | RB | xor | bmatflip |
44 | RT | RA | RB | bdep | dep/ext |
45 | RT | RA | RB | bext | dep/ext |
46 | RT | RA | RB | | grev |
47 | RT | RA | RB | | clmul* |
48 | RT | RA | RB | | gorc |
49 | RT | RA | RB | shuf | shuffle |
50 | RT | RA | RB | unshuf| shuffle |
51 | RT | RA | RB | width | xperm |
52 | RT | RA | RB | type | minmax |
53 | RT | RA | RB | | |
54 | RT | RA | RB | | |
55 | RT | RA | RB | | |
56
57 3 ops
58
59 * bitmask set/extract
60 * ternary bitops
61 * GF
62
63 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
64 | -- | -- | --- | --- | ----- | -------- |--| ------ |
65 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
66 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc| ternaryi |
67 | NN | RS | RA | RB | RC | 00 011 |Rc| gfmul |
68 | NN | RS | RA | RB | RC | 01 011 |Rc| gfadd |
69 | NN | RT | RA | RB | deg | 10 011 |Rc| gfinv |
70 | NN | RS | RA | RB | deg | 11 011 |Rc| gfmuli |
71 | NN | RS | RA | RB | deg | 11 111 |Rc| gfaddi |
72
73 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
74 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
75 | NN | RT | RA | imm | mask | 101 |1 | ternaryv |
76
77 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
78 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
79 | NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternarycr |
80
81 ops
82
83 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
84 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
85 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
86 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
87 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
88 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
89 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
90 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
91 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
92 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
93 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
94 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
95 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
96 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
97 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
98 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
99 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
100 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
101 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
102 | NN | RA | RB | RC | 01 | 1 | 0010 110 |Rc| clmul |
103 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
104 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
105 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
106 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
107 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
108 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
109 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
110 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
111 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
112 | NN | RA | RB | RC | 11 | 0 | 1110 110 |Rc| clmulr |
113 | NN | RA | RB | RC | 11 | 1 | 1110 110 |Rc| clmulh |
114 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
115
116 # count leading/trailing zeros with mask
117
118 in v3.1 p105
119
120 ```
121 count = 0
122 do i = 0 to 63 if((RB)i=1) then do
123 if((RS)i=1) then break end end count ← count + 1
124 RA ← EXTZ64(count)
125 ```
126
127 # bit to byte permute
128
129 similar to matrix permute in RV bitmanip, which has XOR and OR variants
130
131 do j = 0 to 7
132 do k = 0 to 7
133 b = VSR[VRB+32].dword[i].byte[k].bit[j]
134 VSR[VRT+32].dword[i].byte[j].bit[k] = b
135
136 # bit deposit
137
138 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep, found already in v3.1 p106
139
140 do while(m < 64)
141 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
142 result = VSR[VRA+32].dword[i].bit[63-k]
143 VSR[VRT+32].dword[i].bit[63-m] = result
144 k = k + 1
145 m = m + 1
146
147 ```
148
149 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
150 {
151 uint_xlen_t r = 0;
152 for (int i = 0, j = 0; i < XLEN; i++)
153 if ((RB >> i) & 1) {
154 if ((RA >> j) & 1)
155 r |= uint_xlen_t(1) << i;
156 j++;
157 }
158 return r;
159 }
160
161 ```
162
163 # bit extract
164
165 other way round: identical to RV bext, found in v3.1 p196
166
167 ```
168 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
169 {
170 uint_xlen_t r = 0;
171 for (int i = 0, j = 0; i < XLEN; i++)
172 if ((RB >> i) & 1) {
173 if ((RA >> i) & 1)
174 r |= uint_xlen_t(1) << j;
175 j++;
176 }
177 return r;
178 }
179 ```
180
181 # centrifuge
182
183 found in v3.1 p106
184
185 ```
186 ptr0 ← 0 ptr1 ← 0 do i = 0 to 63 if((RB)i=0) then do
187 resultptr0 ← (RS)i end ptr0 ← ptr0 + 1
188 if((RB)63-i==1) then do
189 result63-ptr1 ← (RS)63-i end end ptr1 ← ptr1 + 1
190 RA ← result
191 ```
192
193 # int min/max
194
195 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
196
197 signed/unsigned min/max gives more flexibility.
198
199 ```
200 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
201 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
202 }
203 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
204 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
205 }
206 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
207 { return rs1 < rs2 ? rs1 : rs2;
208 }
209 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
210 { return rs1 > rs2 ? rs1 : rs2;
211 }
212 ```
213
214
215 # ternary bitops
216
217 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
218
219 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
220 | -- | -- | --- | --- | ----- | -------- |--|
221 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
222
223 for i in range(64):
224 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
225 RT[i] = (imm & (1<<idx)) != 0
226
227 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
228
229 a 4 operand variant which becomes more along the lines of an FPGA:
230
231 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
232 | -- | -- | --- | --- | --- | -------- |--|
233 | NN | RT | RA | RB | RC | mode 001 |Rc|
234
235 for i in range(64):
236 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
237 RT[i] = (RC & (1<<idx)) != 0
238
239 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
240 3 modes.
241
242 also, another possible variant involving swizzle and vec4:
243
244 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
245 | -- | -- | --- | ----- | ---- | ----- |--|
246 | NN | RT | RA | imm | mask | 101 |1 |
247
248 for i in range(8):
249 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
250 res = (imm & (1<<idx)) != 0
251 for j in range(3):
252 if mask[j]: RT[i+j*8] = res
253
254 another mode selection would be CRs not Ints.
255
256 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
257 | -- | -- | --- | --- |- |-----|----- | -----|--|
258 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
259
260 for i in range(4):
261 if not mask[i] continue
262 idx = crregs[BA][i] << 2 |
263 crregs[BB][i] << 1 |
264 crregs[BC][i]
265 crregs[BA][i] = (imm & (1<<idx)) != 0
266
267 # bitmask set
268
269 based on RV bitmanip singlebit set, instruction format similar to shift
270 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask rldicl but only immediate version).
271 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
272
273 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate. for the immediate version this does not matter, so a bmexti is not required.
274 bmrev however there is no direct equivalent and consequently a bmrevi is required.
275
276 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name |
277 | -- | -- | --- | --- | --- | ------- |--| ----- |
278 | NN | RT | RA | RB | RC | mode 010 |Rc| bm* |
279 | NN | RT | RA | RB | RC | 0 1 111 |Rc| bmrev |
280
281
282 ```
283 uint_xlen_t bmset(RA, RB, sh)
284 {
285 int shamt = RB & (XLEN - 1);
286 mask = (2<<sh)-1;
287 return RA | (mask << shamt);
288 }
289
290 uint_xlen_t bmclr(RA, RB, sh)
291 {
292 int shamt = RB & (XLEN - 1);
293 mask = (2<<sh)-1;
294 return RA & ~(mask << shamt);
295 }
296
297 uint_xlen_t bminv(RA, RB, sh)
298 {
299 int shamt = RB & (XLEN - 1);
300 mask = (2<<sh)-1;
301 return RA ^ (mask << shamt);
302 }
303
304 uint_xlen_t bmext(RA, RB, sh)
305 {
306 int shamt = RB & (XLEN - 1);
307 mask = (2<<sh)-1;
308 return mask & (RA >> shamt);
309 }
310 ```
311
312 bitmask extract with reverse. can be done by bitinverting all of RA and getting bits of RA from the opposite end.
313
314 ```
315 msb = rb[5:0];
316 rev[0:msb] = ra[msb:0];
317 rt = ZE(rev[msb:0]);
318
319 uint_xlen_t bmextrev(RA, RB, sh)
320 {
321 int shamt = (RB & (XLEN - 1));
322 shamt = (XLEN-1)-shamt; # shift other end
323 bra = bitreverse(RA) # swap LSB-MSB
324 mask = (2<<sh)-1;
325 return mask & (bra >> shamt);
326 }
327 ```
328
329 | 0.5|6.10|11.15|16.20|21.26| 27..30 |31| name |
330 | -- | -- | --- | --- | --- | ------- |--| ------ |
331 | NN | RT | RA | RB | sh | 0 111 |Rc| bmrevi |
332
333
334
335 # grev
336
337 based on RV bitmanip
338
339 ```
340 uint64_t grev64(uint64_t RA, uint64_t RB)
341 {
342 uint64_t x = RA;
343 int shamt = RB & 63;
344 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
345 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
346 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
347 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
348 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
349 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
350 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
351 ((x & 0xFF00FF00FF00FF00LL) >> 8);
352 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
353 ((x & 0xFFFF0000FFFF0000LL) >> 16);
354 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
355 ((x & 0xFFFFFFFF00000000LL) >> 32);
356 return x;
357 }
358
359 ```
360
361 # shuffle / unshuffle
362
363 based on RV bitmanip
364
365 ```
366 uint32_t shfl32(uint32_t RA, uint32_t RB)
367 {
368 uint32_t x = RA;
369 int shamt = RB & 15;
370 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
371 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
372 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
373 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
374 return x;
375 }
376 uint32_t unshfl32(uint32_t RA, uint32_t RB)
377 {
378 uint32_t x = RA;
379 int shamt = RB & 15;
380 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
381 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
382 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
383 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
384 return x;
385 }
386
387 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
388 {
389 uint64_t x = src & ~(maskL | maskR);
390 x |= ((src << N) & maskL) | ((src >> N) & maskR);
391 return x;
392 }
393 uint64_t shfl64(uint64_t RA, uint64_t RB)
394 {
395 uint64_t x = RA;
396 int shamt = RB & 31;
397 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
398 0x00000000ffff0000LL, 16);
399 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
400 0x0000ff000000ff00LL, 8);
401 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
402 0x00f000f000f000f0LL, 4);
403 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
404 0x0c0c0c0c0c0c0c0cLL, 2);
405 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
406 0x2222222222222222LL, 1);
407 return x;
408 }
409 uint64_t unshfl64(uint64_t RA, uint64_t RB)
410 {
411 uint64_t x = RA;
412 int shamt = RB & 31;
413 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
414 0x2222222222222222LL, 1);
415 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
416 0x0c0c0c0c0c0c0c0cLL, 2);
417 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
418 0x00f000f000f000f0LL, 4);
419 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
420 0x0000ff000000ff00LL, 8);
421 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
422 0x00000000ffff0000LL, 16);
423 return x;
424 }
425 ```
426
427 # xperm
428
429 based on RV bitmanip
430
431 ```
432 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
433 {
434 uint_xlen_t r = 0;
435 uint_xlen_t sz = 1LL << sz_log2;
436 uint_xlen_t mask = (1LL << sz) - 1;
437 for (int i = 0; i < XLEN; i += sz) {
438 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
439 if (pos < XLEN)
440 r |= ((RA >> pos) & mask) << i;
441 }
442 return r;
443 }
444 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
445 { return xperm(RA, RB, 2); }
446 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
447 { return xperm(RA, RB, 3); }
448 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
449 { return xperm(RA, RB, 4); }
450 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
451 { return xperm(RA, RB, 5); }
452 ```
453
454 # gorc
455
456 based on RV bitmanip
457
458 ```
459 uint32_t gorc32(uint32_t RA, uint32_t RB)
460 {
461 uint32_t x = RA;
462 int shamt = RB & 31;
463 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
464 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
465 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
466 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
467 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
468 return x;
469 }
470 uint64_t gorc64(uint64_t RA, uint64_t RB)
471 {
472 uint64_t x = RA;
473 int shamt = RB & 63;
474 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
475 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
476 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
477 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
478 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
479 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
480 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
481 ((x & 0xFF00FF00FF00FF00LL) >> 8);
482 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
483 ((x & 0xFFFF0000FFFF0000LL) >> 16);
484 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
485 ((x & 0xFFFFFFFF00000000LL) >> 32);
486 return x;
487 }
488
489 ```
490
491 # cmix
492
493 based on RV bitmanip, covered by ternary bitops
494
495 ```
496 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
497 return (RA & RB) | (RC & ~RB);
498 }
499 ```
500
501 # carryless mul
502
503 based on RV bitmanip
504 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
505
506 ```
507 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
508 {
509 uint_xlen_t x = 0;
510 for (int i = 0; i < XLEN; i++)
511 if ((RB >> i) & 1)
512 x ^= RA << i;
513 return x;
514 }
515 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
516 {
517 uint_xlen_t x = 0;
518 for (int i = 1; i < XLEN; i++)
519 if ((RB >> i) & 1)
520 x ^= RA >> (XLEN-i);
521 return x;
522 }
523 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
524 {
525 uint_xlen_t x = 0;
526 for (int i = 0; i < XLEN; i++)
527 if ((RB >> i) & 1)
528 x ^= RA >> (XLEN-i-1);
529 return x;
530 }
531 ```
532 # Galois Field
533
534 see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
535
536 ## Multiply
537
538 this requires 3 parameters and a "degree"
539
540 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
541
542 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
543
544 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
545 RS = GFMUL(RS, RA, gfdegree=RC, modulo=RB)
546
547 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
548 | -- | -- | --- | --- | --- | ------- |--|
549 | NN | RS | RA | RB | deg | 00 011 |Rc|
550 | NN | RS | RA | RB | RC | 11 011 |Rc|
551
552 where the SimpleV variant may override RS-as-src differently from RS-as-dest
553
554
555
556 ```
557 from functools import reduce
558
559 # constants used in the multGF2 function
560 mask1 = mask2 = polyred = None
561
562 def setGF2(degree, irPoly):
563 """Define parameters of binary finite field GF(2^m)/g(x)
564 - degree: extension degree of binary field
565 - irPoly: coefficients of irreducible polynomial g(x)
566 """
567 def i2P(sInt):
568 """Convert an integer into a polynomial"""
569 return [(sInt >> i) & 1
570 for i in reversed(range(sInt.bit_length()))]
571
572 global mask1, mask2, polyred
573 mask1 = mask2 = 1 << degree
574 mask2 -= 1
575 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
576
577 def multGF2(p1, p2):
578 """Multiply two polynomials in GF(2^m)/g(x)"""
579 p = 0
580 while p2:
581 if p2 & 1:
582 p ^= p1
583 p1 <<= 1
584 if p1 & mask1:
585 p1 ^= polyred
586 p2 >>= 1
587 return p & mask2
588
589 if __name__ == "__main__":
590
591 # Define binary field GF(2^3)/x^3 + x + 1
592 setGF2(3, 0b1011)
593
594 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
595 print("{:02x}".format(multGF2(0b111, 0b101)))
596
597 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
598 # (used in the Advanced Encryption Standard-AES)
599 setGF2(8, 0b100011011)
600
601 # Evaluate the product (x^7)(x^7 + x + 1)
602 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
603 ```
604 ## GF add
605
606 RS = GFADDI(RS, RA|0, gfdegree, modulo=RB)
607 RS = GFADD(RS, RA|0, gfdegree=RC, modulo=RB)
608
609 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name |
610 | -- | -- | --- | --- | --- | ------- |--| ----- |
611 | NN | RS | RA | RB | deg | 0 1 011 |Rc| gfaddi |
612 | NN | RS | RA | RB | RC | 1 1 111 |Rc| gfadd |
613
614 GFMOD is a pseudo-op where RA=0
615
616 ## gf invert
617
618 ```
619 def gf_degree(a) :
620 res = 0
621 a >>= 1
622 while (a != 0) :
623 a >>= 1;
624 res += 1;
625 return res
626
627 def gf_invert(a, mod=0x1B) :
628 v = mod
629 g1 = 1
630 g2 = 0
631 j = gf_degree(a) - 8
632
633 while (a != 1) :
634 if (j < 0) :
635 a, v = v, a
636 g1, g2 = g2, g1
637 j = -j
638
639 a ^= v << j
640 g1 ^= g2 << j
641
642 a %= 256 # Emulating 8-bit overflow
643 g1 %= 256 # Emulating 8-bit overflow
644
645 j = gf_degree(a) - gf_degree(v)
646
647 return g1
648 ```
649
650 # bitmatrix
651
652 ```
653 uint64_t bmatflip(uint64_t RA)
654 {
655 uint64_t x = RA;
656 x = shfl64(x, 31);
657 x = shfl64(x, 31);
658 x = shfl64(x, 31);
659 return x;
660 }
661 uint64_t bmatxor(uint64_t RA, uint64_t RB)
662 {
663 // transpose of RB
664 uint64_t RBt = bmatflip(RB);
665 uint8_t u[8]; // rows of RA
666 uint8_t v[8]; // cols of RB
667 for (int i = 0; i < 8; i++) {
668 u[i] = RA >> (i*8);
669 v[i] = RBt >> (i*8);
670 }
671 uint64_t x = 0;
672 for (int i = 0; i < 64; i++) {
673 if (pcnt(u[i / 8] & v[i % 8]) & 1)
674 x |= 1LL << i;
675 }
676 return x;
677 }
678 uint64_t bmator(uint64_t RA, uint64_t RB)
679 {
680 // transpose of RB
681 uint64_t RBt = bmatflip(RB);
682 uint8_t u[8]; // rows of RA
683 uint8_t v[8]; // cols of RB
684 for (int i = 0; i < 8; i++) {
685 u[i] = RA >> (i*8);
686 v[i] = RBt >> (i*8);
687 }
688 uint64_t x = 0;
689 for (int i = 0; i < 64; i++) {
690 if ((u[i / 8] & v[i % 8]) != 0)
691 x |= 1LL << i;
692 }
693 return x;
694 }
695
696 ```