openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # bitmanipulation
   4
   5 **DRAFT STATUS**
   6
   7 this extension amalgamates bitnanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
   8 Vectorisation Context is provided by [[openpower/sv]].
   9
  10
  11 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD".  It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs.  Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
  12
  13 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science.  however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added.  The expectation is that these operations would all be covered by the same pipeline.
  14
  15 note that there are brownfield spaces below that could incorporate some of the set-before-first and other operations.
  16
  17 # summary
  18
  19 minor opcode allocation
  20
  21     |  28.30 |31| name      |
  22     | ------ |--| --------- |
  23     |   00   |Rc| ternaryi  |
  24     |  001   |Rc| ternary   |
  25     |  010   |Rc| bitmask   |
  26     |  011   |Rc| gf*       |
  27     |  101   |1 | ternaryv  |
  28     |  101   |0 | ternarycr |
  29     |  110   |Rc| 1/2-op    |
  30     |  111   |Rc| 3-op      |
  31
  32 1-op and variants
  33
  34 | dest | src1 | subop | op       |
  35 | ---- | ---- | ----- | -------- |
  36 | RT   | RA   | ..    | bmatflip |
  37
  38 2-op and variants
  39
  40 | dest | src1 | src2 | subop | op       |
  41 | ---- | ---- | ---- | ----- | -------- |
  42 | RT   | RA   | RB   | or    | bmatflip |
  43 | RT   | RA   | RB   | xor   | bmatflip |
  44 | RT   | RA   | RB   | bdep  | dep/ext  |
  45 | RT   | RA   | RB   | bext  | dep/ext  |
  46 | RT   | RA   | RB   |       | grev  |
  47 | RT   | RA   | RB   |       | clmul*  |
  48 | RT   | RA   | RB   |       | gorc |
  49 | RT   | RA   | RB   | shuf  | shuffle |
  50 | RT   | RA   | RB   | unshuf| shuffle |
  51 | RT   | RA   | RB   | width | xperm  |
  52 | RT   | RA   | RB   | type | minmax |
  53 | RT   | RA   | RB   |  |  |
  54 | RT   | RA   | RB   |  |  |
  55 | RT   | RA   | RB   |  |  |
  56
  57 3 ops
  58
  59 * bitmask set/extract
  60 * ternary bitops
  61 * GF
  62
  63 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  64 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  65 | NN | RT | RA  | RB  | RC    | mode 001 |Rc| ternary |
  66 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc| ternaryi |
  67 | NN | RS | RA  | RB  | RC    | 00  011  |Rc| gfmul |
  68 | NN | RS | RA  | RB  | RC    | 01  011  |Rc| gfadd |
  69 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  70 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gfmuli |
  71 | NN | RS | RA  | RB  | deg   | 11  111  |Rc| gfaddi |
  72
  73 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  74 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  75 | NN | RT | RA  | imm   | mask | 101   |1 | ternaryv |
  76
  77 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  78 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  79 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternarycr |
  80
  81 ops
  82
  83 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  84 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  85 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
  86 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
  87 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
  88 | NN | RA | RB  |     |       | 1  | 0100 110 |Rc| rsvd |
  89 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
  90 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  91 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  92 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  93 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  94 | NN | RA | RB  |     |       | 0  | 0001 110 |Rc| rsvd |
  95 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
  96 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
  97 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
  98 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
  99 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
 100 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
 101 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
 102 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
 103 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
 104 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
 105 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
 106 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
 107 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
 108 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
 109 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
 110 | NN | RA | RB  | RC  | 10    | 0  | 1110 110 |Rc| bdep   |
 111 | NN | RA | RB  | RC  | 10    | 1  | 1110 110 |Rc| bext  |
 112 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
 113 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
 114 | NN | RA | RB  |     |       |    | NN11 110 |Rc| rsvd  |
 115
 116 # count leading/trailing zeros with mask
 117
 118 in v3.1 p105
 119
 120 ```
 121 count = 0
 122 do i = 0 to 63 if((RB)i=1) then do
 123 if((RS)i=1) then break end end count ← count + 1
 124 RA ← EXTZ64(count)
 125 ```
 126
 127 # bit to byte permute
 128
 129 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 130
 131     do j = 0 to 7
 132       do k = 0 to 7
 133          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 134          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 135
 136 #  bit deposit
 137
 138 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep, found already in v3.1 p106
 139
 140     do while(m < 64)
 141        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 142           result = VSR[VRA+32].dword[i].bit[63-k]
 143           VSR[VRT+32].dword[i].bit[63-m] = result
 144           k = k + 1
 145        m = m + 1
 146
 147 ```
 148
 149 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
 150 {
 151     uint_xlen_t r = 0;
 152     for (int i = 0, j = 0; i < XLEN; i++)
 153         if ((RB >> i) & 1) {
 154             if ((RA >> j) & 1)
 155                 r |= uint_xlen_t(1) << i;
 156             j++;
 157         }
 158     return r;
 159 }
 160
 161 ```
 162
 163 # bit extract
 164
 165 other way round: identical to RV bext, found in v3.1 p196
 166
 167 ```
 168 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 169 {
 170     uint_xlen_t r = 0;
 171     for (int i = 0, j = 0; i < XLEN; i++)
 172         if ((RB >> i) & 1) {
 173             if ((RA >> i) & 1)
 174                 r |= uint_xlen_t(1) << j;
 175             j++;
 176         }
 177     return r;
 178 }
 179 ```
 180
 181 # centrifuge
 182
 183 found in v3.1 p106
 184
 185 ```
 186 ptr0 ← 0 ptr1 ← 0 do i = 0 to 63 if((RB)i=0) then do
 187 resultptr0 ← (RS)i end ptr0 ← ptr0 + 1
 188 if((RB)63-i==1) then do
 189 result63-ptr1 ← (RS)63-i end end ptr1 ← ptr1 + 1
 190 RA ← result
 191 ```
 192
 193 # int min/max
 194
 195 signed and unsigned min/max for integer.  this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
 196
 197 signed/unsigned min/max gives more flexibility.
 198
 199 ```
 200 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
 201 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
 202 }
 203 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
 204 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
 205 }
 206 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
 207 { return rs1 < rs2 ? rs1 : rs2;
 208 }
 209 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
 210 { return rs1 > rs2 ? rs1 : rs2;
 211 }
 212 ```
 213
 214
 215 # ternary bitops
 216
 217 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
 218
 219 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 220 | -- | -- | --- | --- | ----- | -------- |--|
 221 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 222
 223     for i in range(64):
 224         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 225         RT[i] = (imm & (1<<idx)) != 0
 226
 227 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 228
 229 a 4 operand variant which becomes more along the lines of an FPGA:
 230
 231 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 232 | -- | -- | --- | --- | --- | -------- |--|
 233 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 234
 235     for i in range(64):
 236         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 237         RT[i] = (RC & (1<<idx)) != 0
 238
 239 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 240 3 modes.
 241
 242 also, another possible variant involving swizzle and vec4:
 243
 244 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 245 | -- | -- | --- | ----- | ---- | ----- |--|
 246 | NN | RT | RA  | imm   | mask | 101   |1 |
 247
 248     for i in range(8):
 249         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 250         res = (imm & (1<<idx)) != 0
 251         for j in range(3):
 252              if mask[j]: RT[i+j*8] = res
 253
 254 another mode selection would be CRs not Ints.
 255
 256 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 257 | -- | -- | --- | --- |- |-----|----- | -----|--|
 258 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 259
 260     for i in range(4):
 261         if not mask[i] continue
 262         idx = crregs[BA][i] << 2 |
 263               crregs[BB][i] << 1 |
 264               crregs[BC][i]
 265         crregs[BA][i] = (imm & (1<<idx)) != 0
 266
 267 # bitmask set
 268
 269 based on RV bitmanip singlebit set, instruction format similar to shift
 270 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 271 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 272
 273 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter, so a bmexti is not required.
 274 bmrev however there is no direct equivalent and consequently a bmrevi is required.
 275
 276 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 277 | -- | -- | --- | --- | --- | ------- |--| ----- |
 278 | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
 279 | NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
 280
 281
 282 ```
 283 uint_xlen_t bmset(RA, RB, sh)
 284 {
 285     int shamt = RB & (XLEN - 1);
 286     mask = (2<<sh)-1;
 287     return RA | (mask << shamt);
 288 }
 289
 290 uint_xlen_t bmclr(RA, RB, sh)
 291 {
 292     int shamt = RB & (XLEN - 1);
 293     mask = (2<<sh)-1;
 294     return RA & ~(mask << shamt);
 295 }
 296
 297 uint_xlen_t bminv(RA, RB, sh)
 298 {
 299     int shamt = RB & (XLEN - 1);
 300     mask = (2<<sh)-1;
 301     return RA ^ (mask << shamt);
 302 }
 303
 304 uint_xlen_t bmext(RA, RB, sh)
 305 {
 306     int shamt = RB & (XLEN - 1);
 307     mask = (2<<sh)-1;
 308     return mask & (RA >> shamt);
 309 }
 310 ```
 311
 312 bitmask extract with reverse.  can be done by bitinverting all of RA and getting bits of RA from the opposite end.
 313
 314 ```
 315 msb = rb[5:0];
 316 rev[0:msb] = ra[msb:0];
 317 rt = ZE(rev[msb:0]);
 318
 319 uint_xlen_t bmextrev(RA, RB, sh)
 320 {
 321     int shamt = (RB & (XLEN - 1));
 322     shamt = (XLEN-1)-shamt;  # shift other end
 323     bra = bitreverse(RA)     # swap LSB-MSB
 324     mask = (2<<sh)-1;
 325     return mask & (bra >> shamt);
 326 }
 327 ```
 328
 329 | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
 330 | -- | -- | --- | --- | --- | ------- |--| ------ |
 331 | NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
 332
 333
 334
 335 # grev
 336
 337 based on RV bitmanip
 338
 339 ```
 340 uint64_t grev64(uint64_t RA, uint64_t RB)
 341 {
 342     uint64_t x = RA;
 343     int shamt = RB & 63;
 344     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 345                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 346     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 347                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 348     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 349                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 350     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 351                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 352     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 353                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 354     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 355                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 356     return x;
 357 }
 358
 359 ```
 360
 361 # shuffle / unshuffle
 362
 363 based on RV bitmanip
 364
 365 ```
 366 uint32_t shfl32(uint32_t RA, uint32_t RB)
 367 {
 368     uint32_t x = RA;
 369     int shamt = RB & 15;
 370     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 371     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 372     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 373     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 374     return x;
 375 }
 376 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 377 {
 378     uint32_t x = RA;
 379     int shamt = RB & 15;
 380     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 381     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 382     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 383     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 384     return x;
 385 }
 386
 387 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 388 {
 389     uint64_t x = src & ~(maskL | maskR);
 390     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 391     return x;
 392 }
 393 uint64_t shfl64(uint64_t RA, uint64_t RB)
 394 {
 395     uint64_t x = RA;
 396     int shamt = RB & 31;
 397     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 398                                            0x00000000ffff0000LL, 16);
 399     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 400                                            0x0000ff000000ff00LL, 8);
 401     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 402                                            0x00f000f000f000f0LL, 4);
 403     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 404                                            0x0c0c0c0c0c0c0c0cLL, 2);
 405     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 406                                            0x2222222222222222LL, 1);
 407     return x;
 408 }
 409 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 410 {
 411     uint64_t x = RA;
 412     int shamt = RB & 31;
 413     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 414                                            0x2222222222222222LL, 1);
 415     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 416                                            0x0c0c0c0c0c0c0c0cLL, 2);
 417     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 418                                            0x00f000f000f000f0LL, 4);
 419     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 420                                            0x0000ff000000ff00LL, 8);
 421     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 422                                            0x00000000ffff0000LL, 16);
 423     return x;
 424 }
 425 ```
 426
 427 # xperm
 428
 429 based on RV bitmanip
 430
 431 ```
 432 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 433 {
 434     uint_xlen_t r = 0;
 435     uint_xlen_t sz = 1LL << sz_log2;
 436     uint_xlen_t mask = (1LL << sz) - 1;
 437     for (int i = 0; i < XLEN; i += sz) {
 438         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 439         if (pos < XLEN)
 440             r |= ((RA >> pos) & mask) << i;
 441     }
 442     return r;
 443 }
 444 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 445 {  return xperm(RA, RB, 2); }
 446 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 447 {  return xperm(RA, RB, 3); }
 448 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 449 {  return xperm(RA, RB, 4); }
 450 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 451 {  return xperm(RA, RB, 5); }
 452 ```
 453
 454 # gorc
 455
 456 based on RV bitmanip
 457
 458 ```
 459 uint32_t gorc32(uint32_t RA, uint32_t RB)
 460 {
 461     uint32_t x = RA;
 462     int shamt = RB & 31;
 463     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 464     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 465     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 466     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 467     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 468     return x;
 469 }
 470 uint64_t gorc64(uint64_t RA, uint64_t RB)
 471 {
 472     uint64_t x = RA;
 473     int shamt = RB & 63;
 474     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 475                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 476     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 477                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 478     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 479                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 480     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 481                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 482     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 483                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 484     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 485                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 486     return x;
 487 }
 488
 489 ```
 490
 491 # cmix
 492
 493 based on RV bitmanip, covered by ternary bitops
 494
 495 ```
 496 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 497     return (RA & RB) | (RC & ~RB);
 498 }
 499 ```
 500
 501 # carryless mul
 502
 503 based on RV bitmanip
 504 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 505
 506 ```
 507 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 508 {
 509     uint_xlen_t x = 0;
 510     for (int i = 0; i < XLEN; i++)
 511         if ((RB >> i) & 1)
 512             x ^= RA << i;
 513     return x;
 514 }
 515 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 516 {
 517     uint_xlen_t x = 0;
 518     for (int i = 1; i < XLEN; i++)
 519         if ((RB >> i) & 1)
 520             x ^= RA >> (XLEN-i);
 521     return x;
 522 }
 523 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 524 {
 525     uint_xlen_t x = 0;
 526     for (int i = 0; i < XLEN; i++)
 527         if ((RB >> i) & 1)
 528             x ^= RA >> (XLEN-i-1);
 529     return x;
 530 }
 531 ```
 532 # Galois Field
 533
 534 see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
 535
 536 ## Multiply
 537
 538 this requires 3 parameters and a "degree"
 539
 540     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 541
 542 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 543
 544     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 545     RS = GFMUL(RS, RA, gfdegree=RC, modulo=RB)
 546
 547 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 548 | -- | -- | --- | --- | --- | ------- |--|
 549 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 550 | NN | RS | RA  | RB  | RC  | 11  011 |Rc|
 551
 552 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 553
 554
 555
 556 ```
 557 from functools import reduce
 558
 559 # constants used in the multGF2 function
 560 mask1 = mask2 = polyred = None
 561
 562 def setGF2(degree, irPoly):
 563     """Define parameters of binary finite field GF(2^m)/g(x)
 564        - degree: extension degree of binary field
 565        - irPoly: coefficients of irreducible polynomial g(x)
 566     """
 567     def i2P(sInt):
 568         """Convert an integer into a polynomial"""
 569         return [(sInt >> i) & 1
 570                 for i in reversed(range(sInt.bit_length()))]
 571
 572     global mask1, mask2, polyred
 573     mask1 = mask2 = 1 << degree
 574     mask2 -= 1
 575     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 576
 577 def multGF2(p1, p2):
 578     """Multiply two polynomials in GF(2^m)/g(x)"""
 579     p = 0
 580     while p2:
 581         if p2 & 1:
 582             p ^= p1
 583         p1 <<= 1
 584         if p1 & mask1:
 585             p1 ^= polyred
 586         p2 >>= 1
 587     return p & mask2
 588
 589 if __name__ == "__main__":
 590
 591     # Define binary field GF(2^3)/x^3 + x + 1
 592     setGF2(3, 0b1011)
 593
 594     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 595     print("{:02x}".format(multGF2(0b111, 0b101)))
 596
 597     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 598     # (used in the Advanced Encryption Standard-AES)
 599     setGF2(8, 0b100011011)
 600
 601     # Evaluate the product (x^7)(x^7 + x + 1)
 602     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 603 ```
 604 ## GF add
 605
 606     RS = GFADDI(RS, RA|0, gfdegree, modulo=RB)
 607     RS = GFADD(RS, RA|0, gfdegree=RC, modulo=RB)
 608
 609 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 610 | -- | -- | --- | --- | --- | ------- |--| ----- |
 611 | NN | RS | RA  | RB  | deg | 0 1  011 |Rc| gfaddi |
 612 | NN | RS | RA  | RB  | RC  | 1 1  111 |Rc| gfadd |
 613
 614 GFMOD is a pseudo-op where RA=0
 615
 616 ## gf invert
 617
 618 ```
 619 def gf_degree(a) :
 620   res = 0
 621   a >>= 1
 622   while (a != 0) :
 623     a >>= 1;
 624     res += 1;
 625   return res
 626
 627 def gf_invert(a, mod=0x1B) :
 628   v = mod
 629   g1 = 1
 630   g2 = 0
 631   j = gf_degree(a) - 8
 632
 633   while (a != 1) :
 634     if (j < 0) :
 635       a, v = v, a
 636       g1, g2 = g2, g1
 637       j = -j
 638
 639     a ^= v << j
 640     g1 ^= g2 << j
 641
 642     a %= 256  # Emulating 8-bit overflow
 643     g1 %= 256 # Emulating 8-bit overflow
 644
 645     j = gf_degree(a) - gf_degree(v)
 646
 647   return g1
 648 ```
 649
 650 # bitmatrix
 651
 652 ```
 653 uint64_t bmatflip(uint64_t RA)
 654 {
 655     uint64_t x = RA;
 656     x = shfl64(x, 31);
 657     x = shfl64(x, 31);
 658     x = shfl64(x, 31);
 659     return x;
 660 }
 661 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 662 {
 663     // transpose of RB
 664     uint64_t RBt = bmatflip(RB);
 665     uint8_t u[8]; // rows of RA
 666     uint8_t v[8]; // cols of RB
 667     for (int i = 0; i < 8; i++) {
 668         u[i] = RA >> (i*8);
 669         v[i] = RBt >> (i*8);
 670     }
 671     uint64_t x = 0;
 672     for (int i = 0; i < 64; i++) {
 673         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 674             x |= 1LL << i;
 675     }
 676     return x;
 677 }
 678 uint64_t bmator(uint64_t RA, uint64_t RB)
 679 {
 680     // transpose of RB
 681     uint64_t RBt = bmatflip(RB);
 682     uint8_t u[8]; // rows of RA
 683     uint8_t v[8]; // cols of RB
 684     for (int i = 0; i < 8; i++) {
 685         u[i] = RA >> (i*8);
 686         v[i] = RBt >> (i*8);
 687     }
 688     uint64_t x = 0;
 689     for (int i = 0; i < 64; i++) {
 690         if ((u[i / 8] & v[i % 8]) != 0)
 691             x |= 1LL << i;
 692     }
 693     return x;
 694 }
 695
 696 ```