conferences/siliconsalon2023/siliconsalon2023.tex

   1 \documentclass[slidestop]{beamer}
   2 \usepackage{beamerthemesplit}
   3 \usepackage{graphics}
   4 \usepackage{pstricks}
   5
   6 \graphicspath{{./}}
   7
   8 \title{Big Integer Arithmetic Instruction design}
   9 \author{Luke Kenneth Casson Leighton}
  10
  11
  12 \begin{document}
  13
  14 \frame{
  15    \begin{center}
  16     \huge{Big Integer Arithmetic Instruction design}\\
  17     \vspace{32pt}
  18     \Large{An analysis of big-integer arithmetic instructions}\\
  19     \Large{(why not to put all eggs in Custom Silicon basket)}\\
  20     \vspace{24pt}
  21     \Large{Silicon Salon 2023}\\
  22     \vspace{16pt}
  23     \large{Sponsored by NLnet's Assure Programme}\\
  24     \vspace{6pt}
  25     \large{\today}
  26   \end{center}
  27 }
  28
  29
  30 \frame{\frametitle{Who are we?}
  31
  32  \begin{itemize}
  33    \item Libre-SOC: a fully Libre Project with the goal of creating
  34                 a Hybrid 3D CPU-VPU-GPU including designing a powerful Vector
  35                 Extension (for the Power ISA). https://libre-soc.org
  36         \vspace{6pt}
  37    \item RED Semiconductor Ltd: a commercial realisation of Libre-SOC
  38                  designs. https://redsemiconductor.com
  39         \vspace{6pt}
  40    \item Libre-SOC researches and designs instructions that are then
  41                  proposed to the OpenPOWER Foundation ISA Technical Workgroup;
  42                  RED Semiconductor (as an OPF ISA WG Voting Member) then keeps
  43                  an eye on the RFC.
  44         \vspace{6pt}
  45    \item RED Semiconductor Ltd seeks VC funding and commercial business
  46                 propositions, Libre-SOC covers Research.
  47         \vspace{6pt}
  48
  49   \end{itemize}
  50 }
  51
  52
  53 \frame{\frametitle{What are the challenges faced by Biginteger?}
  54
  55  \begin{itemize}
  56    \item Algorithms especially post-quantum are now fast-moving. This
  57                  does not go down well! It typically takes 5-10 years for an
  58                  algorithm to become "trustable".
  59         \vspace{6pt}
  60    \item Custom Cryptographic Hardware will typically take 3 years from
  61                  design concept to first production silicon: Certification even longer.
  62                  If a fault is found in the algorithm, the entire investment is wasted.
  63         \vspace{6pt}
  64    \item Performance on 32-bit and 64-bit Embedded Hardware sucks. Algorithms
  65                  are roughly O(N\textsuperscript{2}) which wreaks havoc. The
  66                  temptation therefore is to add SIMD instructions or dedicated
  67                  "custom" instructions which makes the problem worse.
  68         \vspace{6pt}
  69    \item So how can these polar opposites be solved?
  70         \vspace{6pt}
  71   \end{itemize}
  72 }
  73
  74
  75 \begin{frame}[fragile]\frametitle{Go back to the algorithms.}
  76
  77  \begin{itemize}
  78    \item https://libre-soc.org/openpower/sv/biginteger/analysis/
  79    \item Starting with Knuth's Algorithm D and M, if a True-Scalable
  80                  Vector ISA can cope with those, chances are good it'll cope
  81                  with more (Karatsuba, and so on).
  82    \item SVP64 has "looping" as a primary construct: \\
  83                 loop i 0..VL-1: GPR(RT+i) = ADD(GPR(RA+i), GPR(RB+i)\\
  84         \vspace{1pt}
  85    \item If however Carry-in and Carry-out are included in that, we
  86                 have arbitrary-length Big-Integer Vector Add!
  87    \item For all other operations as long as Scalar-Vector is ok,
  88                 it turns out to be possible to do 64-bit carry-in and
  89                 64-bit carry-out, without significant hardware disruption.
  90    \item Irony: all relevant Scalar instructions (shift, mul, div)
  91                 usually drop 1/2 the result on the floor!
  92   \end{itemize}
  93
  94 \end{frame}
  95
  96 \begin{frame}[fragile]\frametitle{Turning add-with-carry into Vector-Add}
  97
  98  \begin{itemize}
  99    \item Add-with-Carry is the building-block of larger operations
 100    \item Let's simply chain them together.
 101    \item sv.adde (Add-Carry with Vector loop) creates chains
 102   \end{itemize}
 103
 104   \begin{verbatim}
 105         R0,CA = A0+B0+CA  adde r0,a0,b0
 106             |
 107             +----------+
 108                        |
 109         R1,CA = A1+B1+CA  adde r1,a1,b1
 110             |
 111             +----------+
 112                        |
 113         R2,CA = A2+B2+CA  adde r2,a2,b2
 114   \end{verbatim}
 115
 116 \end{frame}
 117
 118 \begin{frame}[fragile]\frametitle{Vector-Scalar Shift}
 119
 120  \begin{itemize}
 121    \item Shift by 64-bit is just "pick a register"
 122    \item Add a 2nd input register with what needs to be shifted IN\\
 123          (64-bit carry in)
 124    \item Add 2nd output saving what normally gets thrown away\\
 125                 (64-bit carry-out)
 126    \item Again: a chain of these performs Vector-by-Scalar shift
 127   \end{itemize}
 128
 129   \begin{verbatim}
 130   brs(uint64_t s, uint64_t r[], uint64_t un[], int n) {
 131       for (int i = 0; i < n - 1; i++)
 132           r[i] = (un[i] >> s) | (un[i + 1] << (64 - s));
 133       r[n - 1] = un[n - 1] >> s;
 134   }
 135   \end{verbatim}
 136
 137 \end{frame}
 138
 139 \begin{frame}[fragile]\frametitle{Vector-Scalar Multiply}
 140
 141  \begin{itemize}
 142    \item Normally in MAC the top 64-bits is thrown away.
 143    \item What if we stored those 64-bits in a 2nd register?\\
 144                 (64-bit carry-out)
 145    \item And what if the next MAC added that "digit" on?\\
 146                 (64-bit carry-in)
 147    \item Again: a chain of these performs Vector-by-Scalar Multiply
 148   \end{itemize}
 149
 150   \begin{verbatim}
 151       RT0, RC0 = RA0 * RB0 + 0
 152             |
 153             +----------------+
 154                              |
 155       RT1, RC1 = RA1 * RB1 + RC0
 156             |
 157             +----------------+
 158                              |
 159       RT2, RC2 = RA2 * RB2 + RC1
 160   \end{verbatim}
 161
 162 \end{frame}
 163
 164 \begin{frame}[fragile]\frametitle{Vector-Scalar Divide}
 165
 166  \begin{itemize}
 167    \item Same story. special-case for overflow.
 168   \end{itemize}
 169
 170   \begin{verbatim}
 171       RT0      = ((  0<<64) | RA0) / RB0
 172            RC0 = ((  0<<64) | RA0) % RB0
 173             |
 174             +-------+
 175                     |
 176       RT1      = ((RC0<<64) | RA1) / RB1
 177            RC1 = ((RC0<<64) | RA1) % RB1
 178             |
 179             +-------+
 180                     |
 181       RT2      = ((RC1<<64) | RA2) / RB2
 182            RC2 = ((RC1<<64) | RA2) % RB2
 183   \end{verbatim}
 184
 185 \end{frame}
 186
 187 \frame{\frametitle{Summary so far}
 188
 189  \begin{itemize}
 190    \item Extending the usual 1-bit Carry-in Carry-out to 64-bit and
 191                  adding a loop-construct inherently turns Scalar operations
 192                  into arbitrary-length Vectorised ones
 193    \item Irony: 30 years ago Power ISA actually had a "Carry SPR", where
 194                  the normally-discarded upper half of multiply would be placed in
 195                  that SPR (it was deprecated).
 196    \item Hardware is NOT made more complex because in all shift multiply
 197                  and divide operations these bits are discarded in other ISAs,
 198                  which is why you end up with complex carry workarounds. This
 199                  gives ISAs a "bad rep" for doing Big-int
 200    \item The "complication" is that you need 3-in 2-out instructions,
 201                  but actually in Micro-code you can do operand-forwarding.
 202                  1st op: 3-in 1-out. chain: 2-in 1-out. Last: 2-in 2-out.
 203   \end{itemize}
 204 }
 205
 206 \frame{\frametitle{OpenTITAN}
 207
 208  \begin{itemize}
 209    \item https://opentitan.org/book/hw/ip/otbn/index.html
 210    \item 256b wide data path with 32 256b wide registers
 211    \item Zero-Overhead Loop Control would have been better\\
 212    https://ieeexplore.ieee.org/abstract/document/1692906/
 213    \item Formal verification completion time is a factor of the operation
 214                 bit-width.  256-bit unlikely to be reasonable time.
 215    \item 256-bit is great for EC25519 but for RSA (etc.) you run
 216                 into exactly the same problem as a Scalar ISA, made worse.
 217    \item Opportunities to optimise algorithms not possible (efficient
 218          power-optimised Karatsuba, etc.)
 219   \end{itemize}
 220 }
 221
 222 \begin{frame}[fragile]\frametitle{OpenTITAN shift}
 223
 224  \begin{itemize}
 225    \item Immediate-only. what about shift-by-reg?
 226    \item merges 2 operands, still not chainable.
 227    \item needs a copy of the vector input (double number of regs)
 228    \item needs massive 256-bit shifter! 8 layers of muxes!
 229   \end{itemize}
 230
 231   \begin{verbatim}
 232   a = WDRs[wrs1]
 233   b = WDRs[wrs2]
 234
 235   result = (((a << 256) | b) >> imm) & ((1 << 256) - 1)
 236   WDRs[wrd] = result
 237   \end{verbatim}
 238
 239 \end{frame}
 240
 241 \begin{frame}[fragile]\frametitle{Draft Double-Shift}
 242
 243  \begin{itemize}
 244    \item Remarkably similar to x86 dsld
 245    \item Does not need 128-bit ROT: simple mod to existing hardware
 246    \item Hardware may macro-op fuse Vector-shift for better efficiency
 247    \item Chainable and in-place (no copy of vector needed).
 248   \end{itemize}
 249
 250   \begin{verbatim}
 251     n <- (RB)[58:63]    # Power ISA MSB0 numbering. sigh
 252     v <- ROTL64((RA), n)
 253     mask <- MASK(0, 63-n)
 254     RT <- (v[0:63] & mask) | ((RC) & ~mask)
 255     RS <- v[0:63] & ~mask
 256   \end{verbatim}
 257
 258 \end{frame}
 259
 260 \frame{\frametitle{Conclusion}
 261
 262  \begin{itemize}
 263    \item We went back to the algorithms (Knuth D and M) and examined
 264                  what they are trying to achieve.
 265    \item Turns out they need a 64-bit carry-in and carry-out
 266    \item Keeping to 64-bit maximum hardware means Formal Proofs complete
 267                  in reasonable time (less than heat-death of universe)
 268    \item Reasonably straightforward: creates and uses partial results
 269                  normally thrown away (needing more instructions)
 270    \item Freaks out pure-RISC proponents (3-in 2-out) but look at the
 271                  number of instructions (and temporary registers) needed
 272                  otherwise, and the overall algorithm efficiency, and the
 273                  case for these instructions is clear.
 274         \item They also speed up \textbf{general-purpose} code
 275   \end{itemize}
 276 }
 277
 278 \frame{
 279   \begin{center}
 280     {\Huge The end\\
 281                    Thank you\\
 282                    Questions?\\\vspace{5pt}
 283         }
 284   \end{center}
 285
 286   \begin{itemize}
 287         \item https://redsemiconductor.com
 288         \item Discussion: http://lists.libre-soc.org
 289         \item Libera.Chat IRC \#libre-soc
 290         \item http://libre-soc.org/
 291         \item http://nlnet.nl/assure
 292         \item https://libre-soc.org/nlnet/\#faq
 293   \end{itemize}
 294 }
 295
 296
 297 \end{document}