add siliconsalon2023 latex slides
author    Luke Kenneth Casson Leighton <lkcl@lkcl.net>
          Wed, 3 May 2023 10:51:46 +0000 (11:51 +0100)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
          Wed, 3 May 2023 10:51:50 +0000 (11:51 +0100)
conferences/siliconsalon2023/siliconsalon2023.tex [new file with mode: 0644]

diff --git a/conferences/siliconsalon2023/siliconsalon2023.tex b/conferences/siliconsalon2023/siliconsalon2023.tex
new file mode 100644 (file)
index 0000000..2a9956b
--- /dev/null
@@ -0,0 +1,258 @@
+\documentclass[slidestop]{beamer}
+\usepackage{beamerthemesplit}
+\usepackage{graphics}
+\usepackage{pstricks}
+
+\graphicspath{{./}}
+
+\title{Big Integer Arithmetic Instruction design}
+\author{Luke Kenneth Casson Leighton}
+
+
+\begin{document}
+
+\frame{
+   \begin{center}
+    \huge{Big Integer Arithmetic Instruction design}\\
+    \vspace{32pt}
+    \Large{An analysis of big-integer arithmetic instructions}\\
+    \Large{(why not to put all your eggs in the Custom Silicon basket)}\\
+    \vspace{24pt}
+    \Large{Silicon Salon 2023}\\
+    \vspace{16pt}
+    \large{Sponsored by NLnet's Assure Programme}\\
+    \vspace{6pt}
+    \large{\today}
+  \end{center}
+}
+
+
+\frame{\frametitle{Who are we?}
+
+ \begin{itemize}
+   \item Libre-SOC: a fully Libre Project with the goal of creating
+               a Hybrid 3D CPU-VPU-GPU including designing a powerful Vector
+               Extension (for the Power ISA). https://libre-soc.org
+        \vspace{6pt}
+   \item RED Semiconductor Ltd: a commercial realisation of Libre-SOC
+                designs. https://redsemiconductor.com
+        \vspace{6pt}
+   \item Libre-SOC researches and designs instructions that are then
+                proposed to the OpenPOWER Foundation ISA Technical Workgroup;
+                RED Semiconductor (as an OPF ISA WG Voting Member) then keeps
+                an eye on the RFC.
+        \vspace{6pt}
+   \item RED Semiconductor Ltd seeks VC funding and commercial business
+               propositions; Libre-SOC covers the Research.
+        \vspace{6pt}
+
+  \end{itemize}
+}
+
+
+\frame{\frametitle{What are the challenges faced by Big-Integer arithmetic?}
+
+ \begin{itemize}
+   \item Algorithms, especially post-quantum ones, are now fast-moving.
+                This does not sit well with hardware timescales: it typically
+                takes 5-10 years for an algorithm to become ``trustable''.
+        \vspace{6pt}
+   \item Custom Cryptographic Hardware will typically take 3 years from
+                design concept to first production silicon: Certification even longer.
+                If a fault is found in the algorithm, the entire investment is wasted.
+        \vspace{6pt}
+   \item Performance on 32-bit and 64-bit Embedded Hardware sucks. Algorithms
+                are roughly O(N\textsuperscript{2}), which wreaks havoc. The
+                temptation therefore is to add SIMD instructions or dedicated
+                ``custom'' instructions, which only makes the problem worse.
+        \vspace{6pt}
+   \item So how can these polar opposites be solved?
+        \vspace{6pt}
+  \end{itemize}
+}
+
+
+\begin{frame}[fragile]\frametitle{Go back to the algorithms.}
+
+ \begin{itemize}
+   \item https://libre-soc.org/openpower/sv/biginteger/analysis/
+   \item Starting with Knuth's Algorithms D and M: if a True-Scalable
+                Vector ISA can cope with those, the chances are good that it
+                will cope with more (Karatsuba, and so on).
+   \item SVP64 has ``looping'' as a primary construct
+               (C model on the next slide): \\
+               loop i 0..VL-1: GPR(RT+i) = ADD(GPR(RA+i), GPR(RB+i))\\
+        \vspace{1pt}
+   \item If however Carry-in and Carry-out are included in that, we
+               have arbitrary-length Big-Integer Vector Add!
+   \item For all other operations, as long as Vector-by-Scalar is acceptable,
+               it turns out to be possible to provide 64-bit carry-in and
+               64-bit carry-out without significant hardware disruption.
+   \item Irony: all relevant Scalar instructions (shift, mul, div)
+               usually drop half of the result on the floor!
+  \end{itemize}  
+  
+\end{frame}
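+
+\begin{frame}[fragile]\frametitle{Sketch: the SVP64 loop as plain C}
+
+  A minimal C model (not actual SVP64 code) of the looping construct on the
+  previous slide: the hardware loop over VL elements behaves like an ordinary
+  for-loop over consecutive GPRs.  The function name and the array view of
+  the register file are illustrative only.
+
+  \begin{verbatim}
+  #include <stdint.h>
+
+  /* conceptual model of a Vectorised add: for i in 0..VL-1,
+     GPR(RT+i) = GPR(RA+i) + GPR(RB+i)  (no carry chaining yet) */
+  void sv_add_model(uint64_t rt[], const uint64_t ra[],
+                    const uint64_t rb[], int vl) {
+      for (int i = 0; i < vl; i++)
+          rt[i] = ra[i] + rb[i];
+  }
+  \end{verbatim}
+
+\end{frame}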
+
+\begin{frame}[fragile]\frametitle{Turning add-with-carry into Vector-Add}
+
+ \begin{itemize}
+   \item Add-with-Carry is the building-block of larger operations
+   \item Let's simply chain them together.
+   \item sv.adde (Add-with-Carry within a Vector loop) creates such chains\\
+               (a C model of this chain follows on the next slide)
+  \end{itemize}
+  
+  \begin{verbatim}
+        R0,CA = A0+B0+CA  adde r0,a0,b0
+            |
+            +----------+
+                       |
+        R1,CA = A1+B1+CA  adde r1,a1,b1
+            |
+            +----------+
+                       |
+        R2,CA = A2+B2+CA  adde r2,a2,b2
+  \end{verbatim}
+  
+\end{frame}
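+
+\begin{frame}[fragile]\frametitle{Sketch: the sv.adde chain as plain C}
+
+  A minimal C model (not actual SVP64 code) of the chain on the previous
+  slide: an n-word big-integer add in which the carry is passed from one
+  element to the next, exactly as CA is chained between the adde operations.
+  Function and variable names are illustrative only.
+
+  \begin{verbatim}
+  #include <stdint.h>
+
+  void bigadd_model(uint64_t r[], const uint64_t a[],
+                    const uint64_t b[], int n) {
+      uint64_t ca = 0;                   /* carry-in to element 0 */
+      for (int i = 0; i < n; i++) {
+          uint64_t sum = a[i] + b[i];
+          r[i] = sum + ca;
+          /* carry-out of either addition, chained to next element */
+          ca = (sum < a[i]) | (r[i] < sum);
+      }
+  }
+  \end{verbatim}
+
+\end{frame}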
+
+\begin{frame}[fragile]\frametitle{Vector-Scalar Shift}
+
+ \begin{itemize}
+   \item Shift by 64-bit is just ``pick a register''
+   \item Add a 2nd input register with what needs to be shifted IN\\
+         (64-bit carry in)
+   \item Add a 2nd output register saving what normally gets thrown away\\
+               (64-bit carry-out)
+   \item Again: a chain of these performs Vector-by-Scalar shift
+  \end{itemize}
+  
+  \begin{verbatim}
+  /* shift n-word big integer un[] right by s bits (0 < s < 64) */
+  void brs(uint64_t s, uint64_t r[], uint64_t un[], int n) {
+      for (int i = 0; i < n - 1; i++)
+          r[i] = (un[i] >> s) | (un[i + 1] << (64 - s));
+      r[n - 1] = un[n - 1] >> s;
+  }
+  \end{verbatim}
+  
+\end{frame}
+
+\begin{frame}[fragile]\frametitle{Vector-Scalar Multiply}
+
+ \begin{itemize}
+   \item Normally in FMAC the top 64 bits are thrown away.
+   \item What if we stored those 64-bits in a 2nd register?\\
+               (64-bit carry-out)
+   \item And what if the next FMAC added that "digit" on?\\
+               (64-bit carry-in)
+   \item Again: a chain of these performs Vector-by-Scalar Multiply\\
+               (C model on the next slide)
+  \end{itemize}
+  
+  \begin{verbatim}
+      RT0, RC0 = RA0 * RB0 + 0
+            |
+            +----------------+
+                             |
+      RT1, RC1 = RA1 * RB1 + RC0
+            |
+            +----------------+
+                             |
+      RT2, RC2 = RA2 * RB2 + RC1
+  \end{verbatim}
+  
+\end{frame}
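+
+\begin{frame}[fragile]\frametitle{Sketch: the multiply chain as plain C}
+
+  A minimal C model (not actual SVP64 code) of the FMAC chain on the previous
+  slide: each 64x64 multiply produces a 128-bit result whose upper half
+  becomes the 64-bit carry-in of the next element.  The GCC/Clang
+  \verb|unsigned __int128| extension is used purely to model the wide
+  product; function and variable names are illustrative only.
+
+  \begin{verbatim}
+  #include <stdint.h>
+
+  /* multiply n-word big integer a[] (least-significant word
+     first) by 64-bit scalar b; returns the final carry word */
+  uint64_t bigmul_model(uint64_t r[], const uint64_t a[],
+                        uint64_t b, int n) {
+      uint64_t carry = 0;                /* RC0 starts at zero */
+      for (int i = 0; i < n; i++) {
+          unsigned __int128 p = (unsigned __int128)a[i] * b + carry;
+          r[i]  = (uint64_t)p;           /* RT: low 64 bits  */
+          carry = (uint64_t)(p >> 64);   /* RC: high 64 bits */
+      }
+      return carry;
+  }
+  \end{verbatim}
+
+\end{frame}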
+
+\begin{frame}[fragile]\frametitle{Vector-Scalar Divide}
+
+ \begin{itemize}
+   \item Same story: a special case is needed for overflow\\
+               (C model on the next slide)
+  \end{itemize}
+  
+  \begin{verbatim}
+      RT0      = ((  0<<64) | RA0) / RB0
+           RC0 = ((  0<<64) | RA0) % RB0
+            |
+            +-------+
+                    |
+      RT1      = ((RC0<<64) | RA1) / RB1
+           RC1 = ((RC0<<64) | RA1) % RB1
+            |
+            +-------+
+                    |
+      RT2      = ((RC1<<64) | RA2) / RB2
+           RC2 = ((RC1<<64) | RA2) % RB2
+  \end{verbatim}
+  
+\end{frame}
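+
+\begin{frame}[fragile]\frametitle{Sketch: the divide chain as plain C}
+
+  A minimal C model (not actual SVP64 code) of the divide chain on the
+  previous slide: the remainder of each step becomes the upper 64 bits of the
+  next 128-bit partial dividend.  Matching the chain, element 0 here is the
+  most-significant word; \texttt{b} must be non-zero (the hardware
+  instruction additionally needs the overflow special-case when the incoming
+  remainder is not smaller than the divisor).  Names are illustrative only.
+
+  \begin{verbatim}
+  #include <stdint.h>
+
+  /* divide n-word big integer a[] (most-significant word first)
+     by 64-bit scalar b; returns the final remainder */
+  uint64_t bigdiv_model(uint64_t q[], const uint64_t a[],
+                        uint64_t b, int n) {
+      uint64_t rem = 0;                  /* RC starts at zero  */
+      for (int i = 0; i < n; i++) {
+          unsigned __int128 d =
+              ((unsigned __int128)rem << 64) | a[i];
+          q[i] = (uint64_t)(d / b);      /* RT: quotient word  */
+          rem  = (uint64_t)(d % b);      /* RC: remainder      */
+      }
+      return rem;
+  }
+  \end{verbatim}
+
+\end{frame}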
+
+\frame{\frametitle{Summary so far}
+
+ \begin{itemize}
+   \item Extending the usual 1-bit Carry-in Carry-out to 64-bit and
+                adding a loop-construct inherently turns Scalar operations
+                into arbitrary-length Vectorised ones
+   \item Irony: 30 years ago the Power ISA actually had a ``Carry SPR'', where
+                the normally-discarded upper half of a multiply would be placed
+                in that SPR (it was later deprecated).
+   \item Hardware is NOT made more complex: in other ISAs the shift, multiply
+                and divide operations discard these bits anyway, which is why
+                you end up with complex carry workarounds. This gives ISAs a
+                ``bad rep'' for doing Big-Integer work.
+   \item The ``complication'' is that you need 3-in 2-out instructions,
+                but in Micro-code you can do operand-forwarding:
+                1st op: 3-in 1-out. Chain: 2-in 1-out. Last: 2-in 2-out.
+  \end{itemize}
+}
+
+\frame{\frametitle{OpenTITAN}
+
+ \begin{itemize}
+   \item https://opentitan.org/book/hw/ip/otbn/index.html
+   \item 256b wide data path with 32 256b wide registers
+   \item Zero-Overhead Loop Control would have been better\\
+   https://ieeexplore.ieee.org/abstract/document/1692906/
+   \item Formal verification completion time is a function of the operation
+               bit-width: at 256-bit it is unlikely to complete in reasonable time.
+   \item 256-bit is great for Curve25519, but for RSA (etc.) you run
+               into exactly the same problem as a Scalar ISA, made worse.
+   \item Opportunities to optimise the algorithms are also lost.
+  \end{itemize}
+}
+
+\frame{\frametitle{Conclusion}
+
+ \begin{itemize}
+   \item We went back to the algorithms (Knuth D and M) and examined
+                what they are trying to achieve.
+   \item It turns out that they need a 64-bit carry-in and carry-out
+   \item Keeping the hardware to a 64-bit maximum means Formal Proofs complete
+                in reasonable time (less than the heat-death of the universe)
+   \item Reasonably straightforward: creates and uses partial results that are
+                normally thrown away (and would otherwise need extra instructions)
+   \item Freaks out pure-RISC proponents (3-in 2-out) but look at the
+                number of instructions (and temporary registers) needed
+                otherwise, and the overall algorithm efficiency, and the
+                case for these instructions is clear.
+       \item They also speed up \textbf{general-purpose} code
+  \end{itemize}
+}
+
+\frame{
+  \begin{center}
+    {\Huge The end\\
+                  Thank you\\
+                  Questions?\\\vspace{5pt}
+       }
+  \end{center}
+  
+  \begin{itemize}
+       \item https://redsemiconductor.com
+       \item Discussion: http://lists.libre-soc.org
+       \item Libera.Chat IRC \#libre-soc
+       \item http://libre-soc.org/
+       \item http://nlnet.nl/assure
+       \item https://libre-soc.org/nlnet/\#faq
+  \end{itemize}
+}
+
+
+\end{document}