bug 1244: add assembler and python maxloc listing to slides
[libreriscv.git] / conferences / fosdem2024 / fosdem2024_ddffirst / fosdem2024_ddffirst.tex
index 06301e8998b12505898bd0677ce316c5830e362e..fdbc83369f330f9c73983aa6705a336a584d619f 100644 (file)
 
 \frame{
    \begin{center}
-    \huge{The Libre-SOC Hybrid 3D CPU}\\
-    \vspace{32pt}
-    \Large{Data-Dependent-Fail-First}\\
+    \huge{Libre-SOC Simple-V Specification \\
+                       Advanced features}\\
+    \vspace{24pt}
+    \Large{Data-Dependent Fail-First}\\
 
     \vspace{24pt}
     \Large{FOSDEM2024}\\
     \vspace{16pt}
-    \large{Sponsored by NLnet's PET Programme}\\
+    \large{Funded by NLnet NGI-ASSURE \\
+       EU grant agreement No 957073}\\
     \vspace{6pt}
     \large{\today}
   \end{center}
 }
 
-
-\frame{\frametitle{Why another SoC?}
-
- \begin{itemize}
-   \item Intel Management Engine, Apple QA issues, Spectre\vspace{6pt}
-   \item Endless proprietary drivers, "simplest" solution: \\
-         License proprietary hard macros (with proprietary firmware)\\
-                Adversely affects product development cost\\
-               due to opaque driver bugs (Samsung S3C6410 / S5P100)
-                \vspace{6pt}
-   \item Alternative: Intel and Valve-Steam collaboration\\
-         "Most productive business meeting ever!"\\
-         https://tinyurl.com/valve-steam-intel
-               \vspace{6pt}
-   \item Because for 30 years I Always Wanted To Design A CPU
-               \vspace{6pt}
-   \item Ultimately it is a strategic \textit{business} objective to
-         develop entirely Libre hardware, firmware and drivers.
-  \end{itemize}
-}
-
-
-
-\frame{\frametitle{How can you help?}
-
-\vspace{5pt}
-
- \begin{itemize}
-   \item Start here! https://libre-soc.org \\
-            Mailing lists https://lists.libre-soc.org \\
-            IRC Freenode libre-soc \\
-            etc. etc. (it's a Libre project, go figure) \\
-                  \vspace{3pt}
-   \item Can I get paid? Yes!  NLnet funded\\
-                See https://libre-soc.org/nlnet/\#faq \\
-                \vspace{3pt}
-   \item Also profit-sharing in any commercial ventures \\
-            \vspace{3pt}
-   \item How many opportunities to develop Libre SoCs exist,\\
-            and actually get paid for it?
-                    \vspace{3pt}
-   \item I'm not a developer, how can I help?\\
-               - Plenty of research needed, artwork, website \\
-               - Help find customers and OEMs willing to commit (LOI)
-  \end{itemize}
-}
-
-
-
-\frame{\frametitle{What goes into a typical SoC?}
-\vspace{9pt}
- \begin{itemize}
-   \item 15 to 20mm BGA package: 2.5 to 5 watt power consumption\\
-               heat sink normally not required (simplifies overall design)
-               \vspace{3pt}
-   \item Fully-integrated peripherals (not Northbridge/Southbridge)\\
-         USB, HDMI, RGB/TTL, SD/MMC, I2C, UART, SPI, GPIO etc. etc.
-         \vspace{3pt}
-   \item Built-in GPU (shared memory bus, 3rd party licensed) \vspace{3pt}
-   \item Built-in VPU (likewise, proprietary)\vspace{3pt}
-   \item Target price between \$2.50 and \$30 depending on market\\
-         Radically different from IBM POWER9 Core (200 Watt)
-         \vspace{3pt}
-   \item We're doing the same, just with a hybrid architecture.\\
-                CPU == GPU == VPU
-  \end{itemize}
-}
-
-
-
 \begin{frame}[fragile]
 \frametitle{Simple-V CMPI in a nutshell}
 
@@ -120,27 +52,27 @@ function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
 
 
 \frame{\frametitle{Load/Store Fault-First}
-       
-       \begin{itemize}
-               \item Problem: vector load and store can cause a page fault
-               \item Solution: a protocol that allows optional load/store
-               \item instruction \textit{requests} a number of elements
-               \item instruction \textit{informs} the number actually loaded
-               \item first element load/store is not optional (cannot fail)
+
+    \begin{itemize}
+        \item Problem: vector load and store can cause a page fault
+        \item Solution: a protocol that allows optional load/store
+        \item instruction \textit{requests} a number of elements
+        \item instruction \textit{informs} the number actually loaded
+        \item first element load/store is not optional (cannot fail)
         \item ARM SVE: https://arxiv.org/pdf/1803.06185.pdf
         \item more: wikipedia Vector processor page: Fault/Fail First
         \vspace{10pt}
-               \item Load/Store is Memory to/from Register, what about
+        \item Load/Store is Memory to/from Register, what about
               Register to Register?
         \item Register-to-register: "Data-Dependent Fail-First."
         \item Z80 LDIR: Mem-Register, CPIR: Register-Register
-       \end{itemize}
+    \end{itemize}
 }
 
 \begin{frame}[fragile]
-       \frametitle{Data-Dependent-Fail-First in a nutshell}
-       
-       \begin{semiverbatim}
+    \frametitle{Data-Dependent-Fail-First in a nutshell}
+
+    \begin{semiverbatim}
 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
 int i, id=0, ira=0;
 for (i = 0; i < VL; i++)
@@ -148,93 +80,102 @@ for (i = 0; i < VL; i++)
     if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
     if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
     if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \}
-       \end{semiverbatim}
-       
-       \begin{itemize}
-               \item Parallelism still perfectly possible
-                     ("hold" writing results until sequential post-analysis
-                      carried out. Best done with OoO)
-               \item VL truncation can be inclusive or exclusive
-                     (include or exclude a NULL pointer or a
-                     string-end character, or overflow result)
-               \item \textit{Truncation can be to zero Vector Length}
-       \end{itemize}
+    \end{semiverbatim}
+
+    \begin{itemize}
+        \item Parallelism still perfectly possible
+              ("hold" writing results until sequential post-analysis
+               carried out. Best done with OoO)
+        \item VL truncation can be inclusive or exclusive
+              (include or exclude a NULL pointer or a
+              string-end character, or overflow result)
+        \item \textit{Truncation can be to zero Vector Length}
+    \end{itemize}
 \end{frame}
 
 \frame{\frametitle{Power ISA v3.1 vstribr}
-       
-       \lstinputlisting[language={}]{vstribr.txt}
-       
-       \begin{itemize}
-               \item ironically this hard-coded instruction is
-               identical to general-purpose Simple-V DD-FFirst...
-       \end{itemize}
-       
-}Po
 
-\frame{\frametitle{maxloc}
-  \begin{itemize}
-               \item "TODO
-  \end{itemize}
+    \lstinputlisting[language={}]{vstribr.txt}
+
+    \begin{itemize}
+        \item ironically this hard-coded instruction is
+        identical to general-purpose Simple-V DD-FFirst...
+    \end{itemize}
+
 }
 
 \frame{\frametitle{Pospopcount}
-       
+
   \begin{itemize}
-       \item   Positional popcount adds up the totals of each bit set to 1 in each bit-position, of an array of input values.
-       \item   Notoriously difficult to do in SIMD assembler: typically 550 lines
+    \item     Positional popcount adds up the totals of each bit set to 1 in each bit-position, of an array of input values.
+    \item   Notoriously difficult to do in SIMD assembler: typically 550 lines
     \item https://github.com/clausecker/pospop
 
    \end{itemize}
-       
-       \lstinputlisting[language={}]{pospopcount.c}
 
-       
+    \lstinputlisting[language={}]{pospopcount.c}
+
+
 }
 
 \frame{\frametitle{Pospopcount}
-       
-       \begin{center}
-               \includegraphics[width=0.5\textwidth]{pospopcount.png}
-       \end{center}
-         \begin{itemize}
-               \item   The challenge is to perform an appropriate transpose of the data (the CPU can only work on registers, horizontally),
-               in blocks that suit the processor and the ISA capacity.
 
-               
-       \end{itemize}
+    \begin{center}
+        \includegraphics[width=0.5\textwidth]{pospopcount.png}
+    \end{center}
+      \begin{itemize}
+        \item   The challenge is to perform an appropriate transpose of the data (the CPU can only work on registers, horizontally),
+        in blocks that suit the processor and the ISA capacity.
+
+
+    \end{itemize}
 }
 
 \frame{\frametitle{Pospopcount}
-       
-       \begin{center}
-               \includegraphics[width=0.6\textwidth]{array_popcnt.png}
-       \end{center}
+
+    \begin{center}
+        \includegraphics[width=0.6\textwidth]{array_popcnt.png}
+    \end{center}
 
   \begin{itemize}
 
-               \item   The draft gbbd instruction implements the transpose (shown above),
-                               preparing the data to use standard popcount.
-                          (gbbd is based on Power ISA vgbbd, v3.1 p445)
-       
-       \end{itemize}
-       
+        \item     The draft gbbd instruction implements the transpose (shown above),
+                preparing the data to use standard popcount.
+               (gbbd is based on Power ISA vgbbd, v3.1 p445)
+
+    \end{itemize}
+
 }
 
-\frame{\frametitle{Pospopcount.s}
+\frame{\frametitle{pospopcount assembler}
 
 
 \lstinputlisting[language={}]{pospopcount.s}
 
 }
 
-
 \frame{\frametitle{strncpy}
 
-       \lstinputlisting[language={}]{strncpy.c}
+    \lstinputlisting[language={}]{strncpy.c}
   \begin{itemize}
-       \item "TODO
- \end{itemize} 
+    \item two simple-looking for-loops,
+          data-dependent in the first.
+    \item sv.cmpi stops at the first zero, /vli includes the zero
+          in VL.
+    \item note the post-increment Load/Store: saves       
+          pre-decrementing
+    \item a Vector of CRs is produced which then gets tested
+          by the sv.bc/all instruction, counting down CTR
+          per item tested.
+     \item Power ISA added hard-coded data-dependent capacity
+           into vstribr, whereas in SVP64 it is generic (applies
+           to any instruction)
+     \item even the null-ing part is not straightforward as
+          it could be mis-aligned compared to the VSX width.
+     \item end-result: assembler-optimised strncpy on Power
+          ISA v3.0 is a whopping 240 instructions. SVP64 is 10
+          and parallel in HW
+ \end{itemize}
 }
 
 
@@ -245,45 +186,91 @@ for (i = 0; i < VL; i++)
 
 }
 
-\frame{\frametitle{linked-list walking}        
+\frame{\frametitle{sv.lbz/ff=RC1/vli *16,1(10)}
+    \begin{center}
+        \includegraphics[width=0.6\textwidth]{lbz_ff_vli.png}
+    \end{center}
+
   \begin{itemize}
-       \item "TODO
+    \item r10 points to memory address 0x001007
+    \item sv.lbz (Power ISA load byte immediate) multiplies immediate
+          offset by element step index, to get Effective Address (EA)
+    \item LD/ST has no Rc=1 so Data-Dependent Fail-First specified
+          as "ff=RC1". Not LD/ST Fault First! vli: VL inclusive
+    \item Test done after each load. Fails at Memory contents
+          0x001009. Inclusive Mode: VL is truncated to 5 (FIVE) not 4
  \end{itemize}
 }
+
+\frame{\frametitle{linked-list walking}
+
+  \begin{itemize}
+    \item TODO
+ \end{itemize}
+}
+
+\frame{\frametitle{sv.ld/ff=RC1/vli *17, 8(*16)}
+       
+       \begin{center}
+               \includegraphics[width=1.0\textwidth]{linked_list_dd.png}
+       \end{center}
+}
+
+\frame{\frametitle{maxloc}
+    \lstinputlisting[language={}]{maxloc.py}
+
+       \begin{itemize}
+               \item TODO
+       \end{itemize}
+}
+
+\frame{\frametitle{maxloc assembler}
+       
+       \lstinputlisting[language={}]{maxloc.s}
+       
+}
+
 \frame{\frametitle{Summary}
 
  \begin{itemize}
-   \item Goal is to create a mass-volume low-power embedded SoC suitable
-         for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
-   \item No way we could implement a project of this magnitude without
-         nmigen (being able to use python OO to  HDL)
-   \item Collaboration with OpenPOWER Foundation and Members absolutely
-         essential. No short-cuts.  Standards to be developed and ratified
-         so that everyone benefits.
-   \item Riding the wave of huge stability of OpenPOWER ecosystem
-   \item Greatly simplified open 3D and Video drivers reduces product
-         development costs for customers
-   \item It also happens to be fascinating, deeply rewarding technically
-         challenging, and funded by NLnet
-
+   \item SIMD fundamentally assumes element independence.
+   \item No provision in SIMD ISAs or Architectures for
+         inter-element inter-dependence, let alone sequential
+         inter-dependence.
+   \item Simple-V adds features such as Data-Dependent
+         Fail-First as \textit{general concepts},
+         exploiting Condition Registers (Vectorised)
+   \item Hardware Parallelism is \textit{still possible}
+         by exploiting the standard capabilities of
+         Speculative Execution: produce results, hold
+         off writing, post-analyse and cancel the results
+         that should not be written. Uses \textit{existing}
+         standard OoO Micro-architecture
+   \item Huge simplification of algorithms, huge "compactification"
+         just like Zilog Z80 and Intel 8086, yet still parallel
+   \item compact deep-expressive assembler brings CISC
+         capability but RISC-RISC (Prefix-Suffix). SIMD remains
+         at the \textit{back-end in hardware} where it belongs.
+         Not exposed to the programmer.
   \end{itemize}
 }
 
-
 \frame{
   \begin{center}
     {\Huge The end\vspace{12pt}\\
-                  Thank you\vspace{12pt}\\
-                  Questions?\vspace{12pt}
-       }
+           Thank you\vspace{12pt}
+    }
   \end{center}
 
   \begin{itemize}
-       \item Discussion: http://lists.libre-soc.org
-       \item Freenode IRC \#libre-soc
-       \item http://libre-soc.org/
-       \item http://nlnet.nl/PET
-       \item https://libre-soc.org/nlnet/\#faq
+    \item Discussion: http://lists.libre-soc.org
+    \item OFTC.net IRC \#libre-soc
+    \item http://libre-soc.org/
+       \item https://nlnet.nl/project/Libre-SOC-OpenPOWER-ISA
+\item https://bugs.libre-soc.org/show\_bug.cgi?id=676
+\item https://bugs.libre-soc.org/show\_bug.cgi?id=1244
+\item https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
+    \item https://libre-soc.org/nlnet/\#faq
   \end{itemize}
 }