Bug 1244: Added description for pospopcount
[libreriscv.git] / conferences / fosdem2024 / fosdem2024_ddffirst / fosdem2024_ddffirst.tex
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0f42add4bccd30b910b6cb491797bae58181fa55 100644 (file)
@@ -0,0 +1,285 @@
+\documentclass[slidestop]{beamer}
+\usepackage{beamerthemesplit}
+\usepackage{graphicx}
+\usepackage{pstricks}
+\usepackage{pgffor}
+\usepackage{listings}
+
+\graphicspath{{./}}
+
+\title{Data-Dependent-Fail-First}
+\author{Luke Kenneth Casson Leighton and Shriya Sharma}
+
+
+\begin{document}
+
+\frame{
+   \begin{center}
+    \huge{The Libre-SOC Hybrid 3D CPU}\\
+    \vspace{32pt}
+    \Large{Data-Dependent-Fail-First}\\
+
+    \vspace{24pt}
+    \Large{FOSDEM2024}\\
+    \vspace{16pt}
+    \large{Sponsored by NLnet's PET Programme}\\
+    \vspace{6pt}
+    \large{\today}
+  \end{center}
+}
+
+
+\frame{\frametitle{Why another SoC?}
+
+ \begin{itemize}
+   \item Intel Management Engine, Apple QA issues, Spectre\vspace{6pt}
+   \item Endless proprietary drivers, ``simplest'' solution: \\
+         License proprietary hard macros (with proprietary firmware)\\
+                Adversely affects product development cost\\
+               due to opaque driver bugs (Samsung S3C6410 / S5P100)
+                \vspace{6pt}
+   \item Alternative: Intel and Valve-Steam collaboration\\
+         ``Most productive business meeting ever!''\\
+         https://tinyurl.com/valve-steam-intel
+               \vspace{6pt}
+   \item Because for 30 years I Always Wanted To Design A CPU
+               \vspace{6pt}
+   \item Ultimately it is a strategic \textit{business} objective to
+         develop entirely Libre hardware, firmware and drivers.
+  \end{itemize}
+}
+
+
+
+\frame{\frametitle{How can you help?}
+
+\vspace{5pt}
+
+ \begin{itemize}
+   \item Start here! https://libre-soc.org \\
+            Mailing lists https://lists.libre-soc.org \\
+            IRC Libera.Chat \#libre-soc \\
+            etc. etc. (it's a Libre project, go figure) \\
+                  \vspace{3pt}
+   \item Can I get paid? Yes!  NLnet funded\\
+                See https://libre-soc.org/nlnet/\#faq \\
+                \vspace{3pt}
+   \item Also profit-sharing in any commercial ventures \\
+            \vspace{3pt}
+   \item How many opportunities to develop Libre SoCs exist,\\
+            and actually get paid for it?
+                    \vspace{3pt}
+   \item I'm not a developer, how can I help?\\
+               - Plenty of research needed, artwork, website \\
+               - Help find customers and OEMs willing to commit (LOI)
+  \end{itemize}
+}
+
+
+
+\frame{\frametitle{What goes into a typical SoC?}
+\vspace{9pt}
+ \begin{itemize}
+   \item 15 to 20mm BGA package: 2.5 to 5 watt power consumption\\
+               heat sink normally not required (simplifies overall design)
+               \vspace{3pt}
+   \item Fully-integrated peripherals (not Northbridge/Southbridge)\\
+         USB, HDMI, RGB/TTL, SD/MMC, I2C, UART, SPI, GPIO etc. etc. 
+         \vspace{3pt}
+   \item Built-in GPU (shared memory bus, 3rd party licensed) \vspace{3pt}
+   \item Built-in VPU (likewise, proprietary)\vspace{3pt}
+   \item Target price between \$2.50 and \$30 depending on market\\
+         Radically different from IBM POWER9 Core (200 Watt)
+         \vspace{3pt}
+   \item We're doing the same, just with a hybrid architecture.\\
+                CPU == GPU == VPU
+  \end{itemize}
+}
+
+
+
+\begin{frame}[fragile]
+\frametitle{Simple-V CMPI in a nutshell}
+
+\begin{semiverbatim}
+function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
+  (assuming you know power-isa)
+  int i, id=0, ira=0;
+  for (i = 0; i < VL; i++)
+    CR[BA+id] <= compare(ireg[RA+ira], SI);
+    if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
+    if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
+\end{semiverbatim}
+
+  \begin{itemize}
+   \item Above is oversimplified: predication etc. left out
+   \item Scalar-scalar and scalar-vector and vector-vector now all in one
+   \item OoO may choose to push CMPIs into instr. queue (v. busy!)
+  \end{itemize}
+\end{frame}
+
+
+\frame{\frametitle{Load/Store Fault-First}
+       
+       \begin{itemize}
+               \item Problem: vector load and store can cause a page fault
+               \item Solution: a protocol that allows optional load/store
+               \item instruction \textit{requests} a number of elements
+               \item instruction \textit{informs} the number actually loaded
+               \item first element load/store is not optional (cannot fail)
+        \item ARM SVE: https://arxiv.org/pdf/1803.06185.pdf
+        \item more: wikipedia Vector processor page: Fault/Fail First
+        \vspace{10pt}
+               \item Load/Store is Memory to/from Register, what about
+              Register to Register?
+        \item Register-to-register: ``Data-Dependent Fail-First.''
+        \item Z80 LDIR: Mem-Register, CPIR: Register-Register
+       \end{itemize}
+}
+
+\begin{frame}[fragile]
+       \frametitle{Data-Dependent-Fail-First in a nutshell}
+       
+       \begin{semiverbatim}
+function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
+int i, id=0, ira=0;
+for (i = 0; i < VL; i++)
+    CR[BA+id] <= compare(ireg[RA+ira], SI);
+    if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \}
+    if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
+    if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
+       \end{semiverbatim}
+       
+       \begin{itemize}
+               \item Parallelism still perfectly possible
+                     (``hold'' writing results until sequential post-analysis
+                      carried out. Best done with OoO)
+               \item VL truncation can be inclusive or exclusive
+                     (include or exclude a NULL pointer or a
+                     string-end character, or overflow result)
+               \item \textit{Truncation can be to zero Vector Length}
+       \end{itemize}
+\end{frame}
+
+\frame{\frametitle{Power ISA v3.1 vstribr}
+       
+       \lstinputlisting[language={}]{vstribr.txt}
+       
+       \begin{itemize}
+               \item ironically this hard-coded instruction is
+               identical to general-purpose Simple-V DD-FFirst...
+       \end{itemize}
+       
+}
+
+\frame{\frametitle{maxloc}
+  \begin{itemize}
+               \item TODO
+  \end{itemize}
+}
+
+\frame{\frametitle{Pospopcount}
+       
+  \begin{itemize}
+       \item   Positional popcount counts, for each bit-position, how many values in an array of inputs have that bit set to 1.
+       \item   Notoriously difficult to do in SIMD assembler: typically 550 lines
+
+   \end{itemize}
+       
+       \lstinputlisting[language={}]{pospopcount.c}
+       
+}
+
+\frame{\frametitle{Pospopcount}
+       
+       \begin{center}
+               \includegraphics[width=0.5\textwidth]{pospopcount.png}
+       \end{center}
+       
+}
+
+\frame{\frametitle{Pospopcount}
+       
+       \begin{center}
+               \includegraphics[width=0.5\textwidth]{array_popcnt.png}
+       \end{center}
+
+  \begin{itemize}
+               \item   Part of the challenge is therefore to perform an appropriate transpose of the data,
+                               in blocks that suit the processor and the ISA capacity.
+               \item   The gbbd instruction is used for implementing the transpose function, 
+                               preparing the data for using the standard popcount instruction.
+
+       
+       \end{itemize}
+       
+}
+
+\frame{\frametitle{Pospopcount.s}
+
+
+\lstinputlisting[language={}]{pospopcount.s}
+
+}
+
+
+\frame{\frametitle{strncpy}
+
+       \lstinputlisting[language={}]{strncpy.c}
+  \begin{itemize}
+       \item TODO
+ \end{itemize} 
+}
+
+
+
+\frame{\frametitle{strncpy assembler}
+
+\lstinputlisting[language={}]{strncpy.s}
+
+}
+
+\frame{\frametitle{linked-list walking}        
+  \begin{itemize}
+       \item TODO
+ \end{itemize}
+}
+\frame{\frametitle{Summary}
+
+ \begin{itemize}
+   \item Goal is to create a mass-volume low-power embedded SoC suitable
+         for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
+   \item No way we could implement a project of this magnitude without
+         nmigen (being able to use python OO to  HDL)
+   \item Collaboration with OpenPOWER Foundation and Members absolutely
+         essential. No short-cuts.  Standards to be developed and ratified
+         so that everyone benefits.
+   \item Riding the wave of huge stability of OpenPOWER ecosystem
+   \item Greatly simplified open 3D and Video drivers reduces product
+         development costs for customers
+   \item It also happens to be fascinating, deeply rewarding, technically
+         challenging, and funded by NLnet
+         
+  \end{itemize}
+}
+
+
+\frame{
+  \begin{center}
+    {\Huge The end\vspace{12pt}\\
+                  Thank you\vspace{12pt}\\
+                  Questions?\vspace{12pt}
+       }
+  \end{center}
+  
+  \begin{itemize}
+       \item Discussion: http://lists.libre-soc.org
+       \item Libera.Chat IRC \#libre-soc
+       \item http://libre-soc.org/
+       \item http://nlnet.nl/PET
+       \item https://libre-soc.org/nlnet/\#faq
+  \end{itemize}
+}
+
+
+\end{document}