conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex

   1 \documentclass[slidestop]{beamer}
   2 \usepackage{beamerthemesplit}
   3 \usepackage{graphics}
   4 \usepackage{pstricks}
   5 \usepackage{pgffor}
   6 \usepackage{listings}
   7
   8 \graphicspath{{./}}
   9
  10 \title{Data-Dependent-Fail-First}
  11 \author{Luke Kenneth Casson Leighton and Shriya Sharma}
  12
  13
  14 \begin{document}
  15
  16 \frame{
  17    \begin{center}
  18     \huge{Libre-SOC Simple-V Specification \\
  19                         Advanced features}\\
  20     \vspace{24pt}
  21     \Large{Data-Dependent Fail-First}\\
  22
  23     \vspace{24pt}
  24     \Large{FOSDEM2024}\\
  25     \vspace{16pt}
  26     \large{Funded by NLnet NGI-ASSURE \\
  27         EU grant agreement No 957073}\\
  28     \vspace{6pt}
  29     \large{\today}
  30   \end{center}
  31 }
  32
  33 \begin{frame}[fragile]
  34 \frametitle{Simple-V CMPI in a nutshell}
  35
  36 \begin{semiverbatim}
  37 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
  38   (assuming you know power-isa)
  39   int i, id=0, ira=0;
  40   for (i = 0; i < VL; i++)
  41     CR[BA+id] <= compare(ireg[RA+ira], SI);
  42     if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
  43     if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
  44 \end{semiverbatim}
  45
  46   \begin{itemize}
  47    \item Above is oversimplified: predication etc. left out
  48    \item Scalar-scalar and scalar-vector and vector-vector now all in one
  49    \item OoO may choose to push CMPIs into instr. queue (v. busy!)
  50   \end{itemize}
  51 \end{frame}
  52
  53
  54 \frame{\frametitle{Load/Store Fault-First}
  55
  56     \begin{itemize}
  57         \item Problem: vector load and store can cause a page fault
  58         \item Solution: a protocol that allows optional load/store
  59         \item instruction \textit{requests} a number of elements
  60         \item instruction \textit{informs} the number actually loaded
  61         \item first element load/store is not optional (cannot fail)
  62         \item ARM SVE: https://arxiv.org/pdf/1803.06185.pdf
  63         \item more: wikipedia Vector processor page: Fault/Fail First
  64         \vspace{10pt}
  65         \item Load/Store is Memory to/from Register, what about
  66               Register to Register?
  67         \item Register-to-register: "Data-Dependent Fail-First."
  68         \item Z80 LDIR: Mem-Register, CPIR: Register-Register
  69     \end{itemize}
  70 }
  71
  72 \begin{frame}[fragile]
  73     \frametitle{Data-Dependent-Fail-First in a nutshell}
  74
  75     \begin{semiverbatim}
  76 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
  77 int i, id=0, ira=0;
  78 for (i = 0; i < VL; i++)
  79     CR[BA+id] <= compare(ireg[RA+ira], SI);
  80     if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
  81     if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
  82     if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \}
  83     \end{semiverbatim}
  84
  85     \begin{itemize}
  86         \item Parallelism still perfectly possible
  87               ("hold" writing results until sequential post-analysis
  88                carried out. Best done with OoO)
  89         \item VL truncation can be inclusive or exclusive
  90               (include or exclude a NULL pointer or a
  91               string-end character, or overflow result)
  92         \item \textit{Truncation can be to zero Vector Length}
  93     \end{itemize}
  94 \end{frame}
  95
  96 \frame{\frametitle{Power ISA v3.1 vstribr}
  97
  98     \lstinputlisting[language={}]{vstribr.txt}
  99
 100     \begin{itemize}
 101         \item ironically this hard-coded instruction is
 102         identical to general-purpose Simple-V DD-FFirst...
 103     \end{itemize}
 104
 105 }
 106
 107 \frame{\frametitle{Pospopcount}
 108
 109   \begin{itemize}
 110     \item     Positional popcount adds up the totals of each bit set to 1 in each bit-position, of an array of input values.
 111     \item   Notoriously difficult to do in SIMD assembler: typically 550 lines
 112     \item https://github.com/clausecker/pospop - Full writeup: \\
 113           https://libre-soc.org/openpower/sv/cookbook/pospopcnt
 114
 115    \end{itemize}
 116
 117     \lstinputlisting[language={}]{pospopcount.c}
 118
 119
 120 }
 121
 122 \frame{\frametitle{Pospopcount}
 123
 124     \begin{center}
 125         \includegraphics[width=0.5\textwidth]{pospopcount.png}
 126     \end{center}
 127       \begin{itemize}
 128         \item   The challenge is to perform an appropriate transpose of the data (the CPU can only work on registers, horizontally),
 129         in blocks that suit the processor and the ISA capacity.
 130
 131
 132     \end{itemize}
 133 }
 134
 135 \frame{\frametitle{Pospopcount}
 136
 137     \begin{center}
 138         \includegraphics[width=0.6\textwidth]{array_popcnt.png}
 139     \end{center}
 140
 141   \begin{itemize}
 142
 143         \item     The draft gbbd instruction implements the transpose (shown above),
 144                 preparing the data to use standard popcount.
 145                (gbbd is based on Power ISA vgbbd, v3.1 p445)
 146
 147     \end{itemize}
 148
 149 }
 150
 151 \frame{\frametitle{pospopcount assembler}
 152
 153
 154 \lstinputlisting[language={}]{pospopcount.s}
 155
 156 }
 157
 158 \frame{\frametitle{strncpy}
 159
 160     \lstinputlisting[language={}]{strncpy.c}
 161   \begin{itemize}
 162     \item two simple-looking for-loops,
 163           data-dependent in the first.
 164     \item sv.cmpi stops at the first zero, /vli includes the zero
 165           in VL.
 166     \item note the post-increment Load/Store: saves
 167           pre-decrementing
 168     \item a Vector of CRs is produced which then get tested
 169            by the sv.bc/all instruction, counting down CTR
 170            per item tested.
 171      \item Power ISA added hard-coded data-dependent capacity
 172            into vstribr, where SVP64 it is generic (applies
 173            to any instruction)
 174      \item even the null-ing part is not straightforward as
 175           it could be mis-aligned compared to the VSX width.
 176      \item end-result: assembler-optimised  strncpy on Power
 177           ISA v3.0 is a whopping 240 instructions. SVP64 is 10
 178           and parallel in HW
 179  \end{itemize}
 180 }
 181
 182
 183
 184 \frame{\frametitle{strncpy assembler}
 185
 186 \lstinputlisting[language={}]{strncpy.s}
 187
 188 }
 189
 190 \frame{\frametitle{sv.lbz/ff=RC1/vli *16,1(10)}
 191     \begin{center}
 192         \includegraphics[width=0.6\textwidth]{lbz_ff_vli.png}
 193     \end{center}
 194
 195   \begin{itemize}
 196     \item r10 points to memory address 0x001007
 197     \item sv.lbz (Power ISA load byte immediate) multiplies immediate
 198           offset by element step index, to get Effective Address (EA)
 199     \item LD/ST has no Rc=1 so Data-Dependent Fail-First specified
 200           as "ff=RC1". Not LD/ST Fault First! vli: VL inclusive
 201     \item Test done after each load. Fails at Memory contents
 202           0x001009. Inclusive Mode: VL is truncated to 5 (FIVE) not 4
 203  \end{itemize}
 204 }
 205
 206 \frame{\frametitle{linked-list walking}
 207
 208   \begin{itemize}
 209     \item "TODO
 210  \end{itemize}
 211 }
 212
 213 \frame{\frametitle{sv.ld/ff=RC1/vli *17, 8(*16)}
 214
 215         \begin{center}
 216                 \includegraphics[width=1.0\textwidth]{linked_list_dd.png}
 217         \end{center}
 218 }
 219
 220 \frame{\frametitle{maxloc}
 221     \lstinputlisting[language={}]{maxloc.py}
 222
 223         \begin{itemize}
 224                 \item FORTRAN MAXLOC - find the index of largest number
 225                       notoriously difficult to optimally implement for SIMD
 226                 \item algorithms include \textit{depth-first} recursive
 227                       descent (!) mapreduce-style, offsetting the
 228                       locally-computed largest index (plus value) which
 229                       are then tested in upper level(s)
 230                 \item SVP64: note below the sv.cmp (first while-loop),
 231                 sv.minmax. (second while-loop) and the sv.crnand which
 232                 by Predicate masking is 3-in 1-out CR ops
 233                 not the usual 2-in 1-out
 234                 \item There is however quite a bit of "housekeeping".
 235                         Full analysis: \\
 236         https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
 237         \end{itemize}
 238 }
 239
 240 \frame{\frametitle{maxloc assembler}
 241
 242         \lstinputlisting[language={}]{maxloc.s}
 243 }
 244
 245 \frame{\frametitle{Summary}
 246
 247  \begin{itemize}
 248    \item SIMD fundamentally assumes element independence.
 249    \item No provision in SIMD ISAs or Architectures for
 250          inter-element inter-dependence, let alone sequential
 251          inter-dependence.
 252    \item Simple-V adds features such as Data-Dependent
 253          Fail-First as \textit{general concepts},
 254          exploiting Condition Registers (Vectorised)
 255    \item Hardware Parallelism is \textit{still possible}
 256          by exploiting the standard capabilities of
 257          Speculative Execution: produce results, hold
 258          off writing, post-analyse and cancel the results
 259          that should not be written. Uses \textit{existing}
 260          standard OoO Micro-architecture
 261    \item Huge simplification of algorithms, huge "compactification"
 262          just like Zilog Z80 and Intel 8086, yet still parallel
 263    \item compact deep-expressive assembler brings CISC
 264          capability but RISC-RISC (Prefix-Suffix). SIMD remains
 265          at the \textit{back-end in hardware} where it belongs.
 266          Not exposed at the programmer.
 267   \end{itemize}
 268 }
 269
 270 \frame{
 271   \begin{center}
 272     {\Huge The end\vspace{12pt}\\
 273            Thank you\vspace{12pt}
 274     }
 275   \end{center}
 276
 277   \begin{itemize}
 278     \item Discussion: http://lists.libre-soc.org
 279     \item OFTC.net IRC \#libre-soc
 280     \item http://libre-soc.org/
 281         \item https://nlnet.nl/project/Libre-SOC-OpenPOWER-ISA
 282 \item https://bugs.libre-soc.org/show\_bug.cgi?id=676
 283 \item https://bugs.libre-soc.org/show\_bug.cgi?id=1244
 284 \item https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
 285     \item https://libre-soc.org/nlnet/\#faq
 286   \end{itemize}
 287 }
 288
 289
 290 \end{document}