conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex

   1 \documentclass[slidestop]{beamer}
   2 \usepackage{beamerthemesplit}
   3 \usepackage{graphics}
   4 \usepackage{pstricks}
   5 \usepackage{pgffor}
   6 \usepackage{listings}
   7
   8 \graphicspath{{./}}
   9
  10 \title{Data-Dependent-Fail-First}
  11 \author{Luke Kenneth Casson Leighton and Shriya Sharma}
  12
  13
  14 \begin{document}
  15
  16 \frame{
  17    \begin{center}
  18     \huge{Libre-SOC Simple-V Specification \\
  19                         Advanced features}\\
  20     \vspace{24pt}
  21     \Large{Data-Dependent Fail-First}\\
  22
  23     \vspace{24pt}
  24     \Large{FOSDEM2024}\\
  25     \vspace{16pt}
  26     \large{Funded by NLnet NGI-ASSURE \\
  27         EU grant agreement No 957073}\\
  28     \vspace{6pt}
  29     \large{\today}
  30   \end{center}
  31 }
  32
  33 \begin{frame}[fragile]
  34 \frametitle{Simple-V CMPI in a nutshell}
  35
  36 \begin{semiverbatim}
  37 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
  38   (assuming you know power-isa)
  39   int i, id=0, ira=0;
  40   for (i = 0; i < VL; i++)
  41     CR[BA+id] <= compare(ireg[RA+ira], SI);
  42     if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
  43     if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
  44 \end{semiverbatim}
  45
  46   \begin{itemize}
  47    \item Above is oversimplified: predication etc. left out
  48    \item Scalar-scalar and scalar-vector and vector-vector now all in one
  49    \item OoO may choose to push CMPIs into instr. queue (v. busy!)
  50   \end{itemize}
  51 \end{frame}
  52
  53
  54 \frame{\frametitle{Load/Store Fault-First}
  55
  56     \begin{itemize}
  57         \item Problem: vector load and store can cause a page fault
  58         \item Solution: a protocol that allows optional load/store
  59         \item instruction \textit{requests} a number of elements
  60         \item instruction \textit{informs} the number actually loaded
  61         \item first element load/store is not optional (cannot fail)
  62         \item ARM SVE: https://arxiv.org/pdf/1803.06185.pdf
  63         \item more: wikipedia Vector processor page: Fault/Fail First
  64         \vspace{10pt}
  65         \item Load/Store is Memory to/from Register, what about
  66               Register to Register?
  67         \item Register-to-register: "Data-Dependent Fail-First."
  68         \item Z80 LDIR: Mem-Register, CPIR: Register-Register
  69     \end{itemize}
  70 }
  71
  72 \begin{frame}[fragile]
  73     \frametitle{Data-Dependent-Fail-First in a nutshell}
  74
  75     \begin{semiverbatim}
  76 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
  77 int i, id=0, ira=0;
  78 for (i = 0; i < VL; i++)
  79     CR[BA+id] <= compare(ireg[RA+ira], SI);
  80     if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
  81     if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
  82     if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \}
  83     \end{semiverbatim}
  84
  85     \begin{itemize}
  86         \item Parallelism still perfectly possible
  87               ("hold" writing results until sequential post-analysis
  88                carried out. Best done with OoO)
  89         \item VL truncation can be inclusive or exclusive
  90               (include or exclude a NULL pointer or a
  91               string-end character, or overflow result)
  92         \item \textit{Truncation can be to zero Vector Length}
  93     \end{itemize}
  94 \end{frame}
  95
  96 \frame{\frametitle{Power ISA v3.1 vstribr}
  97
  98     \lstinputlisting[language={}]{vstribr.txt}
  99
 100     \begin{itemize}
 101         \item ironically this hard-coded instruction is
 102         identical to general-purpose Simple-V DD-FFirst...
 103     \end{itemize}
 104
 105 }
 106
 107 \frame{\frametitle{Pospopcount}
 108
 109   \begin{itemize}
 110     \item     Positional popcount adds up the totals of each bit set to 1 in each bit-position, of an array of input values.
 111     \item   Notoriously difficult to do in SIMD assembler: typically 550 lines
 112     \item https://github.com/clausecker/pospop
 113
 114    \end{itemize}
 115
 116     \lstinputlisting[language={}]{pospopcount.c}
 117
 118
 119 }
 120
 121 \frame{\frametitle{Pospopcount}
 122
 123     \begin{center}
 124         \includegraphics[width=0.5\textwidth]{pospopcount.png}
 125     \end{center}
 126       \begin{itemize}
 127         \item   The challenge is to perform an appropriate transpose of the data (the CPU can only work on registers, horizontally),
 128         in blocks that suit the processor and the ISA capacity.
 129
 130
 131     \end{itemize}
 132 }
 133
 134 \frame{\frametitle{Pospopcount}
 135
 136     \begin{center}
 137         \includegraphics[width=0.6\textwidth]{array_popcnt.png}
 138     \end{center}
 139
 140   \begin{itemize}
 141
 142         \item     The draft gbbd instruction implements the transpose (shown above),
 143                 preparing the data to use standard popcount.
 144                (gbbd is based on Power ISA vgbbd, v3.1 p445)
 145
 146     \end{itemize}
 147
 148 }
 149
 150 \frame{\frametitle{pospopcount assembler}
 151
 152
 153 \lstinputlisting[language={}]{pospopcount.s}
 154
 155 }
 156
 157 \frame{\frametitle{strncpy}
 158
 159     \lstinputlisting[language={}]{strncpy.c}
 160   \begin{itemize}
 161     \item two simple-looking for-loops,
 162           data-dependent in the first.
 163     \item sv.cmpi stops at the first zero, /vli includes the zero
 164           in VL.
 165     \item note the post-increment Load/Store: saves
 166           pre-decrementing
 167     \item a Vector of CRs is produced which then get tested
 168            by the sv.bc/all instruction, counting down CTR
 169            per item tested.
 170      \item Power ISA added hard-coded data-dependent capacity
 171            into vstribr, where SVP64 it is generic (applies
 172            to any instruction)
 173      \item even the null-ing part is not straightforward as
 174           it could be mis-aligned compared to the VSX width.
 175      \item end-result: assembler-optimised  strncpy on Power
 176           ISA v3.0 is a whopping 240 instructions. SVP64 is 10
 177           and parallel in HW
 178  \end{itemize}
 179 }
 180
 181
 182
 183 \frame{\frametitle{strncpy assembler}
 184
 185 \lstinputlisting[language={}]{strncpy.s}
 186
 187 }
 188
 189 \frame{\frametitle{sv.lbz/ff=RC1/vli *16,1(10)}
 190     \begin{center}
 191         \includegraphics[width=0.6\textwidth]{lbz_ff_vli.png}
 192     \end{center}
 193
 194   \begin{itemize}
 195     \item r10 points to memory address 0x001007
 196     \item sv.lbz (Power ISA load byte immediate) multiplies immediate
 197           offset by element step index, to get Effective Address (EA)
 198     \item LD/ST has no Rc=1 so Data-Dependent Fail-First specified
 199           as "ff=RC1". Not LD/ST Fault First! vli: VL inclusive
 200     \item Test done after each load. Fails at Memory contents
 201           0x001009. Inclusive Mode: VL is truncated to 5 (FIVE) not 4
 202  \end{itemize}
 203 }
 204
 205 \frame{\frametitle{linked-list walking}
 206
 207   \begin{itemize}
 208     \item "TODO
 209  \end{itemize}
 210 }
 211
 212 \frame{\frametitle{sv.ld/ff=RC1/vli *17, 8(*16)}
 213
 214         \begin{center}
 215                 \includegraphics[width=1.0\textwidth]{linked_list_dd.png}
 216         \end{center}
 217 }
 218
 219 \frame{\frametitle{maxloc}
 220     \lstinputlisting[language={}]{maxloc.py}
 221
 222         \begin{itemize}
 223                 \item FORTRAN MAXLOC - find the index of largest number
 224                 \item notoriously difficult to optimally implement for SIMD
 225                 \item algorithms include \textit{depth-first} recursive
 226                       descent (!) mapreduce-style, offsetting the
 227                       locally-computed largest index (plus value) which
 228                       are then tested in upper level(s)
 229                 \item SVP64 through Data-Dependent Fail-First can perform
 230                           each of the two key while-loop tests with
 231                           \textit{single instructions}.
 232                 \item There is however quite a bit of "housekeeping".
 233                         Full analysis: \\
 234         https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
 235         \end{itemize}
 236 }
 237
 238 \frame{\frametitle{maxloc assembler}
 239
 240         \lstinputlisting[language={}]{maxloc.s}
 241
 242 }
 243
 244 \frame{\frametitle{Summary}
 245
 246  \begin{itemize}
 247    \item SIMD fundamentally assumes element independence.
 248    \item No provision in SIMD ISAs or Architectures for
 249          inter-element inter-dependence, let alone sequential
 250          inter-dependence.
 251    \item Simple-V adds features such as Data-Dependent
 252          Fail-First as \textit{general concepts},
 253          exploiting Condition Registers (Vectorised)
 254    \item Hardware Parallelism is \textit{still possible}
 255          by exploiting the standard capabilities of
 256          Speculative Execution: produce results, hold
 257          off writing, post-analyse and cancel the results
 258          that should not be written. Uses \textit{existing}
 259          standard OoO Micro-architecture
 260    \item Huge simplification of algorithms, huge "compactification"
 261          just like Zilog Z80 and Intel 8086, yet still parallel
 262    \item compact deep-expressive assembler brings CISC
 263          capability but RISC-RISC (Prefix-Suffix). SIMD remains
 264          at the \textit{back-end in hardware} where it belongs.
 265          Not exposed at the programmer.
 266   \end{itemize}
 267 }
 268
 269 \frame{
 270   \begin{center}
 271     {\Huge The end\vspace{12pt}\\
 272            Thank you\vspace{12pt}
 273     }
 274   \end{center}
 275
 276   \begin{itemize}
 277     \item Discussion: http://lists.libre-soc.org
 278     \item OFTC.net IRC \#libre-soc
 279     \item http://libre-soc.org/
 280         \item https://nlnet.nl/project/Libre-SOC-OpenPOWER-ISA
 281 \item https://bugs.libre-soc.org/show\_bug.cgi?id=676
 282 \item https://bugs.libre-soc.org/show\_bug.cgi?id=1244
 283 \item https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
 284     \item https://libre-soc.org/nlnet/\#faq
 285   \end{itemize}
 286 }
 287
 288
 289 \end{document}