bug 676: noted a way to reduce the number of instructions
[libreriscv.git] / conferences / fosdem2024 / fosdem2024_ddffirst / fosdem2024_ddffirst.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5 \usepackage{pgffor}
6 \usepackage{listings}
7
8 \graphicspath{{./}}
9
10 \title{Data-Dependent-Fail-First}
11 \author{Luke Kenneth Casson Leighton and Shriya Sharma}
12
13
14 \begin{document}
15
16 \frame{
17 \begin{center}
18 \huge{Libre-SOC Simple-V Specification \\
19 Advanced features}\\
20 \vspace{24pt}
21 \Large{Data-Dependent Fail-First}\\
22
23 \vspace{24pt}
24 \Large{FOSDEM2024}\\
25 \vspace{16pt}
26 \large{Funded by NLnet NGI-ASSURE \\
27 EU grant agreement No 957073}\\
28 \vspace{6pt}
29 \large{\today}
30 \end{center}
31 }
32
33 \begin{frame}[fragile]
34 \frametitle{Simple-V CMPI in a nutshell}
35
36 \begin{semiverbatim}
37 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
38 (assuming you know power-isa)
39  int i, id=0, ira=0;
40  for (i = 0; i < VL; i++)
41   CR[BA+id] <= compare(ireg[RA+ira], SI);
42 if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
43 if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
44 \end{semiverbatim}
45
46 \begin{itemize}
47 \item Above is oversimplified: predication etc. left out
48 \item Scalar-scalar and scalar-vector and vector-vector now all in one
49 \item OoO may choose to push CMPIs into instr. queue (v. busy!)
50 \end{itemize}
51 \end{frame}
52
53
54 \frame{\frametitle{Load/Store Fault-First}
55
56 \begin{itemize}
57 \item Problem: vector load and store can cause a page fault
58 \item Solution: a protocol that allows optional load/store
59 \item instruction \textit{requests} a number of elements
60 \item instruction \textit{informs} the number actually loaded
61 \item first element load/store is not optional (cannot fail)
62 \item ARM SVE: https://arxiv.org/pdf/1803.06185.pdf
63 \item more: wikipedia Vector processor page: Fault/Fail First
64 \vspace{10pt}
65 \item Load/Store is Memory to/from Register, what about
66 Register to Register?
67 \item Register-to-register: "Data-Dependent Fail-First."
68 \item Z80 LDIR: Mem-Register, CPIR: Register-Register
69 \end{itemize}
70 }
71
72 \begin{frame}[fragile]
73 \frametitle{Data-Dependent-Fail-First in a nutshell}
74
75 \begin{semiverbatim}
76 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
77 int i, id=0, ira=0;
78 for (i = 0; i < VL; i++)
79 CR[BA+id] <= compare(ireg[RA+ira], SI);
80 if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
81 if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
82 if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \}
83 \end{semiverbatim}
84
85 \begin{itemize}
86 \item Parallelism still perfectly possible
87 ("hold" writing results until sequential post-analysis
88 carried out. Best done with OoO)
89 \item VL truncation can be inclusive or exclusive
90 (include or exclude a NULL pointer or a
91 string-end character, or overflow result)
92 \item \textit{Truncation can be to zero Vector Length}
93 \end{itemize}
94 \end{frame}
95
96 \frame{\frametitle{Power ISA v3.1 vstribr}
97
98 \lstinputlisting[language={}]{vstribr.txt}
99
100 \begin{itemize}
101 \item ironically this hard-coded instruction is
102 identical to general-purpose Simple-V DD-FFirst...
103 \end{itemize}
104
105 }
106
107 \frame{\frametitle{Pospopcount}
108
109 \begin{itemize}
110 \item Positional popcount adds up the totals of each bit set to 1 in each bit-position, of an array of input values.
111 \item Notoriously difficult to do in SIMD assembler: typically 550 lines
112 \item https://github.com/clausecker/pospop
113
114 \end{itemize}
115
116 \lstinputlisting[language={}]{pospopcount.c}
117
118
119 }
120
121 \frame{\frametitle{Pospopcount}
122
123 \begin{center}
124 \includegraphics[width=0.5\textwidth]{pospopcount.png}
125 \end{center}
126 \begin{itemize}
127 \item The challenge is to perform an appropriate transpose of the data (the CPU can only work on registers, horizontally),
128 in blocks that suit the processor and the ISA capacity.
129
130
131 \end{itemize}
132 }
133
134 \frame{\frametitle{Pospopcount}
135
136 \begin{center}
137 \includegraphics[width=0.6\textwidth]{array_popcnt.png}
138 \end{center}
139
140 \begin{itemize}
141
142 \item The draft gbbd instruction implements the transpose (shown above),
143 preparing the data to use standard popcount.
144 (gbbd is based on Power ISA vgbbd, v3.1 p445)
145
146 \end{itemize}
147
148 }
149
150 \frame{\frametitle{pospopcount assembler}
151
152
153 \lstinputlisting[language={}]{pospopcount.s}
154
155 }
156
157 \frame{\frametitle{strncpy}
158
159 \lstinputlisting[language={}]{strncpy.c}
160 \begin{itemize}
161 \item two simple-looking for-loops,
162 data-dependent in the first.
163 \item sv.cmpi stops at the first zero, /vli includes the zero
164 in VL.
165 \item note the post-increment Load/Store: saves
166 pre-decrementing
167 \item a Vector of CRs is produced which then get tested
168 by the sv.bc/all instruction, counting down CTR
169 per item tested.
170 \item Power ISA added hard-coded data-dependent capacity
171 into vstribr, where SVP64 it is generic (applies
172 to any instruction)
173 \item even the null-ing part is not straightforward as
174 it could be mis-aligned compared to the VSX width.
175 \item end-result: assembler-optimised strncpy on Power
176 ISA v3.0 is a whopping 240 instructions. SVP64 is 10
177 and parallel in HW
178 \end{itemize}
179 }
180
181
182
183 \frame{\frametitle{strncpy assembler}
184
185 \lstinputlisting[language={}]{strncpy.s}
186
187 }
188
189 \frame{\frametitle{sv.lbz/ff=RC1/vli *16,1(10)}
190 \begin{center}
191 \includegraphics[width=0.6\textwidth]{lbz_ff_vli.png}
192 \end{center}
193
194 \begin{itemize}
195 \item r10 points to memory address 0x001007
196 \item sv.lbz (Power ISA load byte immediate) multiplies immediate
197 offset by element step index, to get Effective Address (EA)
198 \item LD/ST has no Rc=1 so Data-Dependent Fail-First specified
199 as "ff=RC1". Not LD/ST Fault First! vli: VL inclusive
200 \item Test done after each load. Fails at Memory contents
201 0x001009. Inclusive Mode: VL is truncated to 5 (FIVE) not 4
202 \end{itemize}
203 }
204
205 \frame{\frametitle{linked-list walking}
206
207 \begin{itemize}
208 \item "TODO
209 \end{itemize}
210 }
211
212 \frame{\frametitle{sv.ld/ff=RC1/vli *17, 8(*16)}
213
214 \begin{center}
215 \includegraphics[width=1.0\textwidth]{linked_list_dd.png}
216 \end{center}
217 }
218
219 \frame{\frametitle{maxloc}
220 \lstinputlisting[language={}]{maxloc.py}
221
222 \begin{itemize}
223 \item FORTRAN MAXLOC - find the index of largest number
224 \item notoriously difficult to optimally implement for SIMD
225 \item algorithms include \textit{depth-first} recursive
226 descent (!) mapreduce-style, offsetting the
227 locally-computed largest index (plus value) which
228 are then tested in upper level(s)
229 \item SVP64 through Data-Dependent Fail-First can perform
230 each of the two key while-loop tests with
231 \textit{single instructions}.
232 \item There is however quite a bit of "housekeeping".
233 Full analysis: \\
234 https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
235 \end{itemize}
236 }
237
238 \frame{\frametitle{maxloc assembler}
239
240 \lstinputlisting[language={}]{maxloc.s}
241
242 }
243
244 \frame{\frametitle{Summary}
245
246 \begin{itemize}
247 \item SIMD fundamentally assumes element independence.
248 \item No provision in SIMD ISAs or Architectures for
249 inter-element inter-dependence, let alone sequential
250 inter-dependence.
251 \item Simple-V adds features such as Data-Dependent
252 Fail-First as \textit{general concepts},
253 exploiting Condition Registers (Vectorised)
254 \item Hardware Parallelism is \textit{still possible}
255 by exploiting the standard capabilities of
256 Speculative Execution: produce results, hold
257 off writing, post-analyse and cancel the results
258 that should not be written. Uses \textit{existing}
259 standard OoO Micro-architecture
260 \item Huge simplification of algorithms, huge "compactification"
261 just like Zilog Z80 and Intel 8086, yet still parallel
262 \item compact deep-expressive assembler brings CISC
263 capability but RISC-RISC (Prefix-Suffix). SIMD remains
264 at the \textit{back-end in hardware} where it belongs.
265 Not exposed at the programmer.
266 \end{itemize}
267 }
268
269 \frame{
270 \begin{center}
271 {\Huge The end\vspace{12pt}\\
272 Thank you\vspace{12pt}
273 }
274 \end{center}
275
276 \begin{itemize}
277 \item Discussion: http://lists.libre-soc.org
278 \item OFTC.net IRC \#libre-soc
279 \item http://libre-soc.org/
280 \item https://nlnet.nl/project/Libre-SOC-OpenPOWER-ISA
281 \item https://bugs.libre-soc.org/show\_bug.cgi?id=676
282 \item https://bugs.libre-soc.org/show\_bug.cgi?id=1244
283 \item https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
284 \item https://libre-soc.org/nlnet/\#faq
285 \end{itemize}
286 }
287
288
289 \end{document}