bug 1244: add pospopcount cookbook link
[libreriscv.git] / conferences / fosdem2024 / fosdem2024_ddffirst / fosdem2024_ddffirst.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5 \usepackage{pgffor}
6 \usepackage{listings}
7
8 \graphicspath{{./}}
9
10 \title{Data-Dependent-Fail-First}
11 \author{Luke Kenneth Casson Leighton and Shriya Sharma}
12
13
14 \begin{document}
15
16 \frame{
17 \begin{center}
18 \huge{Libre-SOC Simple-V Specification \\
19 Advanced features}\\
20 \vspace{24pt}
21 \Large{Data-Dependent Fail-First}\\
22
23 \vspace{24pt}
24 \Large{FOSDEM2024}\\
25 \vspace{16pt}
26 \large{Funded by NLnet NGI-ASSURE \\
27 EU grant agreement No 957073}\\
28 \vspace{6pt}
29 \large{\today}
30 \end{center}
31 }
32
33 \begin{frame}[fragile]
34 \frametitle{Simple-V CMPI in a nutshell}
35
36 \begin{semiverbatim}
37 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
38 (assuming you know power-isa)
39  int i, id=0, ira=0;
40  for (i = 0; i < VL; i++)
41   CR[BA+id] <= compare(ireg[RA+ira], SI);
42 if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
43 if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
44 \end{semiverbatim}
45
46 \begin{itemize}
47 \item Above is oversimplified: predication etc. left out
48 \item Scalar-scalar and scalar-vector and vector-vector now all in one
49 \item OoO may choose to push CMPIs into instr. queue (v. busy!)
50 \end{itemize}
51 \end{frame}
52
53
54 \frame{\frametitle{Load/Store Fault-First}
55
56 \begin{itemize}
57 \item Problem: vector load and store can cause a page fault
58 \item Solution: a protocol that allows optional load/store
59 \item instruction \textit{requests} a number of elements
60 \item instruction \textit{informs} the number actually loaded
61 \item first element load/store is not optional (cannot fail)
62 \item ARM SVE: https://arxiv.org/pdf/1803.06185.pdf
63 \item more: wikipedia Vector processor page: Fault/Fail First
64 \vspace{10pt}
65 \item Load/Store is Memory to/from Register, what about
66 Register to Register?
67 \item Register-to-register: "Data-Dependent Fail-First."
68 \item Z80 LDIR: Mem-Register, CPIR: Register-Register
69 \end{itemize}
70 }
71
72 \begin{frame}[fragile]
73 \frametitle{Data-Dependent-Fail-First in a nutshell}
74
75 \begin{semiverbatim}
76 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
77 int i, id=0, ira=0;
78 for (i = 0; i < VL; i++)
79 CR[BA+id] <= compare(ireg[RA+ira], SI);
80 if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
81 if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
82 if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \}
83 \end{semiverbatim}
84
85 \begin{itemize}
86 \item Parallelism still perfectly possible
87 ("hold" writing results until sequential post-analysis
88 carried out. Best done with OoO)
89 \item VL truncation can be inclusive or exclusive
90 (include or exclude a NULL pointer or a
91 string-end character, or overflow result)
92 \item \textit{Truncation can be to zero Vector Length}
93 \end{itemize}
94 \end{frame}
95
96 \frame{\frametitle{Power ISA v3.1 vstribr}
97
98 \lstinputlisting[language={}]{vstribr.txt}
99
100 \begin{itemize}
101 \item ironically this hard-coded instruction is
102 identical to general-purpose Simple-V DD-FFirst...
103 \end{itemize}
104
105 }
106
107 \frame{\frametitle{Pospopcount}
108
109 \begin{itemize}
110 \item Positional popcount adds up the totals of each bit set to 1 in each bit-position, of an array of input values.
111 \item Notoriously difficult to do in SIMD assembler: typically 550 lines
112 \item https://github.com/clausecker/pospop - Full writeup: \\
113 https://libre-soc.org/openpower/sv/cookbook/pospopcnt
114
115 \end{itemize}
116
117 \lstinputlisting[language={}]{pospopcount.c}
118
119
120 }
121
122 \frame{\frametitle{Pospopcount}
123
124 \begin{center}
125 \includegraphics[width=0.5\textwidth]{pospopcount.png}
126 \end{center}
127 \begin{itemize}
128 \item The challenge is to perform an appropriate transpose of the data (the CPU can only work on registers, horizontally),
129 in blocks that suit the processor and the ISA capacity.
130
131
132 \end{itemize}
133 }
134
135 \frame{\frametitle{Pospopcount}
136
137 \begin{center}
138 \includegraphics[width=0.6\textwidth]{array_popcnt.png}
139 \end{center}
140
141 \begin{itemize}
142
143 \item The draft gbbd instruction implements the transpose (shown above),
144 preparing the data to use standard popcount.
145 (gbbd is based on Power ISA vgbbd, v3.1 p445)
146
147 \end{itemize}
148
149 }
150
151 \frame{\frametitle{pospopcount assembler}
152
153
154 \lstinputlisting[language={}]{pospopcount.s}
155
156 }
157
158 \frame{\frametitle{strncpy}
159
160 \lstinputlisting[language={}]{strncpy.c}
161 \begin{itemize}
162 \item two simple-looking for-loops,
163 data-dependent in the first.
164 \item sv.cmpi stops at the first zero, /vli includes the zero
165 in VL.
166 \item note the post-increment Load/Store: saves
167 pre-decrementing
168 \item a Vector of CRs is produced which then get tested
169 by the sv.bc/all instruction, counting down CTR
170 per item tested.
171 \item Power ISA added hard-coded data-dependent capacity
172 into vstribr, where SVP64 it is generic (applies
173 to any instruction)
174 \item even the null-ing part is not straightforward as
175 it could be mis-aligned compared to the VSX width.
176 \item end-result: assembler-optimised strncpy on Power
177 ISA v3.0 is a whopping 240 instructions. SVP64 is 10
178 and parallel in HW
179 \end{itemize}
180 }
181
182
183
184 \frame{\frametitle{strncpy assembler}
185
186 \lstinputlisting[language={}]{strncpy.s}
187
188 }
189
190 \frame{\frametitle{sv.lbz/ff=RC1/vli *16,1(10)}
191 \begin{center}
192 \includegraphics[width=0.6\textwidth]{lbz_ff_vli.png}
193 \end{center}
194
195 \begin{itemize}
196 \item r10 points to memory address 0x001007
197 \item sv.lbz (Power ISA load byte immediate) multiplies immediate
198 offset by element step index, to get Effective Address (EA)
199 \item LD/ST has no Rc=1 so Data-Dependent Fail-First specified
200 as "ff=RC1". Not LD/ST Fault First! vli: VL inclusive
201 \item Test done after each load. Fails at Memory contents
202 0x001009. Inclusive Mode: VL is truncated to 5 (FIVE) not 4
203 \end{itemize}
204 }
205
206 \frame{\frametitle{linked-list walking}
207
208 \begin{itemize}
209 \item "TODO
210 \end{itemize}
211 }
212
213 \frame{\frametitle{sv.ld/ff=RC1/vli *17, 8(*16)}
214
215 \begin{center}
216 \includegraphics[width=1.0\textwidth]{linked_list_dd.png}
217 \end{center}
218 }
219
220 \frame{\frametitle{maxloc}
221 \lstinputlisting[language={}]{maxloc.py}
222
223 \begin{itemize}
224 \item FORTRAN MAXLOC - find the index of largest number
225 notoriously difficult to optimally implement for SIMD
226 \item algorithms include \textit{depth-first} recursive
227 descent (!) mapreduce-style, offsetting the
228 locally-computed largest index (plus value) which
229 are then tested in upper level(s)
230 \item SVP64: note below the sv.cmp (first while-loop),
231 sv.minmax. (second while-loop) and the sv.crnand which
232 by Predicate masking is 3-in 1-out CR ops
233 not the usual 2-in 1-out
234 \item There is however quite a bit of "housekeeping".
235 Full analysis: \\
236 https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
237 \end{itemize}
238 }
239
240 \frame{\frametitle{maxloc assembler}
241
242 \lstinputlisting[language={}]{maxloc.s}
243 }
244
245 \frame{\frametitle{Summary}
246
247 \begin{itemize}
248 \item SIMD fundamentally assumes element independence.
249 \item No provision in SIMD ISAs or Architectures for
250 inter-element inter-dependence, let alone sequential
251 inter-dependence.
252 \item Simple-V adds features such as Data-Dependent
253 Fail-First as \textit{general concepts},
254 exploiting Condition Registers (Vectorised)
255 \item Hardware Parallelism is \textit{still possible}
256 by exploiting the standard capabilities of
257 Speculative Execution: produce results, hold
258 off writing, post-analyse and cancel the results
259 that should not be written. Uses \textit{existing}
260 standard OoO Micro-architecture
261 \item Huge simplification of algorithms, huge "compactification"
262 just like Zilog Z80 and Intel 8086, yet still parallel
263 \item compact deep-expressive assembler brings CISC
264 capability but RISC-RISC (Prefix-Suffix). SIMD remains
265 at the \textit{back-end in hardware} where it belongs.
266 Not exposed at the programmer.
267 \end{itemize}
268 }
269
270 \frame{
271 \begin{center}
272 {\Huge The end\vspace{12pt}\\
273 Thank you\vspace{12pt}
274 }
275 \end{center}
276
277 \begin{itemize}
278 \item Discussion: http://lists.libre-soc.org
279 \item OFTC.net IRC \#libre-soc
280 \item http://libre-soc.org/
281 \item https://nlnet.nl/project/Libre-SOC-OpenPOWER-ISA
282 \item https://bugs.libre-soc.org/show\_bug.cgi?id=676
283 \item https://bugs.libre-soc.org/show\_bug.cgi?id=1244
284 \item https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
285 \item https://libre-soc.org/nlnet/\#faq
286 \end{itemize}
287 }
288
289
290 \end{document}