c02d755c2b5197daa67d39bf8dc47e0f1c65a83b
[libreriscv.git] / conferences / fosdem2024 / fosdem2024_ddffirst / fosdem2024_ddffirst.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5 \usepackage{pgffor}
6 \usepackage{listings}
7
8 \graphicspath{{./}}
9
10 \title{Data-Dependent-Fail-First}
11 \author{Luke Kenneth Casson Leighton and Shriya Sharma}
12
13
14 \begin{document}
15
16 \frame{
17 \begin{center}
18 \huge{The Libre-SOC Hybrid 3D CPU}\\
19 \vspace{32pt}
20 \Large{Data-Dependent-Fail-First}\\
21
22 \vspace{24pt}
23 \Large{FOSDEM2024}\\
24 \vspace{16pt}
25 \large{Sponsored by NLnet's PET Programme}\\
26 \vspace{6pt}
27 \large{\today}
28 \end{center}
29 }
30
31
32
33 \begin{frame}[fragile]
34 \frametitle{Simple-V CMPI in a nutshell}
35
36 \begin{semiverbatim}
37 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
38 (assuming you know power-isa)
39  int i, id=0, ira=0;
40  for (i = 0; i < VL; i++)
41   CR[BA+id] <= compare(ireg[RA+ira], SI);
42 if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
43 if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
44 \end{semiverbatim}
45
46 \begin{itemize}
47 \item Above is oversimplified: predication etc. left out
48 \item Scalar-scalar and scalar-vector and vector-vector now all in one
49 \item OoO may choose to push CMPIs into instr. queue (v. busy!)
50 \end{itemize}
51 \end{frame}
52
53
54 \frame{\frametitle{Load/Store Fault-First}
55
56 \begin{itemize}
57 \item Problem: vector load and store can cause a page fault
58 \item Solution: a protocol that allows optional load/store
59 \item instruction \textit{requests} a number of elements
60 \item instruction \textit{informs} the number actually loaded
61 \item first element load/store is not optional (cannot fail)
62 \item ARM SVE: \url{https://arxiv.org/pdf/1803.06185.pdf}
63 \item more: wikipedia Vector processor page: Fault/Fail First
64 \vspace{10pt}
65 \item Load/Store is Memory to/from Register, what about
66 Register to Register?
67 \item Register-to-register: ``Data-Dependent Fail-First.''
68 \item Z80 LDIR: Mem-Register, CPIR: Register-Register
69 \end{itemize}
70 }
71
72 \begin{frame}[fragile]
73 \frametitle{Data-Dependent-Fail-First in a nutshell}
74
75 \begin{semiverbatim}
76 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
77 int i, id=0, ira=0;
78 for (i = 0; i < VL; i++)
79 CR[BA+id] <= compare(ireg[RA+ira], SI);
80 if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \}
81 if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
82 if (reg\_is\_vectorised[RA])  \{ ira += 1; \}
83 \end{semiverbatim}
84
85 \begin{itemize}
86 \item Parallelism still perfectly possible
87 ("hold" writing results until sequential post-analysis
88 carried out. Best done with OoO)
89 \item VL truncation can be inclusive or exclusive
90 (include or exclude a NULL pointer or a
91 string-end character, or overflow result)
92 \item \textit{Truncation can be to zero Vector Length}
93 \end{itemize}
94 \end{frame}
95
96 \frame{\frametitle{Power ISA v3.1 vstribr}
97
98 \lstinputlisting[language={}]{vstribr.txt}
99
100 \begin{itemize}
101 \item ironically this hard-coded instruction is
102 identical to general-purpose Simple-V DD-FFirst...
103 \end{itemize}
104
105 }
106
107 \frame{\frametitle{maxloc}
108 \begin{itemize}
109 \item TODO
110 \end{itemize}
111 }
112
113 \frame{\frametitle{Pospopcount}
114
115 \begin{itemize}
116 \item Positional popcount counts, for each bit-position, how many values in an array of input values have that bit set to 1.
117 \item Notoriously difficult to do in SIMD assembler: typically 550 lines
118 \item \url{https://github.com/clausecker/pospop}
119
120 \end{itemize}
121
122 \lstinputlisting[language={}]{pospopcount.c}
123
124
125 }
126
127 \frame{\frametitle{Pospopcount}
128
129 \begin{center}
130 \includegraphics[width=0.5\textwidth]{pospopcount.png}
131 \end{center}
132 \begin{itemize}
133 \item The challenge is to perform an appropriate transpose of the data (the CPU can only work on registers, horizontally),
134 in blocks that suit the processor and the ISA capacity.
135
136
137 \end{itemize}
138 }
139
140 \frame{\frametitle{Pospopcount}
141
142 \begin{center}
143 \includegraphics[width=0.6\textwidth]{array_popcnt.png}
144 \end{center}
145
146 \begin{itemize}
147
148 \item The draft gbbd instruction implements the transpose (shown above),
149 preparing the data to use standard popcount.
150 (gbbd is based on Power ISA vgbbd, v3.1 p445)
151
152 \end{itemize}
153
154 }
155
156 \frame{\frametitle{Pospopcount.s}
157
158
159 \lstinputlisting[language={}]{pospopcount.s}
160
161 }
162
163
164 \frame{\frametitle{strncpy}
165
166 \lstinputlisting[language={}]{strncpy.c}
167 \begin{itemize}
168 \item two simple-looking for-loops, unfortunately sequentially
169 data-dependent in the first.
170 \item Power ISA added a hard-coded variant of this inner
171 data-dependent capacity into VSX - only for strcpy!
172 \item even the null-ing part is not straightforward as
173 it could be mis-aligned compared to the VSX width.
174 \item end-result is that assembler-optimised strncpy on Power
175 ISA v3.0 is a whopping 240 instructions. SVP64 is 10
176 \end{itemize}
177 }
178
179
180
181 \frame{\frametitle{strncpy assembler}
182
183 \lstinputlisting[language={}]{strncpy.s}
184
185 }
186
187 \frame{\frametitle{sv.lbz/ff=RC1/vli *16,1(10)}
188 \begin{center}
189 \includegraphics[width=0.6\textwidth]{lbz_ff_vli.png}
190 \end{center}
191
192 \begin{itemize}
193 \item r10 points to memory address 0x001007
194 \item sv.lbz (Power ISA load byte immediate) multiplies immediate
195 offset by element step index, to get Effective Address (EA)
196 \item LD/ST has no Rc=1 so Data-Dependent Fail-First specified
197 as ``ff=RC1''. Not LD/ST Fault First! vli: VL inclusive
198 \item Test done after each load. Fails at Memory contents
199 0x001009. Inclusive Mode: VL is truncated to 5 (FIVE) not 4
200 \end{itemize}
201 }
202
203 \frame{\frametitle{linked-list walking}
204
205 \begin{itemize}
206 \item TODO
207 \end{itemize}
208 }
209
210 \frame{\frametitle{sv.ld/ff=RC1/vli *17, 8(*16)}
211
212 \begin{center}
213 \includegraphics[width=1.0\textwidth]{linked_list_dd.png}
214 \end{center}
215 }
216
217 \frame{\frametitle{Summary}
218
219 \begin{itemize}
220 \item Goal is to create a mass-volume low-power embedded SoC suitable
221 for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
222 \item No way we could implement a project of this magnitude without
223 nmigen (being able to use python OO to HDL)
224 \item Collaboration with OpenPOWER Foundation and Members absolutely
225 essential. No short-cuts. Standards to be developed and ratified
226 so that everyone benefits.
227 \item Riding the wave of huge stability of OpenPOWER ecosystem
228 \item Greatly simplified open 3D and Video drivers reduces product
229 development costs for customers
230 \item It also happens to be fascinating, deeply rewarding, technically
231 challenging, and funded by NLnet
232
233 \end{itemize}
234 }
235
236 \frame{\frametitle{How can you help?}
237
238 \vspace{5pt}
239
240 \begin{itemize}
241 \item Start here! https://libre-soc.org \\
242 Mailing lists https://lists.libre-soc.org \\
243 IRC Freenode libre-soc \\
244 etc. etc. (it's a Libre project, go figure) \\
245 \vspace{3pt}
246 \item Can I get paid? Yes! NLnet funded\\
247 See https://libre-soc.org/nlnet/\#faq \\
248 \vspace{3pt}
249 \item Also profit-sharing in any commercial ventures \\
250 \vspace{3pt}
251 \item How many opportunities to develop Libre SoCs exist,\\
252 and actually get paid for it?
253 \vspace{3pt}
254 \item I'm not a developer, how can I help?\\
255 - Plenty of research needed, artwork, website \\
256 - Help find customers and OEMs willing to commit (LOI)
257 \end{itemize}
258 }
259
260
261
262 \frame{
263 \begin{center}
264 {\Huge The end\vspace{12pt}\\
265 Thank you\vspace{12pt}\\
266 Questions?\vspace{12pt}
267 }
268 \end{center}
269
270 \begin{itemize}
271 \item Discussion: http://lists.libre-soc.org
272 \item Freenode IRC \#libre-soc
273 \item http://libre-soc.org/
274 \item http://nlnet.nl/PET
275 \item https://libre-soc.org/nlnet/\#faq
276 \end{itemize}
277 }
278
279
280 \end{document}