fcdccdbc536f910c892fa59aff41f80d22ceaeb2
[libreriscv.git] / conferences / fosdem2024 / fosdem2024_ddffirst / fosdem2024_ddffirst.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5
6 \graphicspath{{./}}
7
8 \title{Data-Dependent-Fail-First}
9 \author{Luke Kenneth Casson Leighton and Shriya Sharma}
10
11
12 \begin{document}
13
14 \frame{
15 \begin{center}
16 \huge{The Libre-SOC Hybrid 3D CPU}\\
17 \vspace{32pt}
18 \Large{Data-Dependent-Fail-First}\\
19
20 \vspace{24pt}
21 \Large{FOSDEM2024}\\
22 \vspace{16pt}
23 \large{Sponsored by NLnet's PET Programme}\\
24 \vspace{6pt}
25 \large{\today}
26 \end{center}
27 }
28
29
30 \frame{\frametitle{Why another SoC?}
31
32 \begin{itemize}
33 \item Intel Management Engine, Apple QA issues, Spectre\vspace{6pt}
34 \item Endless proprietary drivers, ``simplest'' solution: \\
35 License proprietary hard macros (with proprietary firmware)\\
36 Adversely affects product development cost\\
37 due to opaque driver bugs (Samsung S3C6410 / S5P100)
38 \vspace{6pt}
39 \item Alternative: Intel and Valve-Steam collaboration\\
40 ``Most productive business meeting ever!''\\
41 https://tinyurl.com/valve-steam-intel
42 \vspace{6pt}
43 \item Because for 30 years I Always Wanted To Design A CPU
44 \vspace{6pt}
45 \item Ultimately it is a strategic \textit{business} objective to
46 develop entirely Libre hardware, firmware and drivers.
47 \end{itemize}
48 }
49
50
51
52 \frame{\frametitle{How can you help?}
53
54 \vspace{5pt}
55
56 \begin{itemize}
57 \item Start here! https://libre-soc.org \\
58 Mailing lists https://lists.libre-soc.org \\
59 IRC Freenode libre-soc \\
60 etc. etc. (it's a Libre project, go figure) \\
61 \vspace{3pt}
62 \item Can I get paid? Yes! NLnet funded\\
63 See https://libre-soc.org/nlnet/\#faq \\
64 \vspace{3pt}
65 \item Also profit-sharing in any commercial ventures \\
66 \vspace{3pt}
67 \item How many opportunities to develop Libre SoCs exist,\\
68 and actually get paid for it?
69 \vspace{3pt}
70 \item I'm not a developer, how can I help?\\
71 - Plenty of research needed, artwork, website \\
72 - Help find customers and OEMs willing to commit (LOI)
73 \end{itemize}
74 }
75
76
77
78 \frame{\frametitle{What goes into a typical SoC?}
79 \vspace{9pt}
80 \begin{itemize}
81 \item 15 to 20mm BGA package: 2.5 to 5 watt power consumption\\
82 heat sink normally not required (simplifies overall design)
83 \vspace{3pt}
84 \item Fully-integrated peripherals (not Northbridge/Southbridge)\\
85 USB, HDMI, RGB/TTL, SD/MMC, I2C, UART, SPI, GPIO etc. etc.
86 \vspace{3pt}
87 \item Built-in GPU (shared memory bus, 3rd party licensed) \vspace{3pt}
88 \item Built-in VPU (likewise, proprietary)\vspace{3pt}
89 \item Target price between \$2.50 and \$30 depending on market\\
90 Radically different from IBM POWER9 Core (200 Watt)
91 \vspace{3pt}
92 \item We're doing the same, just with a hybrid architecture.\\
93 CPU == GPU == VPU
94 \end{itemize}
95 }
96
97
98
99 %%\frame{\frametitle{Simple SBC-style SoC}
100 %%
101 %%\begin{center}
102 %%\includegraphics[width=0.9\textwidth]{shakti_libre_soc.jpg}
103 %%\end{center}
104
105 %%}
106
107
108
109
110 \begin{frame}[fragile]
111 \frametitle{Simple-V CMPI in a nutshell}
112
113 \begin{semiverbatim}
114 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
115   # (assuming you know power-isa)
116   int i, id=0, ira=0;
117   for (i = 0; i < VL; i++)
118     CR[BA+id] <= compare(ireg[RA+ira], SI);
119     if (reg\_is\_vectorised[BA]) \{ id += 1; \}
120     if (reg\_is\_vectorised[RA]) \{ ira += 1; \}
121 \end{semiverbatim}
122
123 \begin{itemize}
124 \item Above is oversimplified: predication etc. left out
125 \item Scalar-scalar and scalar-vector and vector-vector now all in one
126 \item OoO may choose to push CMPIs into instr. queue (v. busy!)
127 \end{itemize}
128 \end{frame}
129
130 \frame{\frametitle{Load/Store Fault-First}
131
132 \begin{itemize}
133 \item Problem: vector load and store can cause a page fault
134 \item Solution: a protocol that allows optional load/store
135 \item instruction \textit{requests} a number of elements
136 \item instruction \textit{informs} the number actually loaded
137 \item first load/store is not optional
138 \end{itemize}
139 }
140
141 \begin{frame}[fragile]
142 \frametitle{Data-Dependent Fail-First}
143
144 \begin{semiverbatim}
145 function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
146   int i, id=0, ira=0;
147   for (i = 0; i < VL; i++)
148     CR[BA+id] <= compare(ireg[RA+ira], SI);
149     if (reg\_is\_vectorised[BA]) \{ id += 1; \}
150     if (reg\_is\_vectorised[RA]) \{ ira += 1; \}
151 \end{semiverbatim}
152
153 \begin{itemize}
154 \item Above is oversimplified: predication etc. left out
155 \item Scalar-scalar and scalar-vector and vector-vector now all in one
156 \item OoO may choose to push CMPIs into instr. queue (v. busy!)
157 \end{itemize}
158 \end{frame}
159
160
161 \frame{\frametitle{Additional Simple-V features}
162
163 \begin{itemize}
164 \item ``fail-on-first'' (POWER9 VSX strncpy segfaults on boundary!)
165 \item ``Twin Predication'' (covers VSPLAT, VGATHER, VSCATTER, VINDEX etc.)
166 \item SVP64: extensive ``tag'' (Vector context) augmentation
167 \item ``Context propagation'': a VLIW-like context. Allows contexts
168 to be repeatedly applied.
169 Effectively a ``hardware compression algorithm'' for ISAs.
170 \item Ultimate goal: cut down I-Cache usage, which cuts down on power
171 \item Typical GPU has its own I-Cache and small shaders.\\
172 \textit{We are a Hybrid CPU/GPU: I-Cache is not separate!}
173 \item Needs to go through OpenPOWER Foundation `approval'
174 \end{itemize}
175 }
176
177 \frame{\frametitle{maxloc}
178 \begin{itemize}
179 \item TODO
180 \end{itemize}
181 }
182 \begin{frame}[fragile]\frametitle{Pospopcount}
183 \begin{semiverbatim}
184 // Copyright (c) 2020 Robert Clausecker <fuz@fuz.su>
185 // count8 reference implementation for tests. Do not alter.
186 func count8safe(counts *[8]int, buf []uint8) \{
187 for i := range buf \{
188 for j := 0; j < 8; j++ \{
189 counts[j] += int(buf[i] >> j & 1)
190 \}
191 \}
192 \}
193
194 A simple but still hardware-paralleliseable SVP64 assembler for 8-bit input values (count8safe) is as follows:
195
196 mtspr 9, 3 # move r3 to CTR
197 setvl 3,0,8,0,1,1 # set MVL=8, VL=r3=MIN(MVL,CTR)
198 # load VL bytes (update r4 addr) but compressed (dw=8)
199 addi 6, 0, 0 # initialise all 64-bits of r6 to zero
200 sv.lbzu/pi/dw=8 *6, 1(4) # should be /lf here as well
201 # gather performs the transpose (which gets us to positional..)
202 gbbd 8,6
203 # now those bits have been turned around, popcount and sum them
204 setvl 0,0,8,0,1,1 # set MVL=VL=8
205 sv.popcntd/sw=8 *24,*8 # do the (now transposed) popcount
206 sv.add *16,*16,*24 # and accumulate in results
207 # branch back if CTR still non-zero. works even though VL=8
208 sv.bc/all 16, *0, -0x28 # reduce CTR by VL and stop if -ve
209 \end{semiverbatim}
210
211 \end{frame}
212 \frame{\frametitle{strncpy}
213 \begin{itemize}
214 \item TODO
215 \end{itemize}
216 }
217 \frame{\frametitle{linked-list walking}
218 \begin{itemize}
219 \item TODO
220 \end{itemize}
221 }
222 \frame{\frametitle{Summary}
223
224 \begin{itemize}
225 \item Goal is to create a mass-volume low-power embedded SoC suitable
226 for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
227 \item No way we could implement a project of this magnitude without
228 nmigen (being able to use python OO to HDL)
229 \item Collaboration with OpenPOWER Foundation and Members absolutely
230 essential. No short-cuts. Standards to be developed and ratified
231 so that everyone benefits.
232 \item Riding the wave of huge stability of OpenPOWER ecosystem
233 \item Greatly simplified open 3D and Video drivers reduce product
234 development costs for customers
235 \item It also happens to be fascinating, deeply rewarding, technically
236 challenging, and funded by NLnet
237
238 \end{itemize}
239 }
240
241
242 \frame{
243 \begin{center}
244 {\Huge The end\vspace{12pt}\\
245 Thank you\vspace{12pt}\\
246 Questions?\vspace{12pt}
247 }
248 \end{center}
249
250 \begin{itemize}
251 \item Discussion: http://lists.libre-soc.org
252 \item Freenode IRC \#libre-soc
253 \item http://libre-soc.org/
254 \item http://nlnet.nl/PET
255 \item https://libre-soc.org/nlnet/\#faq
256 \end{itemize}
257 }
258
259
260 \end{document}