update opf 2021 slides
[libreriscv.git] / conferences / openpower2021 / openpower_2021.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5
6 \graphicspath{{./}}
7
8 \title{The Libre-SOC Hybrid 3D CPU}
9 \author{Luke Kenneth Casson Leighton}
10
11
12 \begin{document}
13
14 \frame{
15 \begin{center}
16 \huge{The Libre-SOC Hybrid 3D CPU}\\
17 \vspace{32pt}
18 \Large{Draft SVP64 in-place Matrix Multiply}\\
19 \Large{and FFT / DCT for the Power ISA}\\
20 \vspace{24pt}
21 \Large{OpenPOWER Summit 2021}\\
22 \vspace{16pt}
23 \large{Sponsored by NLnet's PET Programme}\\
24 \vspace{6pt}
25 \large{28th Oct 2021}
26 \end{center}
27 }
28
29
30 \frame{\frametitle{}
31
32 \vspace{15pt}
33
34 \begin{itemize}
35 \item \vspace{15pt}
36 \item \vspace{15pt}
37 \item \vspace{15pt}
38 \end{itemize}
39 }
40
41 \frame{\frametitle{Overview of Libre-SOC goals}
42
43 \vspace{15pt}
44
45 \begin{itemize}
46 \item To create power-efficient mass-volume products\vspace{15pt}
47 \item To leverage the OpenPOWER ecosystem to do so\vspace{15pt}
48 \item To be entirely transparent for Security reasons\vspace{15pt}
49 \item To empower businesses to bring Secure transparent\\
50 mass-volume products to market\vspace{15pt}
51 \end{itemize}
52 }
53
54 \frame{\frametitle{Overview of SVP64 goals}
55
56 \vspace{15pt}
57
58 \begin{itemize}
59 \item High performance and high performance/watt\vspace{15pt}
60 \item Reduced code density (reduced I-Cache usage)\\
61 https://arxiv.org/abs/2002.10143 - 3.5x power reduction\vspace{8pt}
62 \item Remain accessible for assembler writers and compilers alike\vspace{15pt}
63 \item Introduce true Vectorisation to the Power ISA\\
64 (VSX is Packed SIMD)\vspace{8pt}
65 \item Be adopted via the external OPF ISA WG RFC process\\
66 (not: be a non-official custom extension. proprietary\\
67 custom extensions conflict with mass-volume adoption)\vspace{15pt}
68 \end{itemize}
69 }
70
71
72
73 \begin{frame}[fragile]
74 \frametitle{Reminder of Simple-V}
75
76 \begin{semiverbatim}
77 https://libre-soc.org/openpower/sv/overview/
78 Greatly simplified (like x86 "REP" instruction):
79
80  for (i = 0; i < VL; i++)
81    GPR[RT+i] <= GPR[RA+i] + GPR[RB+i];
82
83 function op\_add(RT, RA, RB, predr) # add not VADD!
84  int i, id=0, irs1=0, irs2=0;
85  for (i = 0; i < VL; i++)
86   if (GPR[predr] & 1<<i) # predication
87    GPR[RT+id] <= GPR[RA+irs1] + GPR[RB+irs2];
88 if (reg\_is\_vectorised[RT])  \{ id += 1; \}
89 if (reg\_is\_vectorised[RA])  \{ irs1 += 1; \}
90 if (reg\_is\_vectorised[RB])  \{ irs2 += 1; \}
91 \end{semiverbatim}
92
93 \end{frame}
94
95
96 \begin{frame}[fragile]
97 \frametitle{SVP64 REMAP system}
98
99 \begin{semiverbatim}
100 Register offsets are "REMAP"ed through a Hardware FSM
101 https://libre-soc.org/openpower/sv/remap/
102 remarkably similar to ZOLC
103 https://www.researchgate.net/publication/224647569
104
105 function op\_add(RT, RA, rs2, predr) # add not VADD!
106  int i, id=0, irs1=0, irs2=0;
107  for (i = 0; i < VL; i++)
108   if (GPR[predr] & 1<<i) # predication
109    GPR[RT+REMAP(id)] <= GPR[RA+REMAP(irs1)] +
110 GPR[rs2+REMAP(irs2)];
111 if (reg\_is\_vectorised[RT])  \{ id += 1; \}
112 if (reg\_is\_vectorised[RA])  \{ irs1 += 1; \}
113 if (reg\_is\_vectorised[s2])  \{ irs2 += 1; \}
114 \end{semiverbatim}
115
116 \end{frame}
117
118 \begin{frame}[fragile]
119 \frametitle{Matrix Multiply: Basics}
120
121 \begin{semiverbatim}
122 (a00 a01 a02 x (b00 b01 =
123 a10 a11 a12) b10 b11
124 b20 b21)
125
126 (a00*b00 + a01*b10 + a02*b20 a00*b01 + a01*b11 + a02*b21
127 a10*b00 + a11*b10 + a12*b20 a10*b01 + a11*b11 + a12*b21)
128
129 (b00 b01 x (a00 a01 a02 =
130 b10 b11 a10 a11 a12)
131 b20 b21)
132
133 (b00*a00 + b01*a10 b00*a01 + b01*a11 b00*a02 + b01*a12
134 b10*a00 + b11*a10 b10*a01 + b11*a11 b10*a02 + b11*a12
135 b20*a00 + b21*a10 b20*a01 + b21*a11 b20*a02 + b21*a12)
136
137 \end{semiverbatim}
138
139 \end{frame}
140
141
142 \begin{frame}[fragile]
143 \frametitle{Matrix Multiply: naive, with python for-loops}
144
145 \begin{semiverbatim}
146 result = [] # final result
147 for i in range(len(A)):
148
149 row = [] # the new row in new matrix
150 for j in range(len(B[0])):
151
152 product = 0 # the new element in the new row
153 for v in range(len(A[i])):
154 product += A[i][v] * B[v][j]
155 row.append(product) # add sum of product to new row
156
157 result.append(row) # add new row into final result
158 \end{semiverbatim}
159
160 \end{frame}
161
162 \begin{frame}[fragile]
163 \frametitle{Matrix Multiply: suitable for Hardware scheduling}
164
165 \begin{semiverbatim}
166 Unsuitable: creates massive Read-After-Write chains
167
168 for i in range(len(A)):
169 for j in range(len(B[0])):
170 for v in range(len(A[i])):
171 product[i][j] += A[i][v] * B[v][j]
172
173 Suitable: can be parallelised / pipelined. RaW avoided
174
175 for i in range(len(A)):
176 for v in range(len(A[i])): # swapped
177 for j in range(len(B[0])): # with this
178 product[i][j] += A[i][v] * B[v][j]
179
180 \end{semiverbatim}
181
182 \end{frame}
183
184
185 \frame{\frametitle{Matrix Multiply: Generalise but Specialise}
186
187 \vspace{8pt}
188
189 \begin{itemize}
190 \item Why not make a general-purpose nested "Loop" system?\\
191 - Other uses (algorithms) beyond Matrix Multiplication\\
192 - Allow any arbitrary-sized loops\\
193 - Allow any permutation of nesting\\
194 - Allow reversing per-dimension\vspace{5pt}
195 \item Specialise by making Matrix Multiply "setup" quick/easy\\
196 - two 32-bit instructions to set up A, B, C sizes\\
197 - one 64-bit SVP64 FMAC instruction.\\
198 - Nothing else needed. Saves on I-Cache\vspace{5pt}
199 \item Hardware turns out to be near-identical to ZOLC\\
200 https://opencores.org/projects/hwlu\\
201 https://libre-soc.org/openpower/sv/remap/\vspace{15pt}
202 \end{itemize}
203 }
204
205 \begin{frame}[fragile]
206 \frametitle{Matrix Multiply: unit test / example}
207
208 \begin{semiverbatim}
209 94 def test_sv_remap2(self):
210 95 lst = ["svshape 5, 4, 3, 0, 0",
211 96 "svremap 0b11111, 1, 2, 3, 0, 0, 0, 0",
212 97 "sv.fmadds 0.v, 8.v, 16.v, 0.v"
213 98 ]
214 99 REMAP fmadds FRT, FRA, FRC, FRB
215
216 svshape 5, 4, 3, 0, 0 => A: 3x5 B: 3x4
217 => C: 3x3
218 svremap (enable) (F)RS, (F)RT, (F)RA, (F)RB, (F)RC
219 sv.fmadds: uses fp0 as accumulator
220 product[i][j] += A[i][v] * B[v][j]
221 \end{semiverbatim}
222
223 \end{frame}
224
225 \frame{\frametitle{Matrix Multiply: Ehm that's all Folks}
226
227 \vspace{6pt}
228
229 \begin{itemize}
230 \item Really is that straightforward: no actual Vector ops\\
231 - Does not dictate or limit micro-architectural detail\\
232 - Issues Scalar FMACs into existing back-end hardware\\
233 - Can use any 4-operand instruction (GF, INT, Bitmanip)\\
234 - No Power-2 limits. Any operand width (8/16/32/64)\vspace{8pt}
235 \item Limited to 127 scalar ops and in-place registers. Future?\\
236 - https://arxiv.org/abs/2002.10143 CISC-like load-and-inc\\
237 - Auto-load/store (tagged) registers, keeps RISC ISA\\
238 - Extend to memory-based arbitrary NxN matrix sizes\\
239 - Still power-efficient: no I-cache usage during FMAC issue\vspace{8pt}
240 \item Future can be investigated as part of EUR 22.6m EU Grant\\
241 https://libre-soc.org/SEP-210803722-Libre-SOC-8-core/\vspace{15pt}
242 \end{itemize}
243 }
244
245
246 \frame{\frametitle{DCT / FFT / DFT / NTT: what if we could REMAP?}
247
248 \vspace{6pt}
249
250 \begin{itemize}
251 \item Can we create a REMAP Schedule for FFT (etc)? YES\\
252 - More complicated than Matrix Schedules but same principle\\
253 - Again: issues Scalar instructions into back-end micro-arch\\
254 - Requires 5-operand (3-in, 2-out) new Scalar Instructions\\
255 - Any operand width (8/16/32/64)\vspace{8pt}
256 \item Limited to in-place registers and Power-of-Two. Future?\\
257 - Again: CISC-like auto-load/store-and-increment\\
258 - https://arxiv.org/abs/2002.10143\\
259 - Again: still power-efficient (no I-Cache usage in loops)\vspace{8pt}
260 \item Again: can be investigated as part of EUR 22.6m EU Grant\\
261 https://libre-soc.org/SEP-210803722-Libre-SOC-8-core/\vspace{15pt}
262 \end{itemize}
263 }
264
265
266 \frame{\frametitle{Discrete Cosine Transform (DCT): Basics}
267
268 \begin{itemize}
269 \item Standard DCT Schedule (messy, impossible for SIMD)
270 \item Output is in bit-reversed order\\
271 0b000 = 0b000 (in: 0 out: 0)\\
272 0b001 = 0b100 (in: 1 out: 4) ...\\
273 0b110 = 0b011 (in: 6 out: 3)\\
274 0b111 = 0b111 (in: 7 out: 7)
275 \end{itemize}
276
277 \begin{center}
278 \includegraphics[width=0.75\textwidth]{plonka_dct1.png}
279 \end{center}
280
281 }
282
283 \frame{\frametitle{Fast Fourier Transform (FFT/DFT): Butterfly Basics}
284
285 \begin{itemize}
286 \item Standard Butterfly Schedule (again: messy, but less so)
287 \item Output, again, is in bit-reversed order
288 \end{itemize}
289
290 \begin{center}
291 \includegraphics[width=0.70\textwidth]{fft_butterfly.png}
292 \end{center}
293
294 }
295
296 \frame{\frametitle{FFT/DFT: 3-in, 2-out butterfly}
297
298 \begin{itemize}
299 \item One multiply (by coefficient), one add, one subtract
300 \item inputs: X[0] X[1] C(oeff) outputs: X[0] X[1]
301 \end{itemize}
302
303 \begin{center}
304 \includegraphics[width=0.90\textwidth]{2-point-dft.png}
305 \end{center}
306
307 }
308
309
310 \begin{frame}[fragile]
311 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (1)}
312
313 \begin{semiverbatim}
314 coef = (2 if inverse else -2) * cmath.pi / n
315 exptable = [cmath.rect(1, i*coef) for i in range(n // 2)]
316 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
317 size = 2
318 while size <= n:
319 halfsize, tablestep = size // 2, n // size
320 for i in range(0, n, size):
321 k = 0
322 for j in range(i, i + halfsize):
323 temp = vec[j + halfsize] * exptable[k]
324 vec[j + halfsize] = vec[j] - temp
325 vec[j] += temp
326 k += tablestep
327 size *= 2
328 \end{semiverbatim}
329
330 \end{frame}
331
332
333 \begin{frame}[fragile]
334 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (2)}
335
336 \begin{semiverbatim}
337 coef = (2 if inverse else -2) * cmath.pi / n
338 exptable = [cmath.rect(1, i*coef) for i in range(n // 2)]
339 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
340 size = 2
341 while size <= n:
342 hs, tablestep = size // 2, n // size
343 for i in range(0, n, size):
344 k = 0
345 for j in range(i, i+hs):
346 # Twin-Butterfly 3-in 2-out: one instruction
347 C = exptable[k]
348 vec[j+hs], vec[j] = 2B(vec[j+hs], vec[j], C)
349 k += tablestep
350 size *= 2
351 \end{semiverbatim}
352
353 \end{frame}
354
355 \begin{frame}[fragile]
356 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (3)}
357
358 \begin{itemize}
359 \item What if the Triple Loop could be done with REMAP?
360 \item Register Offsets j, j+hs, k created automatically?
361 \item Only one actual inner loop instruction (Twin-butterfly)
362 \item 3-in (X0/X1/C) 2-out (X0/X1) allows for in-place FFT
363 \item Hardware FSM (like ZOLC) creates offset triplet\\
364 - Technically not that hard to implement (for Radix-2)\\
365 - Exact same principle as Triple-loop for Matrices
366 \end{itemize}
367
368 \begin{semiverbatim}
369 for j,k,hs in REMAP_TRIPLE_LOOP_GENERATOR():
370 # Twin-Butterfly 3-in 2-out: one instruction
371 C = exptable[k]
372 vec[j+hs], vec[j] = 2B(vec[j+hs], vec[j], C)
373 \end{semiverbatim}
374
375 \end{frame}
376
377 \frame{\frametitle{DCT: pre-arrange (pre-load) data}
378
379 \begin{itemize}
380 \item Arrange input data such that output falls into place
381 \item (another) Twin 3-in 2-out Mul-Add in-place instruction
382 \end{itemize}
383
384 \begin{center}
385 \includegraphics[width=0.70\textwidth]{dct_butterfly.png}
386 \end{center}
387
388 }
389
390
391 \frame{\frametitle{FFT (Complex numbers) and DCT coefficients?}
392
393 \begin{itemize}
394 \item Problem (DCT): DCT Cosine Coefficients change (cos + 0.5)
395 depending on the layer. Cannot do as single instruction
396 \item Problem (FFT): Complex number butterfly multiplication involves
397 4 multiplies. Cannot do in-place as single instruction\vspace{12pt}
398 \item Solution: "Vertical-First" style Vectors\vspace{12pt}
399 \item Understanding of SVP64 "Vertical-First"\\
400 30min explanatory video https://youtube.com/watch?v=fn2KJvWyBKg
401 \item Basically involves stepping "vertically" through instructions
402 then moving ("stepping") to the next offset (next REMAP)
403 \item Horizontal-first: run through the entire REMAP schedule on a single
404 instruction before moving on to the next instruction
405
406 \end{itemize}
407 }
408
409 \frame{\frametitle{TODO rewrite Summary}
410
411 \begin{itemize}
412 \item Goal is to create a mass-volume low-power embedded SoC suitable
413 for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
414 \item No DRM. 'Trustable' (by the users, not by Media Moguls) design
415 ethos as a \textit{business} objective: requires full transparency
416 as well as Formal Correctness Proofs
417 \item Collaboration with OpenPOWER Foundation and Members absolutely
418 essential. No short-cuts. Standards to be developed and ratified
419 so that everyone benefits.
420 \item Working on the back of huge stability of POWER ecosystem
421 \item Combination of which is that Board Support Package is 100\%
422 upstream, app and product development by customer is hugely
423 simplified and much more attractive
424
425 \end{itemize}
426 }
427
428
429 \frame{
430 \begin{center}
431 {\Huge The end\vspace{15pt}\\
432 Thank you\vspace{15pt}\\
433 Questions?\vspace{15pt}
434 }
435 \end{center}
436
437 \begin{itemize}
438 \item Discussion: Libre-SOC-dev mailing list
439 \item Freenode IRC \#libre-soc
440 \item http://libre-soc.org/
441 \item http://nlnet.nl/PET
442 \end{itemize}
443 }
444
445
446 \end{document}