\documentclass[slidestop]{beamer}
\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{pstricks}

\title{Simple-V RISC-V Extension for Vectorisation and SIMD}
\author{Luke Kenneth Casson Leighton}


\begin{document}

\frame{
\begin{center}
\huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
\vspace{32pt}
\Large{Flexible Vectorisation}\\
\Large{(aka not so Simple-V?)}\\
\vspace{24pt}
\Large{[proposed for] Chennai 9th RISC-V Workshop}\\
\vspace{24pt}
\large{\today}
\end{center}
}


\frame{\frametitle{Credits and Acknowledgements}

\begin{itemize}
\item The Designers of RISC-V\vspace{15pt}
\item The RVV Working Group and contributors\vspace{15pt}
\item Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
Guy Lemieux, Jonathan Neusch\"afer, Rogier Brussee,
and others\vspace{15pt}
\item ISA-Dev Group Members\vspace{10pt}
\end{itemize}
}


\frame{\frametitle{Quick refresher on SIMD}

\begin{itemize}
\item SIMD very easy to implement (and very seductive)\vspace{10pt}
\item Parallelism is in the ALU\vspace{10pt}
\item Zero-to-negligible impact on the rest of the core\vspace{10pt}
\end{itemize}
Where SIMD Goes Wrong:\vspace{10pt}
\begin{itemize}
\item See ``SIMD instructions considered harmful''\\
\url{https://www.sigarch.org/simd-instructions-considered-harmful}
\item Corner-cases alone are extremely complex.\\
Hardware is easy, but software is hell.
\item O($N^{6}$) ISA opcode proliferation!\\
opcode, elwidth, veclen, src1-src2-dest hi/lo
\end{itemize}
}

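% A hedged worked example of the O(N^6) proliferation claim above (numbers
% are illustrative, not taken from any particular ISA): with, say, 4 element
% widths, 4 fixed vector lengths and hi/lo variants on src1, src2 and dest,
% each base opcode needs roughly
%     4 (elwidth) x 4 (veclen) x 2 x 2 x 2 (src/dest hi/lo) = 256
% encodings, before even multiplying by the number of base opcodes.
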
\frame{\frametitle{Quick refresher on RVV}

\begin{itemize}
\item Extremely powerful (extensible to 256 registers)\vspace{10pt}
\item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
\item Requires a separate Register File\vspace{10pt}
\item Can be implemented as a separate pipeline\vspace{10pt}
\end{itemize}
However...\vspace{10pt}
\begin{itemize}
\item 98\% opcode duplication with the rest of RV (the exception: CLIP)
\item Extending RVV requires customisation not just of h/w:\\
gcc and s/w also need customisation (and maintenance)
\end{itemize}
}


\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}

\begin{itemize}
\item Why?
Implementors need flexibility in vectorisation to optimise for
area or performance depending on the scope:
embedded DSPs, Mobile GPUs, Server CPUs and more.\vspace{4pt}\\
Compilers also need flexibility in vectorisation to optimise for the cost
of pipeline setup, the amount of state to context-switch,
and software portability\vspace{4pt}
\item How?
By implicitly marking INT/FP regs as ``Vectorised'',\\
SV expresses how existing instructions should act
on [contiguous] blocks of registers, in parallel.\vspace{4pt}
\item What?
Simple-V is an ``API'' that implicitly extends
existing (scalar) instructions with explicit parallelisation.
\end{itemize}
}


\frame{\frametitle{How does Simple-V relate to RVV?}

\begin{itemize}
\item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
\item Simple-V abstracts parallelism (based on the best of RVV)\vspace{10pt}
\item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
\item Even Compressed instructions become vectorised\vspace{10pt}
\end{itemize}
What Simple-V is not:\vspace{10pt}
\begin{itemize}
\item A full supercomputer-level Vector Proposal
\item A replacement for RVV (SV is designed to be overridden by,\\
augmented to become, or simply replaced by, RVV)
\end{itemize}
}


\frame{\frametitle{How is Parallelism abstracted in Simple-V?}

\begin{itemize}
\item Register ``typing'' turns any op into an implicit Vector op\vspace{10pt}
\item Primarily at the Instruction issue phase (except SIMD)\\
Note: it's ok to pass predication through to ALU (like SIMD)
\item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
\end{itemize}
Notes:\vspace{6pt}
\begin{itemize}
\item All LOAD/STORE (inc. Compressed, Int/FP versions)
\item All ALU ops (soft / hybrid / full HW, on per-op basis)
\item All branches become predication targets (C.FNE added)
\item C.MV of particular interest (s/v, v/v, v/s)
\end{itemize}
}


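% A minimal sketch of what "register typing at the issue phase" means, in
% C-like pseudocode (assumes <stdint.h>/<stdbool.h>).  The names is_vector[],
% pred_reg[], issue_scalar() and read_intreg() are illustrative assumptions,
% not part of the spec: the point is only that a lookup table set up via CSRs
% turns one scalar opcode into a hardware loop over contiguous registers,
% guarded by a predicate bitmask held in an integer register.
%
%   bool     is_vector[32];   // per-register "vectorised" flag (from CSRs)
%   int      pred_reg[32];    // which int reg holds the predicate bits
%   int      VL;              // vector length (CSR)
%
%   void issue(int op, int rd, int rs1, int rs2) {
%       if (!is_vector[rd] && !is_vector[rs1] && !is_vector[rs2]) {
%           issue_scalar(op, rd, rs1, rs2);        // scalar path unchanged
%           return;
%       }
%       uint64_t pred = read_intreg(pred_reg[rd]); // bitfield, max VL = XLEN
%       for (int i = 0; i < VL; i++) {
%           if (!((pred >> i) & 1)) continue;      // NO ZEROING: just skip
%           issue_scalar(op,
%                        rd  + (is_vector[rd]  ? i : 0),
%                        rs1 + (is_vector[rs1] ? i : 0),
%                        rs2 + (is_vector[rs2] ? i : 0));
%       }
%   }
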
\frame{\frametitle{Implementation Options}

\begin{itemize}
\item Absolute minimum: Exceptions (if CSRs indicate ``V'', trap)
\item Hardware loop, single-instruction issue\\
(Do / Don't send through predication to ALU)
\item Hardware loop, parallel (multi-instruction) issue\\
(Do / Don't send through predication to ALU)
\item Hardware loop, full parallel ALU (not recommended)
\end{itemize}
Notes:\vspace{6pt}
\begin{itemize}
\item The 4 (or more?) options above may be deployed on a per-op basis
\item SIMD always sends predication bits through to ALU
\item Minimum MVL MUST be sufficient to cover regfile LD/ST
\item Instr. FIFO may repeatedly split off N scalar ops at a time
\end{itemize}
}
% Instr. FIFO may need its own slide.  Basically, the vectorised op
% gets pushed into the FIFO, where it is then "processed".  Processing
% removes the first set of ops from its vector numbering (taking
% predication into account) and shoves them **BACK** into the FIFO as
% scalars, MODIFYING the remaining "vectorised" op by subtracting the
% now-scalar ops from it.

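% A rough sketch of that FIFO behaviour in C-like pseudocode.  The entry
% layout and the helpers push_scalar()/push_back() are assumptions invented
% for illustration; only the split-and-requeue idea comes from the note above.
%
%   typedef struct {
%       int opcode, rd, rs1, rs2;
%       int vstart;      // first element not yet issued
%       int remaining;   // elements still to issue (0 means plain scalar op)
%   } fifo_entry;
%
%   // pop a vectorised entry, split off up to N scalar ops, and push the
%   // shrunk vectorised remainder back into the FIFO for a later cycle
%   void process(fifo_entry e, int N, uint64_t pred) {
%       int issued = 0;
%       while (e.remaining > 0 && issued < N) {
%           int i = e.vstart++;
%           e.remaining--;
%           if ((pred >> i) & 1) {          // predication taken into account
%               push_scalar(e.opcode, e.rd + i, e.rs1 + i, e.rs2 + i);
%               issued++;
%           }
%       }
%       if (e.remaining > 0)
%           push_back(e);                   // the modified "vectorised" op
%   }
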
\frame{\frametitle{How are SIMD Instructions Vectorised?}

\begin{itemize}
\item SIMD ALU(s) primarily unchanged\vspace{10pt}
\item Predication is added to each SIMD element (NO ZEROING!)\vspace{10pt}
\item End of Vector enables predication (NO ZEROING!)\vspace{10pt}
\end{itemize}
Considerations:\vspace{10pt}
\begin{itemize}
\item Many SIMD ALUs possible (parallel execution)\vspace{10pt}
\item Very long SIMD ALUs could waste die area (short vectors)\vspace{10pt}
\item Implementor free to choose (API remains the same)\vspace{10pt}
\end{itemize}
}
% With multiple SIMD ALUs of, for example, 32-bit width, they can be used
% either to issue 64-bit or 128-bit or 256-bit wide SIMD operations,
% or to cover several operations on totally different
% vectors / registers.

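% Sketch of how per-element predication might reach an otherwise-unchanged
% 32-bit SIMD ALU carrying 4x 8-bit elements (C-like pseudocode; the lane
% count and the helper simd_alu_add8x4() are illustrative assumptions):
%
%   // lane i of this SIMD op corresponds to vector element (base_el + i)
%   void simd_issue(uint32_t a, uint32_t b, int base_el,
%                   uint64_t pred, int VL) {
%       uint8_t lane_en = 0;
%       for (int i = 0; i < 4; i++) {
%           bool in_vector  = (base_el + i) < VL;         // end of vector
%           bool predicated = (pred >> (base_el + i)) & 1;
%           if (in_vector && predicated)
%               lane_en |= 1 << i;     // enable write-back for this lane
%       }
%       // lanes with lane_en = 0 are simply not written back (NO ZEROING)
%       simd_alu_add8x4(a, b, lane_en);
%   }
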
\frame{\frametitle{What's the deal / juice / score?}

\begin{itemize}
\item Standard Register File(s) overloaded with ``vector span''\vspace{10pt}
\item Element width and type concepts remain same as RVV\vspace{10pt}
\item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
\end{itemize}
Key differences from RVV:\vspace{10pt}
\begin{itemize}
\item Predication in INT regs as a BIT field (max VL=XLEN)
\item Minimum VL must be Num Regs - 1 (all regs single LD/ST)
\item SV may condense sparse Vecs: RVV lets ALU do predication
\item NO ZEROING: non-predicated elements are skipped
\end{itemize}
}


\frame{\frametitle{Why are overlaps allowed in Regfiles?}

\begin{itemize}
\item Same register(s) can have multiple ``interpretations''\vspace{10pt}
\item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops\vspace{10pt}
\item (32-bit GREV plus 4x8-bit SIMD plus 32-bit GREV)\vspace{10pt}
\item Same register(s) can be offset (no need for VSLIDE)\vspace{10pt}
\end{itemize}
Note:\vspace{10pt}
\begin{itemize}
\item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$)\vspace{10pt}
\item Hi-Performance: Macro-op fusion (more pipeline stages?)\vspace{10pt}
\end{itemize}
}


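% Illustrative example of the "offset, no VSLIDE" bullet.  The helper
% sv_reg_entry() and the register numbers are made up for the sketch; the
% idea is only that an offset view is just another key-value table entry,
% so no VSLIDE instruction and no data movement is needed:
%
%   // hypothetical CSR-setup helper: "when an op names regkey, treat it
%   // as a vector starting at startreg"
%   sv_reg_entry(/*regkey=*/16, /*startreg=*/16);  // x16 -> vector at x16
%   sv_reg_entry(/*regkey=*/2,  /*startreg=*/18);  // x2  -> vector at x18
%   // with VL = 6, an op naming x2 covers x18..x23, i.e. x16's vector
%   // offset by two elements: VSLIDE behaviour for free
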
\frame{\frametitle{Why no Zeroing (place zeros in non-predicated elements)?}

\begin{itemize}
\item Zeroing is an implementation optimisation favouring OoO\vspace{8pt}
\item Simple implementations may skip non-predicated operations\vspace{8pt}
\item With zeroing, simple implementations have to explicitly destroy data\vspace{8pt}
\item Complex implementations may use reg-renames to save power\\
Zeroing on predication chains makes that optimisation harder
\end{itemize}
Considerations:\vspace{10pt}
\begin{itemize}
\item Complex not really impacted, Simple impacted a LOT
\item Overlapping ``Vectors'' may issue overlapping ops
\item Please don't use Vectors for ``security'' (use Sec-Ext)
\end{itemize}
}
% with overlapping "vectors" - bearing in mind that "vectors" are
% just a remap onto the standard register file, if the top bits of
% predication are zero, and there happens to be a second vector
% that uses some of the same register file that happens to be
% predicated out, the second vector op may be issued *at the same time*
% if there are available parallel ALUs to do so.


\frame{\frametitle{Predication key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{10pt}
\item register to be predicated if referred to (5 bits, key)\vspace{10pt}
\item register to store actual predication in (5 bits, value)\vspace{10pt}
\item predication is inverted (1 bit)\vspace{10pt}
\end{itemize}
Notes:\vspace{10pt}
\begin{itemize}
\item Table should be expanded out for high-speed implementations
\item Multiple ``keys'' (and values) theoretically permitted
\item RVV rules about deleting higher-indexed CSRs followed
\end{itemize}
}


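% One possible packing of a single predication-table entry, taken directly
% from the bullets above (the field order, the struct name and the 16-bit
% container are assumptions for illustration only):
%
%   typedef struct {
%       unsigned regfile  : 1;  // key: 0 = int regfile, 1 = FP regfile
%       unsigned regkey   : 5;  // key: register that becomes predicated
%                               //      when an op refers to it
%       unsigned predreg  : 5;  // value: int register holding the
%                               //        predication bitfield
%       unsigned inverted : 1;  // apply the predicate inverted
%   } sv_pred_entry;            // 12 bits used of a 16-bit CSR slot
%
%   // "expanded out for high-speed implementations" would then mean keeping
%   // a direct-indexed shadow copy, e.g. pred_of[regfile][regkey]
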
\frame{\frametitle{Register key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{10pt}
\item register to be treated as a vector if referred to (5 bits, key)\vspace{10pt}
\item starting register to actually be used (5 bits, value)\vspace{10pt}
\item TODO\vspace{10pt}
\end{itemize}
Notes:\vspace{10pt}
\begin{itemize}
\item Table should be expanded out for high-speed implementations
\item Multiple ``keys'' (and values) theoretically permitted
\item RVV rules about deleting higher-indexed CSRs followed
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{ADD pseudocode (or trap, or actual hardware loop)}

\begin{semiverbatim}
function op_add(rd, rs1, rs2, predr) # add not VADD!
  int i, id=0, irs1=0, irs2=0;
  for (i = 0; i < MIN(VL, vectorlen[rd]); i++)
    if (ireg[predr] & 1<<i) # predication uses intregs
       ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
    if (reg_is_vectorised[rd])  \{ id += 1; \}
    if (reg_is_vectorised[rs1]) \{ irs1 += 1; \}
    if (reg_is_vectorised[rs2]) \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item SIMD slightly more complex (case above is elwidth = default)
\item Scalar-scalar and scalar-vector and vector-vector now all in one
\item OoO may choose to push ADDs into instr. queue (v. busy!)
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Predication-Branch (or trap, or actual hardware loop)}

\begin{semiverbatim}
s1 = vectorlen[src1] > 1;
s2 = vectorlen[src2] > 1;
for (int i = 0; i < VL; ++i)
   preg[rs3] |= 1 << cmp(s1 ? reg[src1+i] : reg[src1],
                         s2 ? reg[src2+i] : reg[src2]);
\end{semiverbatim}

\begin{itemize}
\item SIMD slightly more complex (case above is elwidth = default)
\item If s1 and s2 are both scalars, a standard branch occurs
\item Predication stored in integer regfile as a bitfield
\item Scalar-vector and vector-vector supported
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{VLD/VLD.S/VLD.X (or trap, or actual hardware loop)}

\begin{semiverbatim}
if (unit-strided) stride = elsize;
else stride = areg[as2]; // constant-strided
for (int i = 0; i < VL; ++i)
  if (preg_enabled[rd] && ([!]preg[rd] & 1<<i))
    for (int j = 0; j < seglen+1; j++)
      if (vectorised[rs2]) offs = vreg[rs2][i]
      else offs = i*(seglen+1)*stride;
      vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
\end{semiverbatim}

\begin{itemize}
\item Again: SIMD slightly more complex
\item rs2 vectorised taken to implicitly indicate VLD.X
\end{itemize}
\end{frame}


\frame{\frametitle{C.MV extremely flexible!}

\begin{itemize}
\item scalar-to-vector (w/no pred): VSPLAT
\item scalar-to-vector (w/dest-pred): Sparse VSPLAT
\item scalar-to-vector (w/single dest-pred): VINSERT
\item vector-to-scalar (w/src-pred): VEXTRACT
\item vector-to-vector (w/no pred): Vector Copy
\item vector-to-vector (w/src xor dest pred): Sparse Vector Copy
\item vector-to-vector (w/src and dest pred): Vector Gather/Scatter
\end{itemize}
\vspace{8pt}
Notes:\vspace{10pt}
\begin{itemize}
\item Really powerful!
\item Any other options?
\end{itemize}
}


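% A hedged sketch of why a single twin-predicated MV loop covers all of the
% cases above (C-like pseudocode; is_vector[], reg[] and the loop structure
% are illustrative assumptions, not the definitive semantics).  Scalar
% operands do not step, vector operands step past their predicated-out
% elements, and the sketch assumes at least one operand is a vector:
%
%   void c_mv(int rd, int rs, uint64_t dpred, uint64_t spred, int VL) {
%       int id = 0, is = 0;
%       while (id < VL && is < VL) {
%           if (is_vector[rd] && !((dpred >> id) & 1)) { id++; continue; }
%           if (is_vector[rs] && !((spred >> is) & 1)) { is++; continue; }
%           reg[rd + (is_vector[rd] ? id : 0)] =
%               reg[rs + (is_vector[rs] ? is : 0)];
%           id++; is++;
%       }
%   }
%
%   // scalar rs, vector rd, dpred all-ones       -> VSPLAT
%   // scalar rs, vector rd, sparse dpred         -> Sparse VSPLAT / VINSERT
%   // vector rs, scalar rd, single spred bit     -> VEXTRACT
%   // vector rs, vector rd, both predicated      -> Vector Gather/Scatter
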
\frame{\frametitle{Opcodes, compared to RVV}

\begin{itemize}
\item All integer and FP opcodes removed (no CLIP!)\vspace{8pt}
\item VMPOP, VFIRST etc. all removed (use xBitManip)\vspace{8pt}
\item VSLIDE removed (use regfile overlaps)\vspace{8pt}
\item C.MV covers VEXTRACT, VINSERT and VSPLAT (and more)\vspace{8pt}
\item VSETVL, VGETVL, VSELECT stay\vspace{8pt}
\item Issue: VCLIP is not in RV* (add with custom ext?)\vspace{8pt}
\item Vector (or scalar-vector) moves use C.MV (MV is a pseudo-op)\vspace{8pt}
\item VMERGE: twin predicated C.MVs (one inverted; macro-op'd)\vspace{8pt}
\end{itemize}
}


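% Sketch of the VMERGE bullet above, reusing the illustrative c_mv() from
% the earlier comment (still an assumption, not the definitive encoding):
% two vector-vector C.MVs under the same predicate, the second inverted,
% give vd[i] = pred[i] ? va[i] : vb[i], and a macro-op-fusion capable core
% may fuse the pair:
%
%   c_mv(vd, va,  pred,  pred, VL);   // elements where the predicate is set
%   c_mv(vd, vb, ~pred, ~pred, VL);   // elements where it is clear
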
\frame{\frametitle{Under consideration}

\begin{itemize}
\item Is C.FNE actually needed? Should it be added if it is?
\item Is detection of all-scalar ops ok (without slowing the pipeline)?
\item Can VSELECT be removed? (it's really complex)
\item Can CLIP be done as a CSR (mode, like elwidth)?
\item SIMD saturation (etc.) also set as a mode?
\item C.MV src predication no different from dest predication\\
What to do? Make one have a different meaning?
\item For 8/16-bit ops, is it worthwhile adding a ``start offset''?\\
(a bit like misaligned addressing... for registers)\\
or just use predication to skip the start?
\end{itemize}
}


\frame{\frametitle{Summary}

\begin{itemize}
\item Designed for simplicity (graded levels of complexity)\vspace{10pt}
\item Fits the RISC-V ethos: do more with less\vspace{10pt}
\item Reduces SIMD ISA proliferation by 3-4 orders of magnitude \\
(without the SIMD downsides, and without sacrificing speed)\vspace{10pt}
\item Covers 98\% of RVV, allows RVV to fit ``on top''\vspace{10pt}
\item Huge range of implementor freedom and flexibility\vspace{10pt}
\item Not designed for supercomputing (that's RVV); designed for
everything in between: DSPs, RV32E, Embedded 3D GPUs etc.\vspace{10pt}
\end{itemize}
}


% blank slide template (placeholder):
% \frame{\frametitle{slide}
%
% \begin{itemize}
% \item \vspace{10pt}
% \end{itemize}
% Considerations:\vspace{10pt}
% \begin{itemize}
% \item \vspace{10pt}
% \end{itemize}
% }


\frame{
\begin{center}
{\Huge \textcolor{red}{The end}}
\end{center}
}


\end{document}