\documentclass[slidestop]{beamer}
\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{pstricks}

\title{Simple-V RISC-V Extension for Vectorisation and SIMD}
\author{Luke Kenneth Casson Leighton}


\begin{document}

\frame{
\begin{center}
\huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
\vspace{32pt}
\Large{Flexible Vectorisation}\\
\Large{(aka not so Simple-V?)}\\
\Large{(aka How to Parallelise the RISC-V ISA)}\\
\vspace{24pt}
\Large{[proposed for] Chennai 9th RISC-V Workshop}\\
\vspace{16pt}
\large{\today}
\end{center}
}


\frame{\frametitle{Credits and Acknowledgements}

\begin{itemize}
\item The Designers of RISC-V\vspace{15pt}
\item The RVV Working Group and contributors\vspace{15pt}
\item Allen Baum, Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
Guy Lemurieux, Jonathan Neuschafer, Roger Brussee,
and others\vspace{15pt}
\item ISA-Dev Group Members\vspace{10pt}
\end{itemize}
}

\frame{\frametitle{Quick refresher on SIMD}

\begin{itemize}
\item SIMD very easy to implement (and very seductive)\vspace{8pt}
\item Parallelism is in the ALU\vspace{8pt}
\item Zero-to-negligible impact on rest of core\vspace{8pt}
\end{itemize}
Where SIMD Goes Wrong:\vspace{10pt}
\begin{itemize}
\item See "SIMD instructions considered harmful"
https://sigarch.org/simd-instructions-considered-harmful
\item Setup and corner-cases alone are extremely complex.\\
Hardware is easy, but software is hell.
\item O($N^{6}$) ISA opcode proliferation!\\
opcode, elwidth, veclen, src1-src2-dest hi/lo
\end{itemize}
}

\frame{\frametitle{Quick refresher on RVV}

\begin{itemize}
\item Extremely powerful (extensible to 256 registers)\vspace{10pt}
\item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
\item Requires a separate Register File (16 w/ext to 256)\vspace{10pt}
\item Implemented as a separate pipeline (no impact on scalar)\vspace{10pt}
\end{itemize}
However...\vspace{10pt}
\begin{itemize}
\item 98 percent opcode duplication with rest of RV (CLIP)
\item Extending RVV requires customisation not just of h/w:\\
gcc, binutils also need customisation (and maintenance)
\end{itemize}
}


\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}

\begin{itemize}
\item Why?
Implementors need flexibility in vectorisation to optimise for
area or performance depending on the scope:
embedded DSP, Mobile GPUs, Server CPUs and more.\vspace{4pt}\\
Compilers also need flexibility in vectorisation to optimise for cost
of pipeline setup, amount of state to context switch
and software portability.\vspace{4pt}
\item How?
By marking INT/FP regs as "Vectorised" and
adding a level of indirection,
SV expresses how existing instructions should act
on [contiguous] blocks of registers, in parallel.\vspace{4pt}
\item What?
Simple-V is an "API" that implicitly extends
existing (scalar) instructions with explicit parallelisation\\
(i.e. SV is actually about parallelism NOT vectors per se)
\end{itemize}
}


\frame{\frametitle{What's the value of SV? Why adopt it even in non-V?}

\begin{itemize}
\item memcpy becomes much smaller (higher bang-per-buck)
\item context-switch (LOAD/STORE multiple): 1-2 instructions
\item Compressed instrs further reduce I-cache usage (etc.)
\item Greatly-reduced I-cache load (and fewer reads)
\item Amazingly, SIMD becomes (more) tolerable\\
(corner-cases for setup and teardown are gone)
\item Modularity/Abstraction in both the h/w and the toolchain.
\end{itemize}
Note:
\begin{itemize}
\item It's not just about Vectors: it's about instruction effectiveness
\item Anything that makes SIMD tolerable has to be a good thing
\item Anything an implementor is not interested in HW-optimising\\
can fall through to exceptions (implement as a trap).
\end{itemize}
}


\frame{\frametitle{How does Simple-V relate to RVV? What's different?}

\begin{itemize}
\item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
\item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
\item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
\item Even Compressed instructions become vectorised (RVV can't)\vspace{10pt}
\end{itemize}
What Simple-V is not:\vspace{10pt}
\begin{itemize}
\item A full supercomputer-level Vector Proposal
\item A replacement for RVV (SV is designed to be overridden\\
by - or augmented to become - RVV)
\end{itemize}
}

\frame{\frametitle{How is Parallelism abstracted in Simple-V?}

\begin{itemize}
\item Register "typing" turns any op into an implicit Vector op:\\
registers are reinterpreted through a level of indirection
\item Primarily at the Instruction issue phase (except SIMD)\\
Note: it's ok to pass predication through to ALU (like SIMD)
\item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
\end{itemize}
Note: EVERYTHING is parallelised:
\begin{itemize}
\item All LOAD/STORE (inc. Compressed, Int/FP versions)
\item All ALU ops (soft / hybrid / full HW, on per-op basis)
\item All branches become predication targets (C.FNE added?)
\item C.MV of particular interest (s/v, v/v, v/s)
\item FCVT, FMV, FSGNJ etc. very similar to C.MV
\end{itemize}
}

\frame{\frametitle{Implementation Options}

\begin{itemize}
\item Absolute minimum: Exceptions (if CSRs indicate "V", trap)
\item Hardware loop, single-instruction issue\\
(Do / Don't send through predication to ALU)
\item Hardware loop, parallel (multi-instruction) issue\\
(Do / Don't send through predication to ALU)
\item Hardware loop, full parallel ALU (not recommended)
\end{itemize}
Notes:\vspace{6pt}
\begin{itemize}
\item 4 (or more?) options above may be deployed on a per-op basis
\item SIMD always sends predication bits through to ALU
\item Minimum MVL MUST be sufficient to cover regfile LD/ST
\item Instr. FIFO may repeatedly split off N scalar ops at a time
\end{itemize}
}
% Instr. FIFO may need its own slide. Basically, the vectorised op
% gets pushed into the FIFO, where it is then "processed". Processing
% will remove the first set of ops from its vector numbering (taking
% predication into account) and shove them **BACK** into the FIFO,
% MODIFYING the remaining "vectorised" op, subtracting the now-scalar
% ops from it.
% (an illustrative C sketch of this splitting follows the FIFO diagram slide)

\frame{\frametitle{Predicated 8-parallel ADD: 1-wide ALU}
\begin{center}
\includegraphics[height=2.5in]{padd9_alu1.png}\\
{\bf \red Predicated adds are shuffled down: 6 cycles in total}
\end{center}
}


\frame{\frametitle{Predicated 8-parallel ADD: 4-wide ALU}
\begin{center}
\includegraphics[height=2.5in]{padd9_alu4.png}\\
{\bf \red Predicated adds are shuffled down: 4 in 1st cycle, 2 in 2nd}
\end{center}
}


\frame{\frametitle{Predicated 8-parallel ADD: 3 phase FIFO expansion}
\begin{center}
\includegraphics[height=2.5in]{padd9_fifo.png}\\
{\bf \red First cycle takes first four 1s; second takes the rest}
\end{center}
}
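

\begin{frame}[fragile]
\frametitle{FIFO expansion: illustrative C sketch}

A minimal, compilable C model of the Instr. FIFO splitting above
(sketch only: the struct fields, ISSUE\_WIDTH and the test predicate
are assumptions, not part of the proposal):

\begin{semiverbatim}
#include <stdint.h>
#include <assert.h>
#define ISSUE_WIDTH 4   // assumption: 4 scalar ops per cycle
typedef struct \{ int rd, rs1, rs2, next, vl;
                 uint64_t pred; \} vec_op;
static int ops_issued = 0;
static void issue_scalar_add(int rd, int rs1, int rs2) \{
  (void)rd; (void)rs1; (void)rs2; // stand-in for real issue stage
  ops_issued++;
\}
static int fifo_step(vec_op *op) \{  // one "processing" pass
  int issued = 0;
  while (op->next < op->vl && issued < ISSUE_WIDTH) \{
    if (op->pred & (1ULL << op->next)) \{ // skip masked elems
      issue_scalar_add(op->rd + op->next,
                       op->rs1 + op->next, op->rs2 + op->next);
      issued++;
    \}
    op->next++;             // remaining vector op is MODIFIED
  \}
  return op->next < op->vl; // non-zero: re-queue in the FIFO
\}
int main(void) \{
  vec_op add = \{ 8, 16, 24, 0, 8, 0xb7 \}; // VL=8, sparse pred
  int cycles = 0;
  do \{ cycles++; \} while (fifo_step(&add));
  assert(cycles == 2 && ops_issued == 6);
  return 0;
\}
\end{semiverbatim}

\begin{itemize}
\item Test values mirror the diagram: four predicated adds issue in
cycle 1, the remaining two in cycle 2
\end{itemize}
\end{frame}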


\frame{\frametitle{How are SIMD Instructions Vectorised?}

\begin{itemize}
\item SIMD ALU(s) primarily unchanged\vspace{6pt}
\item Predication is added to each SIMD element\vspace{6pt}
\item Predication bits sent in groups to the ALU\vspace{6pt}
\item End of Vector enables (additional) predication\vspace{10pt}
\end{itemize}
Considerations:\vspace{4pt}
\begin{itemize}
\item Many SIMD ALUs possible (parallel execution)
\item Implementor free to choose (API remains the same)
\item Unused ALU units wasted, but s/w DRASTICALLY simpler
\item Very long SIMD ALUs could waste significant die area
\end{itemize}
}
% With multiple SIMD ALUs (at, for example, 32-bit wide) they can be
% used either to issue 64-bit or 128-bit or 256-bit wide SIMD operations,
% or to cover several operations on totally different
% vectors / registers.

\frame{\frametitle{Predicated 9-parallel SIMD ADD}
\begin{center}
\includegraphics[height=2.5in]{padd9_simd.png}\\
{\bf \red 4-wide 8-bit SIMD, 4 bits of predicate passed to ALU}
\end{center}
}


\frame{\frametitle{What's the deal / juice / score?}

\begin{itemize}
\item Standard Register File(s) overloaded with CSR "reg is vector"\\
(see pseudocode slides for examples)
\item Element width (and type?) concepts remain the same as in RVV\\
(CSRs give new size (and meaning?) to elements in registers)
\item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
\end{itemize}
Key differences from RVV:\vspace{10pt}
\begin{itemize}
\item Predication in INT regs as a BIT field (max VL=XLEN)
\item Minimum VL must be Num Regs - 1 (all regs single LD/ST)
\item SV may condense sparse Vecs: RVV lets ALU do predication
\item Choice to Zero or skip non-predicated elements
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{ADD pseudocode (or trap, or actual hardware loop)}

\begin{semiverbatim}
function op\_add(rd, rs1, rs2, predr) # add not VADD!
  int i, id=0, irs1=0, irs2=0;
  for (i = 0; i < VL; i++)
    if (ireg[predr] & 1<<i) # predication uses intregs
       ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
    if (reg\_is\_vectorised[rd])  \{ id += 1; \}
    if (reg\_is\_vectorised[rs1]) \{ irs1 += 1; \}
    if (reg\_is\_vectorised[rs2]) \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item Above is oversimplified: Reg. indirection left out (for clarity).
\item SIMD slightly more complex (case above is elwidth = default)
\item Scalar-scalar and scalar-vector and vector-vector now all in one
\item OoO may choose to push ADDs into instr. queue (v. busy!)
\end{itemize}
\end{frame}

% yes it really *is* ADD not VADD. that's the entire point of
% this proposal, that *standard* operations are overloaded to
% become vectorised-on-demand

\begin{frame}[fragile]
\frametitle{Predication-Branch (or trap, or actual hardware loop)}

\begin{semiverbatim}
s1 = reg\_is\_vectorised(src1);
s2 = reg\_is\_vectorised(src2);
if (!s2 && !s1) goto branch;
for (int i = 0; i < VL; ++i)
  if (cmp(s1 ? reg[src1+i] : reg[src1],
          s2 ? reg[src2+i] : reg[src2]))
    ireg[rs3] |= 1<<i;
\end{semiverbatim}

\begin{itemize}
\item SIMD slightly more complex (case above is elwidth = default)
\item If s1 and s2 are both scalars, a standard branch occurs
\item Predication stored in integer regfile as a bitfield
\item Scalar-vector and vector-vector supported
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{VLD/VLD.S/VLD.X (or trap, or actual hardware loop)}

\begin{semiverbatim}
if (unit-strided) stride = elsize;
else stride = areg[as2]; // constant-strided
for (int i = 0; i < VL; ++i)
  if (preg\_enabled[rd] && ([!]preg[rd] & 1<<i))
    for (int j = 0; j < seglen+1; j++)
      if (reg\_is\_vectorised[rs2]) offs = vreg[rs2+i]
      else offs = i*(seglen+1)*stride;
      vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
\end{semiverbatim}

\begin{itemize}
\item Again: elwidth != default is slightly more complex
\item rs2 being vectorised is taken to implicitly indicate VLD.X
\end{itemize}
\end{frame}


\frame{\frametitle{Why are overlaps allowed in Regfiles?}

\begin{itemize}
\item Same register(s) can have multiple "interpretations"
\item Set "real" register (scalar) without needing to set/unset CSRs.
\item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops
\item (32-bit GREV plus 4x8-bit SIMD plus 32-bit GREV:\\
GREV @ VL=N,wid=32; SIMD @ VL=Nx4,wid=8)
\item RGB 565 (video): BEXTW plus 4x8-bit SIMD plus BDEPW\\
(BEXT/BDEP @ VL=N,wid=32; SIMD @ VL=Nx4,wid=8)
\item Same register(s) can be offset (no need for VSLIDE)\vspace{6pt}
\end{itemize}
Note:
\begin{itemize}
\item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$)
\item Hi-Performance: Macro-op fusion (more pipeline stages?)
\end{itemize}
}


\frame{\frametitle{To Zero or not to place zeros in non-predicated elements?}

\begin{itemize}
\item Zeroing is an implementation optimisation favouring OoO
\item Simple implementations may skip non-predicated operations
\item With zeroing, simple implementations explicitly have to destroy data
\item Complex implementations may use reg-renames to save power\\
Zeroing on predication chains makes optimisation harder
\item Compromise: REQUIRE both (specified in predication CSRs).
\end{itemize}
Considerations:
\begin{itemize}
\item Complex implementations not really impacted; simple ones impacted\\
a LOT by Zeroing... however it's useful (memzero)
\item Non-zero'd overlapping "Vectors" may issue overlapping ops\\
(2nd op's predicated elements slot in 1st's non-predicated ops)
\item Please don't use Vectors for "security" (use Sec-Ext)
\end{itemize}
}
% with overlapping "vectors" - bearing in mind that "vectors" are
% just a remap onto the standard register file, if the top bits of
% predication are zero, and there happens to be a second vector
% that uses some of the same register file that happens to be
% predicated out, the second vector op may be issued *at the same time*
% if there are available parallel ALUs to do so.


\frame{\frametitle{Predication key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{6pt}
\item register to be predicated if referred to (5 bits, key)\vspace{6pt}
\item register to store actual predication in (5 bits, value)\vspace{6pt}
\item predication is inverted Y/N (1 bit)\vspace{6pt}
\item non-predicated elements are to be zero'd Y/N (1 bit)\vspace{6pt}
\end{itemize}
Notes:\vspace{10pt}
\begin{itemize}
\item Table should be expanded out for high-speed implementations
\item Multiple "keys" (and values) theoretically permitted
\item RVV rules about deleting higher-indexed CSRs are followed
\end{itemize}
}
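

\begin{frame}[fragile]
\frametitle{Predication CSR entry: one possible C packing (sketch)}

The slide above only fixes the field widths (1+5+5+1+1 = 13 bits);
the bit positions below are an assumption, purely for illustration:

\begin{semiverbatim}
#include <stdint.h>
#include <stdbool.h>
typedef struct \{
  bool    type;    // 1 bit: 0 = INT regfile, 1 = FP regfile
  uint8_t regidx;  // 5 bits: register to be predicated (key)
  uint8_t predidx; // 5 bits: register holding predicate (value)
  bool    inv;     // 1 bit: invert the predicate
  bool    zero;    // 1 bit: zero non-predicated elements
\} pred_entry;
pred_entry unpack_pred_entry(uint16_t csr) \{
  pred_entry e;
  e.type    = (csr >> 0)  & 0x1;
  e.regidx  = (csr >> 1)  & 0x1f;
  e.predidx = (csr >> 6)  & 0x1f;
  e.inv     = (csr >> 11) & 0x1;
  e.zero    = (csr >> 12) & 0x1;
  return e;
\}
\end{semiverbatim}

\begin{itemize}
\item Field names chosen to line up with the decoding pseudocode that follows
\end{itemize}
\end{frame}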


\begin{frame}[fragile]
\frametitle{Predication key-value CSR table decoding pseudocode}

\begin{semiverbatim}
struct pred fp\_pred[32];
struct pred int\_pred[32];

for (i = 0; i < 16; i++) // 16 CSRs?
  tb = int\_pred if CSRpred[i].type == 0 else fp\_pred
  idx = CSRpred[i].regidx
  tb[idx].zero = CSRpred[i].zero
  tb[idx].inv = CSRpred[i].inv
  tb[idx].predidx = CSRpred[i].predidx
  tb[idx].enabled = true
\end{semiverbatim}

\begin{itemize}
\item All 64 (int and FP) entries are zero'd before setting
\item Might be a bit complex to set up (TBD)
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{Get Predication value pseudocode}

\begin{semiverbatim}
def get\_pred\_val(bool is\_fp\_op, int reg):
   tb = fp\_pred if is\_fp\_op else int\_pred
   if (!tb[reg].enabled):
      return ~0x0 // all ops enabled
   predidx = tb[reg].predidx // redirection occurs HERE
   predicate = intreg[predidx] // actual predicate HERE
   if (tb[reg].inv):
      predicate = ~predicate
   return predicate
\end{semiverbatim}

\begin{itemize}
\item References a different (internal) mapping table for INT vs FP
\item Actual predicate bitmask ALWAYS from the INT regfile
\end{itemize}

\end{frame}


\frame{\frametitle{Register key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{6pt}
\item treated as vector if referred to in op (5 bits, key)\vspace{6pt}
\item starting register to actually be used (5 bits, value)\vspace{6pt}
\item element bitwidth: default/8/16/32/64/rsvd (3 bits)\vspace{6pt}
\item element type: still under consideration\vspace{6pt}
\end{itemize}
Notes:\vspace{10pt}
\begin{itemize}
\item Same notes apply (previous slide) as for predication CSR table
\item Level of indirection has implications for pipeline latency
\end{itemize}
}
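

\begin{frame}[fragile]
\frametitle{Register CSR entry: one possible C packing (sketch)}

Companion to the predication-entry sketch: only the field widths
(1+5+5+3 = 14 bits) come from the slide above; the bit positions
are again an assumption for illustration:

\begin{semiverbatim}
#include <stdint.h>
#include <stdbool.h>
typedef struct \{
  bool    type;    // 1 bit: 0 = INT regfile, 1 = FP regfile
  uint8_t key;     // 5 bits: register treated as a vector (key)
  uint8_t regidx;  // 5 bits: starting register actually used (value)
  uint8_t elwidth; // 3 bits: default/8/16/32/64/rsvd
\} vec_entry;
vec_entry unpack_vec_entry(uint16_t csr) \{
  vec_entry e;
  e.type    = (csr >> 0)  & 0x1;
  e.key     = (csr >> 1)  & 0x1f;
  e.regidx  = (csr >> 6)  & 0x1f;
  e.elwidth = (csr >> 11) & 0x7;
  return e;
\}
\end{semiverbatim}

\begin{itemize}
\item element type (still under consideration) is left out of the sketch
\end{itemize}
\end{frame}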


\begin{frame}[fragile]
\frametitle{Register key-value CSR table decoding pseudocode}

\begin{semiverbatim}
struct vectorised fp\_vec[32];
struct vectorised int\_vec[32];

for (i = 0; i < 16; i++) // 16 CSRs?
  tb = int\_vec if CSRvectortb[i].type == 0 else fp\_vec
  idx = CSRvectortb[i].regidx
  tb[idx].elwidth = CSRvectortb[i].elwidth
  tb[idx].regidx = CSRvectortb[i].regidx
  tb[idx].isvector = true
\end{semiverbatim}

\begin{itemize}
\item All 64 (int and FP) entries are zero'd before setting
\item Might be a bit complex to set up (TBD)
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{ADD pseudocode with redirection, this time}

\begin{semiverbatim}
function op\_add(rd, rs1, rs2) # add not VADD!
  int i, id=0, irs1=0, irs2=0;
  rd  = int\_vec[rd ].isvector ? int\_vec[rd ].regidx : rd;
  rs1 = int\_vec[rs1].isvector ? int\_vec[rs1].regidx : rs1;
  rs2 = int\_vec[rs2].isvector ? int\_vec[rs2].regidx : rs2;
  predval = get\_pred\_val(FALSE, rd);
  for (i = 0; i < VL; i++)
    if (predval & 1<<i) # predication uses intregs
       ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
    if (int\_vec[rd ].isvector)  \{ id += 1; \}
    if (int\_vec[rs1].isvector)  \{ irs1 += 1; \}
    if (int\_vec[rs2].isvector)  \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item SIMD (elwidth != default) not covered above
\end{itemize}
\end{frame}
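

\begin{frame}[fragile]
\frametitle{ADD reference model: runnable C sketch}

The previous pseudocode slides collapsed into one compilable C model
(sketch only: table sizes, the 128-entry regfile, VL=4 and the test
values are assumptions; FP tables are omitted):

\begin{semiverbatim}
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#define VL 4
typedef struct \{ bool isvector; uint8_t regidx; \} vec_entry;
typedef struct \{ bool enabled, inv; uint8_t predidx; \} pred_entry;
static uint64_t   ireg[128];    // extended (redirected) regfile
static vec_entry  int_vec[32];
static pred_entry int_pred[32];
static uint64_t get_pred_val(bool is_fp_op, int reg) \{
  (void)is_fp_op;               // FP table left out of the sketch
  if (!int_pred[reg].enabled) return ~0x0ULL; // all elems active
  uint64_t p = ireg[int_pred[reg].predidx];
  return int_pred[reg].inv ? ~p : p;
\}
static void op_add(int rd, int rs1, int rs2) \{ // ADD, not VADD
  int id = 0, irs1 = 0, irs2 = 0;
  rd  = int_vec[rd ].isvector ? int_vec[rd ].regidx : rd;
  rs1 = int_vec[rs1].isvector ? int_vec[rs1].regidx : rs1;
  rs2 = int_vec[rs2].isvector ? int_vec[rs2].regidx : rs2;
  uint64_t predval = get_pred_val(false, rd);
  for (int i = 0; i < VL; i++) \{
    if (predval & (1ULL << i))
      ireg[rd+id] = ireg[rs1+irs1] + ireg[rs2+irs2];
    if (int_vec[rd ].isvector) id   += 1;
    if (int_vec[rs1].isvector) irs1 += 1;
    if (int_vec[rs2].isvector) irs2 += 1;
  \}
\}
int main(void) \{
  int_vec[8]  = (vec_entry)\{ true, 8 \};  // x8  = vector of 4
  int_vec[16] = (vec_entry)\{ true, 16 \}; // x16 = vector of 4
  for (int i = 0; i < VL; i++) ireg[16+i] = 10*(i+1);
  ireg[3] = 5;                  // x3 stays scalar
  op_add(8, 16, 3);             // x8..x11 = x16..x19 + x3
  for (int i = 0; i < VL; i++) assert(ireg[8+i] == 10*(i+1)+5);
  return 0;
\}
\end{semiverbatim}

\begin{itemize}
\item Mirrors the pseudocode, so (like the DAXPY example later) it
assumes the CSR key equals the start register
\end{itemize}
\end{frame}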


\frame{\frametitle{C.MV extremely flexible!}

\begin{itemize}
\item scalar-to-vector (w/ no pred): VSPLAT
\item scalar-to-vector (w/ dest-pred): Sparse VSPLAT
\item scalar-to-vector (w/ 1-bit dest-pred): VINSERT
\item vector-to-scalar (w/ [1-bit?] src-pred): VEXTRACT
\item vector-to-vector (w/ no pred): Vector Copy
\item vector-to-vector (w/ src pred): Vector Gather
\item vector-to-vector (w/ dest pred): Vector Scatter
\item vector-to-vector (w/ src \& dest pred): Vector Gather/Scatter
\end{itemize}
\vspace{4pt}
Notes:
\begin{itemize}
\item Surprisingly powerful!
\item Same arrangement for FCVT, FMV, FSGNJ etc.
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{MV pseudocode with predication}

\begin{semiverbatim}
function op\_mv(rd, rs) # MV not VMV!
  rd = int\_vec[rd].isvector ? int\_vec[rd].regidx : rd;
  rs = int\_vec[rs].isvector ? int\_vec[rs].regidx : rs;
  ps = get\_pred\_val(FALSE, rs); # predication on src
  pd = get\_pred\_val(FALSE, rd); # ... AND on dest
  for (int i = 0, int j = 0; i < VL && j < VL;):
    if (int\_vec[rs].isvec) while (!(ps & 1<<i)) i++;
    if (int\_vec[rd].isvec) while (!(pd & 1<<j)) j++;
    ireg[rd+j] <= ireg[rs+i];
    if (int\_vec[rs].isvec) i++;
    if (int\_vec[rd].isvec) j++;
\end{semiverbatim}

\begin{itemize}
\item elwidth != default not covered above (might be a bit hairy)
\item Ending early with 1-bit predication not included (VINSERT)
\end{itemize}
\end{frame}
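

\begin{frame}[fragile]
\frametitle{C.MV as VSPLAT / VEXTRACT: worked C sketch}

Stand-alone C model of the loop above (sketch only: the vector tables
are replaced by explicit flags and masks, VL=4, and the register
numbers/values are assumptions):

\begin{semiverbatim}
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#define VL 4
static uint64_t reg[32];
void op_mv(int rd, bool rdvec, uint64_t pd,
           int rs, bool rsvec, uint64_t ps) \{
  for (int i = 0, j = 0; i < VL && j < VL; ) \{
    if (rsvec) while (i < VL && !(ps & (1ULL<<i))) i++;
    if (rdvec) while (j < VL && !(pd & (1ULL<<j))) j++;
    if (i >= VL || j >= VL) break; // predicate exhausted
    reg[rd+j] = reg[rs+i];
    if (rsvec) i++;
    if (rdvec) j++;
    if (!rsvec && !rdvec) break;   // plain scalar-scalar MV
  \}
\}
int main(void) \{
  reg[5] = 99;
  op_mv(8, true, ~0ULL, 5, false, ~0ULL);   // VSPLAT: x8..x11=99
  for (int i = 0; i < VL; i++) assert(reg[8+i] == 99);
  op_mv(3, false, ~0ULL, 8, true, 1ULL<<2); // VEXTRACT elem 2
  assert(reg[3] == reg[10]);
  return 0;
\}
\end{semiverbatim}
\end{frame}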


\begin{frame}[fragile]
\frametitle{VSELECT: stays or goes? Stays if MV.X exists...}

\begin{semiverbatim}
def op_mv_x(rd, rs):          # (hypothetical) RV MV.X
    rs = regfile[rs]          # level of indirection (MV.X)
    regfile[rd] = regfile[rs] # straight regcopy
\end{semiverbatim}

Vectorised version aka "VSELECT":

\begin{semiverbatim}
def op_mv_x(rd, rs):          # SV version of MV.X
    for i in range(VL):
        rs1 = regfile[rs+i]           # indirection
        regfile[rd+i] = regfile[rs1]  # straight regcopy
\end{semiverbatim}

\begin{itemize}
\item However MV.X does not exist in RV, so neither can VSELECT
\item SV is not about adding new functionality, only parallelism
\end{itemize}

\end{frame}


\frame{\frametitle{Opcodes, compared to RVV}

\begin{itemize}
\item All integer and FP opcodes removed (no CLIP, FNE)
\item VMPOP, VFIRST etc. all removed (use xBitManip)
\item VSLIDE removed (use regfile overlaps)
\item C.MV covers VEXTRACT, VINSERT and VSPLAT (and more)
\item VSETVL, VGETVL stay (the only ones that do!)
\item VSELECT stays? no MV.X (add with custom ext?)
\item VSNE exists, but no FNE (use predication inversion?)
\item Issue: VCLIP is not in RV* (add with custom ext?)
\item Vector (or scalar-vector) moves use C.MV (MV is a pseudo-op)
\item VMERGE: twin predicated C.MVs (one inverted, macro-op'd)
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{Example C code: DAXPY}

\begin{semiverbatim}
void daxpy(size_t n, double a,
           const double x[], double y[])
\{
    for (size_t i = 0; i < n; i++) \{
        y[i] = a*x[i] + y[i];
    \}
\}
\end{semiverbatim}

\begin{itemize}
\item See "SIMD Considered Harmful" for SIMD/RVV analysis\\
https://sigarch.org/simd-instructions-considered-harmful/
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{RVV DAXPY assembly (RV32V)}

\begin{semiverbatim}
# a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a
  li t0, 2<<25
  vsetdcfg t0            # enable 2 64b Fl.Pt. registers
loop:
  setvl t0, a0           # vl = t0 = min(mvl, n)
  vld v0, a1             # load vector x
  slli t1, t0, 3         # t1 = vl * 8 (in bytes)
  vld v1, a2             # load vector y
  add a1, a1, t1         # increment pointer to x by vl*8
  vfmadd v1, v0, fa0, v1 # v1 += v0 * fa0 (y = a * x + y)
  sub a0, a0, t0         # n -= vl (t0)
  vst v1, a2             # store Y
  add a2, a2, t1         # increment pointer to y by vl*8
  bnez a0, loop          # repeat if n != 0
\end{semiverbatim}
\end{frame}


\begin{frame}[fragile]
\frametitle{SV DAXPY assembly (RV64D)}

\begin{semiverbatim}
# a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a
  CSRvect1 = \{type: F, key: a3, val: a3, elwidth: dflt\}
  CSRvect2 = \{type: F, key: a7, val: a7, elwidth: dflt\}
loop:
  setvl t0, a0, 4       # vl = t0 = min(4, n)
  ld a3, a1             # load 4 registers a3-6 from x
  slli t1, t0, 3        # t1 = vl * 8 (in bytes)
  ld a7, a2             # load 4 registers a7-10 from y
  add a1, a1, t1        # increment pointer to x by vl*8
  fmadd a7, a3, fa0, a7 # v1 += v0 * fa0 (y = a * x + y)
  sub a0, a0, t0        # n -= vl (t0)
  st a7, a2             # store 4 registers a7-10 to y
  add a2, a2, t1        # increment pointer to y by vl*8
  bnez a0, loop         # repeat if n != 0
\end{semiverbatim}
\end{frame}


\frame{\frametitle{Under consideration}

\begin{itemize}
\item Is C.FNE actually needed? Should it be added if it is?
\item Element type implies polymorphism. Should it be in SV?
\item Should use of registers be allowed to "wrap" (x30 x31 x1 x2)?
\item Is detection of all-scalar ops ok (without slowing pipeline)?
\item Can VSELECT be removed? (it's really complex)
\item Can CLIP be done as a CSR (mode, like elwidth)?
\item SIMD saturation (etc.) also set as a mode?
\item Include src1/src2 predication on Comparison Ops?\\
(same arrangement as C.MV, with same flexibility/power)
\item For 8/16-bit ops, is it worthwhile adding a "start offset"?\\
(a bit like misaligned addressing... for registers)\\
or just use predication to skip the start?
\end{itemize}
}


\frame{\frametitle{What's the downside(s) of SV?}
\begin{itemize}
\item EVERY register operation is inherently parallelised\\
(scalar ops are just vectors of length 1)\vspace{4pt}
\item Tightly coupled with the core (instruction issue);\\
could be disabled through a MISA switch\vspace{4pt}
\item An extra pipeline phase is pretty much essential\\
for fast low-latency implementations\vspace{4pt}
\item With zeroing off, skipping non-predicated elements is hard:\\
it is however only an optimisation (and could be omitted).\vspace{4pt}
\item Setting up the Register/Predication tables (interpreting the\\
CSR key-value stores) might be a bit complex to optimise
(any change to a CSR key-value entry needs to redo the table)
\end{itemize}
}


\frame{\frametitle{Is this OK (low latency)? Detect scalar-ops (only)}
\begin{center}
\includegraphics[height=2.5in]{scalardetect.png}\\
{\bf \red Detect when all registers are scalar for a given op}
\end{center}
}


\frame{\frametitle{Summary}

\begin{itemize}
\item Actually about parallelism, not Vectors (or SIMD) per se\\
and NOT about adding new ALU/logic/functionality.
\item Only needs 2 actual instructions (plus the CSRs).\\
RVV - and "standard" SIMD - require ISA duplication
\item Designed for flexibility (graded levels of complexity)
\item Huge range of implementor freedom
\item Fits RISC-V ethos: achieve more with less
\item Reduces SIMD ISA proliferation by 3-4 orders of magnitude\\
(without the SIMD downsides, and without sacrificing speed)
\item Covers 98\% of RVV, allows RVV to fit "on top"
\item A byproduct of SV is a reduction in code size, power usage
etc. (increased efficiency, just like Compressed)
\end{itemize}
}


\frame{
\begin{center}
{\Huge The end\vspace{20pt}\\
Thank you\vspace{20pt}\\
Questions?\vspace{20pt}
}
\end{center}

\begin{itemize}
\item Discussion: ISA-DEV mailing list
\item http://libre-riscv.org/simple\_v\_extension/
\end{itemize}
}


\end{document}