\documentclass[slidestop]{beamer}
\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{pstricks}

\title{Simple-V RISC-V Extension for Vectorisation and SIMD}
\author{Luke Kenneth Casson Leighton}


\begin{document}

\frame{
\begin{center}
\huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
\vspace{32pt}
\Large{Flexible Vectorisation}\\
\Large{(aka not so Simple-V?)}\\
\Large{(aka How to Parallelise the RISC-V ISA)}\\
\vspace{24pt}
\Large{[proposed for] Chennai 9th RISC-V Workshop}\\
\vspace{16pt}
\large{\today}
\end{center}
}


\frame{\frametitle{Credits and Acknowledgements}

\begin{itemize}
\item The Designers of RISC-V\vspace{15pt}
\item The RVV Working Group and contributors\vspace{15pt}
\item Allen Baum, Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
      Guy Lemurieux, Jonathan Neuschafer, Roger Brussee,
      and others\vspace{15pt}
\item ISA-Dev Group Members\vspace{10pt}
\end{itemize}
}


\frame{\frametitle{Quick refresher on SIMD}

\begin{itemize}
\item SIMD very easy to implement (and very seductive)\vspace{8pt}
\item Parallelism is in the ALU\vspace{8pt}
\item Zero-to-negligible impact on the rest of the core\vspace{8pt}
\end{itemize}
Where SIMD Goes Wrong:\vspace{10pt}
\begin{itemize}
\item See "SIMD instructions considered harmful"
      https://sigarch.org/simd-instructions-considered-harmful
\item Setup and corner-cases alone are extremely complex.\\
      Hardware is easy, but software is hell\\
      (see the sketch on the next slide).
\item O($N^{6}$) ISA opcode proliferation!\\
      opcode, elwidth, veclen, src1-src2-dest hi/lo
\end{itemize}
}
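
\begin{frame}[fragile]
\frametitle{Why SIMD software is "hell": a sketch}

A minimal, hypothetical sketch (not taken from the SIGARCH article) of what a
fixed-width 4-wide SIMD add forces onto the programmer: a stripmined main
loop plus a scalar tail for the corner-case.

\begin{semiverbatim}
void add_simd4(int n, int *d, int *a, int *b) \{
    int i = 0;
    for (; i + 4 <= n; i += 4)          // main 4-wide SIMD loop
        simd_add4(&d[i], &a[i], &b[i]); // assumed intrinsic
    for (; i < n; i++)                  // scalar tail (corner-case)
        d[i] = a[i] + b[i];
\}
\end{semiverbatim}

\begin{itemize}
\item Now multiply this by every elwidth, every opcode, every SIMD width
\item SV's hardware loop (setvl) removes the setup and tail code entirely
\end{itemize}
\end{frame}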

\frame{\frametitle{Quick refresher on RVV}

\begin{itemize}
\item Extremely powerful (extensible to 256 registers)\vspace{10pt}
\item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
\item Requires a separate Register File (32 w/ext to 256)\vspace{10pt}
\item Implemented as a separate pipeline (no impact on scalar)\vspace{10pt}
\end{itemize}
However...\vspace{10pt}
\begin{itemize}
\item 98 percent opcode duplication with the rest of RV\\
      (CLIP is one of the rare exceptions)
\item Extending RVV requires customisation not just of h/w:\\
      gcc and binutils also need customisation (and maintenance)
\end{itemize}
}


\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}

\begin{itemize}
\item Why?
      Implementors need flexibility in vectorisation to optimise for
      area or performance depending on the scope:
      embedded DSP, Mobile GPUs, Server CPUs and more.\\
      Compilers also need flexibility in vectorisation to optimise for
      cost of pipeline setup, amount of state to context switch,
      and software portability.
\item How?
      By marking INT/FP regs as "Vectorised" and
      adding a level of indirection,
      SV expresses how existing instructions should act
      on [contiguous] blocks of registers, in parallel, WITHOUT
      needing any new arithmetic opcodes.
\item What?
      Simple-V is an "API" that implicitly extends
      existing (scalar) instructions with explicit parallelisation,\\
      i.e. SV is actually about parallelism NOT vectors per se.\\
      Has a lot in common with VLIW (without the actual VLIW).
\end{itemize}
}


\frame{\frametitle{What's the value of SV? Why adopt it even in non-V?}

\begin{itemize}
\item memcpy becomes much smaller (higher bang-per-buck)\\
      (see the sketch on the next slide)
\item context-switch (LOAD/STORE multiple): 1-2 instructions
\item Compressed instrs further reduce I-cache usage (etc.)
\item Greatly-reduced I-cache load (and fewer reads)
\item Amazingly, SIMD becomes (more) tolerable (no corner-cases)
\item Modularity/Abstraction in both the h/w and the toolchain.
\item "Reach" of registers accessible by Compressed is enhanced
\item Future: double the standard register file size(s).
\end{itemize}
Note:
\begin{itemize}
\item It's not just about Vectors: it's about instruction effectiveness
\item Anything an implementor is not interested in HW-optimising,\\
      let it fall through to exceptions (implement as a trap).
\end{itemize}
}
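
\begin{frame}[fragile]
\frametitle{SV memcpy sketch (illustrative only)}

A hedged sketch of the "memcpy becomes much smaller" claim, modelled on the
SV DAXPY example later in these slides. Register choices (a3 as the
vectorised block, 4 elements per pass) are illustrative assumptions, not a
fixed ABI.

\begin{semiverbatim}
# a0 = n (doublewords), a1 = src, a2 = dest
    CSRvect1 = \{type: I, key: a3, val: a3, elwidth: dflt\}
loop:
    setvl t0, a0, 4     # vl = t0 = min(4, n)
    ld    a3, a1        # load t0 registers a3-a6 from src
    slli  t1, t0, 3     # t1 = vl * 8 (in bytes)
    st    a3, a2        # store t0 registers a3-a6 to dest
    add   a1, a1, t1    # increment src by vl*8
    add   a2, a2, t1    # increment dest by vl*8
    sub   a0, a0, t0    # n -= vl (t0)
    bnez  a0, loop      # repeat if n != 0
\end{semiverbatim}

One CSR setup line and a short loop replace an unrolled or SIMD-stripmined
copy, with no tail / corner-case code.
\end{frame}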


\frame{\frametitle{How does Simple-V relate to RVV? What's different?}

\begin{itemize}
\item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
\item Simple-V abstracts parallelism (based on the best of RVV)\vspace{10pt}
\item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
\item Even Compressed instructions become vectorised (RVV can't)\vspace{10pt}
\end{itemize}
What Simple-V is not:\vspace{10pt}
\begin{itemize}
\item A full supercomputer-level Vector Proposal
\item A replacement for RVV (SV is designed to be over-ridden\\
      by - or augmented to become - RVV)
\end{itemize}
}


\frame{\frametitle{How is Parallelism abstracted in Simple-V?}

\begin{itemize}
\item Register "typing" turns any op into an implicit Vector op:\\
      registers are reinterpreted through a level of indirection
\item Primarily at the Instruction issue phase (except SIMD)\\
      Note: it's ok to pass predication through to ALU (like SIMD)
\item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
\end{itemize}
Note: EVERYTHING is parallelised:
\begin{itemize}
\item All LOAD/STORE (inc. Compressed, Int/FP versions)
\item All ALU ops (Int, FP, SIMD, DSP, everything)
\item All branches become predication targets (C.FNE added?)
\item C.MV of particular interest (s/v, v/v, v/s)
\item FCVT, FMV, FSGNJ etc. very similar to C.MV
\end{itemize}
}


\frame{\frametitle{Implementation Options}

\begin{itemize}
\item Absolute minimum: Exceptions: if CSRs indicate "V", trap.\\
      (Requires as an absolute minimum that CSRs be in H/W)
\item Hardware loop, single-instruction issue\\
      (Do / Don't send through predication to ALU)
\item Hardware loop, parallel (multi-instruction) issue\\
      (Do / Don't send through predication to ALU)
\item Hardware loop, full parallel ALU (not recommended)
\end{itemize}
Notes:\vspace{4pt}
\begin{itemize}
\item 4 (or more?) options above may be deployed on per-op basis
\item SIMD always sends predication bits through to ALU
\item Minimum MVL MUST be sufficient to cover regfile LD/ST
\item Instr. FIFO may repeatedly split off N scalar ops at a time\\
      (see the sketch on the next slide)
\end{itemize}
}
% Instr. FIFO may need its own slide. Basically, the vectorised op
% gets pushed into the FIFO, where it is then "processed". Processing
% removes the first set of ops from its vector numbering (taking
% predication into account), shoves them **BACK** into the FIFO,
% and MODIFIES the remaining "vectorised" op, subtracting the now
% scalar ops from it.
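
\begin{frame}[fragile]
\frametitle{Instr. FIFO: splitting a vectorised op (sketch)}

A hedged, purely illustrative sketch of the FIFO idea: take the vectorised
op at the head, split off up to N of its (predicated-in) elements as scalar
ops, and keep the remainder with its element offset advanced. The op/FIFO
field and helper names (elstart, issue\_width, scalar\_op, push\_head) are
invented for illustration.

\begin{semiverbatim}
op = fifo.pop()                      # vectorised op at the head
pred = get_pred_val(op.is_fp, op.rd)
new = []
while (op.elstart < VL and len(new) < issue_width):
    i = op.elstart
    if (pred & 1<<i):                # predicated-in element?
        new.append(scalar_op(op, i)) # element i becomes a scalar op
    op.elstart += 1                  # MODIFY remaining vectorised op
if (op.elstart < VL):
    new.append(op)                   # remainder follows the scalars
fifo.push_head(new)                  # back into the FIFO, at the head
\end{semiverbatim}
\end{frame}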

\frame{\frametitle{Predicated 8-parallel ADD: 1-wide ALU}
\begin{center}
\includegraphics[height=2.5in]{padd9_alu1.png}\\
{\bf \red Predicated adds are shuffled down: 6 cycles in total}
\end{center}
}


\frame{\frametitle{Predicated 8-parallel ADD: 4-wide ALU}
\begin{center}
\includegraphics[height=2.5in]{padd9_alu4.png}\\
{\bf \red Predicated adds are shuffled down: 4 in 1st cycle, 2 in 2nd}
\end{center}
}


\frame{\frametitle{Predicated 8-parallel ADD: 3 phase FIFO expansion}
\begin{center}
\includegraphics[height=2.5in]{padd9_fifo.png}\\
{\bf \red First cycle takes first four 1s; second takes the rest}
\end{center}
}


\frame{\frametitle{How are SIMD Instructions Vectorised?}

\begin{itemize}
\item SIMD ALU(s) primarily unchanged\vspace{6pt}
\item Predication is added to each SIMD element\vspace{6pt}
\item Predication bits sent in groups to the ALU\\
      (see the sketch on the next slide)\vspace{6pt}
\item End of Vector enables (additional) predication\\
      (completely nullifies need for end-case code)
\end{itemize}
Considerations:\vspace{4pt}
\begin{itemize}
\item Many SIMD ALUs possible (parallel execution)
\item Implementor free to choose (API remains the same)
\item Unused ALU units are wasted, but s/w is DRASTICALLY simpler
\item Very long SIMD ALUs could waste significant die area
\end{itemize}
}
% With multiple SIMD ALUs at for example 32-bit wide they can be used
% to either issue 64-bit or 128-bit or 256-bit wide SIMD operations
% or they can be used to cover several operations on totally different
% vectors / registers.

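\begin{frame}[fragile]
\frametitle{Sending predicate bits in groups to a SIMD ALU (sketch)}

A hedged sketch (the names simd\_width and simd\_alu are assumptions, not
part of the proposal) of how the issue phase could slice the predicate into
per-ALU groups, one bit per SIMD element:

\begin{semiverbatim}
pred = get_pred_val(is_fp, rd)               # full predicate bitmask
for (i = 0; i < VL; i += simd_width):
    grp = (pred >> i) & ((1<<simd_width)-1)  # one bit per element
    if (i + simd_width > VL):                # end of vector:
        grp &= (1 << (VL - i)) - 1           #  extra predication
    if (grp == 0): continue                  # whole group predicated out
    simd_alu(op, rd+i, rs1+i, rs2+i, grp)    # ALU masks per element
\end{semiverbatim}

End of Vector (VL not a multiple of simd\_width) is handled by the same
mechanism: the trailing predicate bits of the final group are forced to zero.
\end{frame}
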
\frame{\frametitle{Predicated 9-parallel SIMD ADD}
\begin{center}
\includegraphics[height=2.5in]{padd9_simd.png}\\
{\bf \red 4-wide 8-bit SIMD, 4 bits of predicate passed to ALU}
\end{center}
}


\frame{\frametitle{What's the deal / juice / score?}

\begin{itemize}
\item Standard Register File(s) overloaded with CSR "reg is vector"\\
      (see pseudocode slides for examples)
\item "2nd FP\&INT register bank" possibility (reserved for future)
\item Element width (and type?) concepts remain same as RVV\\
      (CSRs give new size (and meaning?) to elements in registers)
\item CSRs are key-value tables (overlaps allowed: v. important)
\end{itemize}
Key differences from RVV:
\begin{itemize}
\item Predication in INT regs as a BIT field (max VL=XLEN)
\item Minimum VL must be Num Regs - 1 (all regs single LD/ST)
\item SV may condense sparse Vecs: RVV lets ALU do predication
\item Choice to Zero or skip non-predicated elements
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{ADD pseudocode (or trap, or actual hardware loop)}

\begin{semiverbatim}
function op\_add(rd, rs1, rs2, predr) # add not VADD!
  int i, id=0, irs1=0, irs2=0;
  for (i = 0; i < VL; i++)
    if (ireg[predr] & 1<<i) # predication uses intregs
       ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
    if (reg\_is\_vectorised[rd])  \{ id += 1; \}
    if (reg\_is\_vectorised[rs1]) \{ irs1 += 1; \}
    if (reg\_is\_vectorised[rs2]) \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item Above is oversimplified: Reg. indirection left out (for clarity).
\item SIMD slightly more complex (case above is elwidth = default)
\item Scalar-scalar and scalar-vector and vector-vector now all in one
\item OoO may choose to push ADDs into instr. queue (v. busy!)
\end{itemize}
\end{frame}

% yes it really *is* ADD not VADD. that's the entire point of
% this proposal, that *standard* operations are overloaded to
% become vectorised-on-demand


\begin{frame}[fragile]
\frametitle{Predication-Branch (or trap, or actual hardware loop)}

\begin{semiverbatim}
s1 = reg\_is\_vectorised(src1);
s2 = reg\_is\_vectorised(src2);
if (!s2 && !s1) goto branch;
for (int i = 0; i < VL; ++i)
   if (cmp(s1 ? reg[src1+i] : reg[src1],
           s2 ? reg[src2+i] : reg[src2]))
      ireg[rs3] |= 1<<i;
\end{semiverbatim}

\begin{itemize}
\item SIMD slightly more complex (case above is elwidth = default)
\item If s1 and s2 both scalars, Standard branch occurs
\item Predication stored in integer regfile as a bitfield
\item Scalar-vector and vector-vector supported
\item Overload Branch immediate to be predication target rs3
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{VLD/VLD.S/VLD.X (or trap, or actual hardware loop)}

\begin{semiverbatim}
if (unit-strided) stride = elsize;
else stride = areg[as2]; // constant-strided
for (int i = 0; i < VL; ++i)
  if (preg\_enabled[rd] && ([!]preg[rd] & 1<<i))
    for (int j = 0; j < seglen+1; j++)
      if (reg\_is\_vectorised[rs2]) offs = vreg[rs2+i]
      else offs = i*(seglen+1)*stride;
      vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
\end{semiverbatim}

\begin{itemize}
\item Again: elwidth != default slightly more complex
\item rs2 vectorised taken to implicitly indicate VLD.X
\end{itemize}
\end{frame}


\frame{\frametitle{Predication key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{6pt}
\item register to be predicated if referred to (5 bits, key)\vspace{6pt}
\item register to store actual predication in (5 bits, value)\vspace{6pt}
\item predication is inverted Y/N (1 bit)\vspace{6pt}
\item non-predicated elements are to be zero'd Y/N (1 bit)\vspace{6pt}
\end{itemize}
(A possible decoded in-hardware form is sketched on the next slide.)\\
Notes:\vspace{10pt}
\begin{itemize}
\item Table should be expanded out for high-speed implementations
\item Multiple "keys" (and values) theoretically permitted
\item RVV rules about deleting higher-indexed CSRs followed
\end{itemize}
}
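
\begin{frame}[fragile]
\frametitle{Predication CSR entry: possible layout (sketch)}

A hedged sketch of one possible decoded form of the bullets on the previous
slide. Field widths follow those bullets (1+5+5+1+1 = 13 bits per CSR
entry); the struct name matches the decoding pseudocode that follows, but
the exact packing is an assumption, not part of the spec.

\begin{semiverbatim}
struct pred \{        // one entry per (int or FP) register
    bool enabled;    // this register has a predicate at all
    int  predidx;    // INT reg holding the predicate bits (5 bits)
    bool inv;        // invert the predicate Y/N (1 bit)
    bool zero;       // zero non-predicated elements Y/N (1 bit)
\};
// CSR entry itself: \{type:1, regidx:5, predidx:5, inv:1, zero:1\}
\end{semiverbatim}
\end{frame}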


\begin{frame}[fragile]
\frametitle{Predication key-value CSR table decoding pseudocode}

\begin{semiverbatim}
struct pred fp\_pred[32];
struct pred int\_pred[32];

for (i = 0; i < 16; i++) // 16 CSRs?
   tb = int\_pred if CSRpred[i].type == 0 else fp\_pred
   idx = CSRpred[i].regidx
   tb[idx].zero = CSRpred[i].zero
   tb[idx].inv = CSRpred[i].inv
   tb[idx].predidx = CSRpred[i].predidx
   tb[idx].enabled = true
\end{semiverbatim}

\begin{itemize}
\item All 64 (int and FP) Entries zero'd before setting
\item Might be a bit complex to set up (TBD)
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{Get Predication value pseudocode}

\begin{semiverbatim}
def get\_pred\_val(bool is\_fp\_op, int reg):
   tb = fp\_pred if is\_fp\_op else int\_pred
   if (!tb[reg].enabled):
      return ~0x0 // all ops enabled
   predidx = tb[reg].predidx // redirection occurs HERE
   predicate = intreg[predidx] // actual predicate HERE
   if (tb[reg].inv):
      predicate = ~predicate // invert ALL bits
   return predicate
\end{semiverbatim}

\begin{itemize}
\item References different (internal) mapping table for INT or FP
\item Actual predicate bitmask ALWAYS from the INT regfile
\end{itemize}

\end{frame}


\frame{\frametitle{To Zero or not to place zeros in non-predicated elements?}

\begin{itemize}
\item Zeroing is an implementation optimisation favouring OoO
\item Without zeroing, simple implementations may skip non-predicated
      operations entirely
\item With zeroing, simple implementations explicitly have to destroy data
\item Complex implementations may use reg-renames to save power\\
      Zeroing on predication chains makes optimisation harder
\item Compromise: REQUIRE both (selected per-entry by the zeroing bit
      in the predication CSRs).
\end{itemize}
Considerations:
\begin{itemize}
\item Complex not really impacted, simple impacted a LOT\\
      with Zeroing... however it's useful (memzero)
\item Non-zero'd overlapping "Vectors" may issue overlapping ops\\
      (2nd op's predicated elements slot into 1st's non-predicated ops)
\item Please don't use Vectors for "security" (use Sec-Ext)
\end{itemize}
}
% with overlapping "vectors" - bearing in mind that "vectors" are
% just a remap onto the standard register file, if the top bits of
% predication are zero, and there happens to be a second vector
% that uses some of the same register file that happens to be
% predicated out, the second vector op may be issued *at the same time*
% if there are available parallel ALUs to do so.


\frame{\frametitle{Register key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)
\item treated as vector if referred to in op (5 bits, key)
\item starting register to actually be used (5 bits, value)
\item element bitwidth: default, dflt/2, 8, 16 (2 bits)
\item is vector: Y/N (1 bit)
\item is packed SIMD: Y/N (1 bit)
\item register bank: 0/reserved for future ext. (1 bit)
\end{itemize}
(A possible decoded in-hardware form is sketched on the next slide.)\\
Notes:
\begin{itemize}
\item References different (internal) mapping table for INT or FP
\item Level of indirection has implications for pipeline latency
\item (future) bank bit, no need to extend opcodes: set bank=1,\\
      just use normal 5-bit regs, indirection takes care of the rest.
\end{itemize}
}
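
\begin{frame}[fragile]
\frametitle{Register CSR entry: possible layout (sketch)}

A hedged sketch of one possible decoded form of the previous slide's
bullets. Field widths follow those bullets (1+5+5+2+1+1+1 = 16 bits per CSR
entry); the struct name matches the register-table decoding pseudocode
later on, but the exact packing and field names are assumptions, not part
of the spec.

\begin{semiverbatim}
struct vectorised \{   // one entry per (int or FP) register
    bool isvector;    // treat references as a vector Y/N (1 bit)
    int  regidx;      // starting register actually used (5 bits)
    int  elwidth;     // default, dflt/2, 8, 16 (2 bits)
    bool packed;      // packed SIMD Y/N (1 bit)
    int  bank;        // 0 / reserved for future ext. (1 bit)
\};
// CSR entry itself: \{type:1, regkey:5, regidx:5, elwidth:2,
//                    isvec:1, packed:1, bank:1\}
\end{semiverbatim}
\end{frame}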


\frame{\frametitle{Register element width and packed SIMD}

Packed SIMD = N:
\begin{itemize}
\item default: RV32/64/128 opcodes define elwidth = 32/64/128
\item default/2: RV32/64/128 opcodes, elwidth = 16/32/64 with\\
      top half of register ignored (src), zero'd/s-ext (dest)
\item 8 or 16: elwidth = 8 (or 16), similar to default/2
\end{itemize}
Packed SIMD = Y (default is moot, packing is 1:1)
\begin{itemize}
\item default/2: 2 elements per register @ opcode-defined bitwidth
\item 8 or 16: standard 8 (or 16) packed SIMD
\end{itemize}
Notes:
\begin{itemize}
\item Different src/dest widths (and packs) PERMITTED
\item RV* already allows (and defines) how RV32 ops work in RV64\\
      so just logically follow that lead/example.
\item (element-to-register mapping sketched on the next slide)
\end{itemize}
}
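
\begin{frame}[fragile]
\frametitle{Packed-SIMD element mapping (sketch)}

A hedged sketch (the helper name elem\_ref and the byte-lane layout are
assumptions for illustration) of where element i of a packed-SIMD "vector"
lives when elwidth is narrower than XLEN:

\begin{semiverbatim}
def elem_ref(basereg, i, elwidth_bytes, xlen_bytes):
    # element i occupies a byte lane within a register
    regidx = basereg + (i * elwidth_bytes) // xlen_bytes
    lane   = (i * elwidth_bytes) %  xlen_bytes
    return regidx, lane

# e.g. elwidth = 8, XLEN = 64: elements 0-7 pack into basereg,
#      elements 8-15 into basereg+1, and so on.
\end{semiverbatim}
\end{frame}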


\begin{frame}[fragile]
\frametitle{Register key-value CSR table decoding pseudocode}

\begin{semiverbatim}
struct vectorised fp\_vec[32], int\_vec[32]; // 64 in future

for (i = 0; i < 16; i++) // 16 CSRs?
   tb = int\_vec if CSRvectortb[i].type == 0 else fp\_vec
   idx = CSRvectortb[i].regkey              // 5-bit key (reg named in op)
   tb[idx].elwidth  = CSRvectortb[i].elwidth
   tb[idx].regidx   = CSRvectortb[i].regidx // indirection (value)
   tb[idx].isvector = CSRvectortb[i].isvector
   tb[idx].packed   = CSRvectortb[i].packed // SIMD or not
   tb[idx].bank     = CSRvectortb[i].bank   // 0 (1=rsvd)
\end{semiverbatim}

\begin{itemize}
\item All 32 int (and 32 FP) entries zero'd before setup
\item Might be a bit complex to set up (TBD)
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{ADD pseudocode with redirection, this time}

\begin{semiverbatim}
function op\_add(rd, rs1, rs2) # add not VADD!
  int i, id=0, irs1=0, irs2=0;
  rd  = int\_vec[rd ].isvector ? int\_vec[rd ].regidx : rd;
  rs1 = int\_vec[rs1].isvector ? int\_vec[rs1].regidx : rs1;
  rs2 = int\_vec[rs2].isvector ? int\_vec[rs2].regidx : rs2;
  predval = get\_pred\_val(FALSE, rd);
  for (i = 0; i < VL; i++)
    if (predval \& 1<<i) # predication uses intregs
       ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
    if (int\_vec[rd ].isvector)  \{ id += 1; \}
    if (int\_vec[rs1].isvector)  \{ irs1 += 1; \}
    if (int\_vec[rs2].isvector)  \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item SIMD (elwidth != default) not covered above
\end{itemize}
\end{frame}


\frame{\frametitle{Why are overlaps allowed in Regfiles?}

\begin{itemize}
\item Same register(s) can have multiple "interpretations"\\
      (see the CSR sketch on the next slide)
\item Set "real" register (scalar) without needing to set/unset CSRs.
\item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops
\item (32-bit GREV plus 4x8-bit SIMD plus 32-bit GREV:\\
      GREV @ VL=N,wid=32; SIMD @ VL=Nx4,wid=8)
\item RGB 565 (video): BEXTW plus 4x8-bit SIMD plus BDEPW\\
      (BEXT/BDEP @ VL=N,wid=32; SIMD @ VL=Nx4,wid=8)
\item Same register(s) can be offset (no need for VSLIDE)\vspace{6pt}
\end{itemize}
Note:
\begin{itemize}
\item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$)
\item Hi-Performance: Macro-op fusion (more pipeline stages?)
\end{itemize}
}
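
\begin{frame}[fragile]
\frametitle{Two "interpretations" of the same registers (sketch)}

A hedged sketch, using the same CSR notation as the SV DAXPY slide later
on. Register and key choices (t2, t3, x16) are purely illustrative
assumptions:

\begin{semiverbatim}
# both entries redirect to the same physical registers, from x16
CSRvect1 = \{type: I, key: t2, val: x16, elwidth: dflt, packed: N\}
CSRvect2 = \{type: I, key: t3, val: x16, elwidth: 8,    packed: Y\}

GREV t2, t2      # acts on x16.. as full-width elements
ADD  t3, t3, t3  # same registers, now packed 8-bit elements
\end{semiverbatim}

No data is moved between the two views: only the level of indirection (the
CSR table entry) changes how the op iterates over the registers.
\end{frame}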


\frame{\frametitle{C.MV extremely flexible!}

\begin{itemize}
\item scalar-to-vector (w/ no pred): VSPLAT
\item scalar-to-vector (w/ dest-pred): Sparse VSPLAT
\item scalar-to-vector (w/ 1-bit dest-pred): VINSERT
\item vector-to-scalar (w/ [1-bit?] src-pred): VEXTRACT
\item vector-to-vector (w/ no pred): Vector Copy
\item vector-to-vector (w/ src pred): Vector Gather
\item vector-to-vector (w/ dest pred): Vector Scatter
\item vector-to-vector (w/ src \& dest pred): Vector Gather/Scatter
\end{itemize}
\vspace{4pt}
Notes:
\begin{itemize}
\item Surprisingly powerful! Zero-predication even more so
\item Same arrangement for FCVT, FMV, FSGNJ etc.
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{MV pseudocode with predication}

\begin{semiverbatim}
function op\_mv(rd, rs) # MV not VMV!
  rd = int\_vec[rd].isvector ? int\_vec[rd].regidx : rd;
  rs = int\_vec[rs].isvector ? int\_vec[rs].regidx : rs;
  ps = get\_pred\_val(FALSE, rs); # predication on src
  pd = get\_pred\_val(FALSE, rd); # ... AND on dest
  for (int i = 0, int j = 0; i < VL && j < VL;):
    if (int\_vec[rs].isvec) while (!(ps \& 1<<i)) i++;
    if (int\_vec[rd].isvec) while (!(pd \& 1<<j)) j++;
    ireg[rd+j] <= ireg[rs+i];
    if (int\_vec[rs].isvec) i++;
    if (int\_vec[rd].isvec) j++;
\end{semiverbatim}

\begin{itemize}
\item elwidth != default not covered above (might be a bit hairy)
\item Ending early with 1-bit predication not included (VINSERT)
\end{itemize}
\end{frame}


\begin{frame}[fragile]
\frametitle{VSELECT: stays or goes? Stays if MV.X exists...}

\begin{semiverbatim}
def op_mv_x(rd, rs): # (hypothetical) RV MV.X
    rs = regfile[rs] # level of indirection (MV.X)
    regfile[rd] = regfile[rs] # straight regcopy
\end{semiverbatim}

Vectorised version aka "VSELECT":

\begin{semiverbatim}
def op_mv_x(rd, rs): # SV version of MV.X
    for i in range(VL):
        rs1 = regfile[rs+i] # indirection
        regfile[rd+i] = regfile[rs1] # straight regcopy
\end{semiverbatim}

\begin{itemize}
\item However MV.X does not exist in RV, so neither can VSELECT
\item \red SV is not about adding new functionality, only parallelism
\end{itemize}


\end{frame}


\frame{\frametitle{Opcodes, compared to RVV}

\begin{itemize}
\item All RVV integer and FP opcodes removed (no CLIP, FNE)
\item VMPOP, VFIRST etc. all removed (use xBitManip)
\item VSLIDE removed (use regfile overlaps)
\item C.MV covers VEXTRACT, VINSERT and VSPLAT (and more)
\item Vector (or scalar-vector) copy: use C.MV (MV is a pseudo-op)
\item VMERGE: twin predicated C.MVs (one inverted; macro-op'd;\\
      see the sketch on the next slide)
\item VSETVL, VGETVL stay (the only ops that do!)
\end{itemize}
Issues:
\begin{itemize}
\item VSELECT stays? no MV.X, so no (add with custom ext?)
\item VSNE exists, but no FNE (use predication inversion?)
\item VCLIP is not in RV* (add with custom ext?)
\end{itemize}
}
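
\begin{frame}[fragile]
\frametitle{VMERGE as twin predicated C.MVs (sketch)}

A hedged sketch, re-using the key-value CSR notation from earlier slides
(here applied to the predication table). Register choices (t2, t3, t4 as
vectors, t0 holding the predicate bitmask) are assumptions for
illustration:

\begin{semiverbatim}
# vd = p ? va : vb  becomes two predicated copies into the same dest
# (t2, t3, t4 assumed already marked as vectors via the register CSRs)
CSRpred1 = \{type: I, key: t2, val: t0, inv: N, zero: N\}
C.MV t2, t3        # copy va elements where predicate bit = 1
CSRpred2 = \{type: I, key: t2, val: t0, inv: Y, zero: N\}
C.MV t2, t4        # copy vb elements where predicate bit = 0
\end{semiverbatim}

With macro-op fusion the two C.MVs may be treated as a single VMERGE.
\end{frame}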


\begin{frame}[fragile]
\frametitle{Example c code: DAXPY}

\begin{semiverbatim}
void daxpy(size_t n, double a,
           const double x[], double y[])
\{
    for (size_t i = 0; i < n; i++) \{
        y[i] = a*x[i] + y[i];
    \}
\}
\end{semiverbatim}

\begin{itemize}
\item See "SIMD Considered Harmful" for SIMD/RVV analysis\\
      https://sigarch.org/simd-instructions-considered-harmful/
\end{itemize}


\end{frame}


\begin{frame}[fragile]
\frametitle{RVV DAXPY assembly (RV32V)}

\begin{semiverbatim}
# a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a
    li t0, 2<<25
    vsetdcfg t0             # enable 2 64b Fl.Pt. registers
loop:
    setvl  t0, a0           # vl = t0 = min(mvl, n)
    vld    v0, a1           # load vector x
    slli   t1, t0, 3        # t1 = vl * 8 (in bytes)
    vld    v1, a2           # load vector y
    add    a1, a1, t1       # increment pointer to x by vl*8
    vfmadd v1, v0, fa0, v1  # v1 += v0 * fa0 (y = a * x + y)
    sub    a0, a0, t0       # n -= vl (t0)
    vst    v1, a2           # store Y
    add    a2, a2, t1       # increment pointer to y by vl*8
    bnez   a0, loop         # repeat if n != 0
\end{semiverbatim}
\end{frame}


\begin{frame}[fragile]
\frametitle{SV DAXPY assembly (RV64D)}

\begin{semiverbatim}
# a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a
    CSRvect1 = \{type: F, key: a3, val: a3, elwidth: dflt\}
    CSRvect2 = \{type: F, key: a7, val: a7, elwidth: dflt\}
loop:
    setvl t0, a0, 4         # vl = t0 = min(4, n)
    ld    a3, a1            # load 4 registers a3-6 from x
    slli  t1, t0, 3         # t1 = vl * 8 (in bytes)
    ld    a7, a2            # load 4 registers a7-10 from y
    add   a1, a1, t1        # increment pointer to x by vl*8
    fmadd a7, a3, fa0, a7   # a7 += a3 * fa0 (y = a * x + y)
    sub   a0, a0, t0        # n -= vl (t0)
    st    a7, a2            # store 4 registers a7-10 to y
    add   a2, a2, t1        # increment pointer to y by vl*8
    bnez  a0, loop          # repeat if n != 0
\end{semiverbatim}
\end{frame}


\frame{\frametitle{Under consideration}

\begin{itemize}
\item Is C.FNE actually needed? Should it be added if it is?
\item Element type implies polymorphism. Should it be in SV?
\item Should use of registers be allowed to "wrap" (x30 x31 x1 x2)?
\item Is detection of all-scalar ops ok (without slowing pipeline)?
\item Can VSELECT be removed? (it's really complex)
\item Can CLIP be done as a CSR (mode, like elwidth)?
\item SIMD saturation (etc.) also set as a mode?
\item Include src1/src2 predication on Comparison Ops?\\
      (same arrangement as C.MV, with same flexibility/power)
\item For 8/16-bit ops, is it worthwhile adding a "start offset"?\\
      (a bit like misaligned addressing... for registers)\\
      or just use predication to skip the start?
\end{itemize}
}


\frame{\frametitle{What's the downside(s) of SV?}
\begin{itemize}
\item EVERY register operation is inherently parallelised\\
      (scalar ops are just vectors of length 1)\vspace{4pt}
\item Tightly coupled with the core (instruction issue)\\
      could be disabled through MISA switch\vspace{4pt}
\item An extra pipeline phase almost certainly essential\\
      for fast low-latency implementations\vspace{4pt}
\item With zeroing off, skipping non-predicated elements is hard:\\
      it is however an optimisation (and could be skipped).\vspace{4pt}
\item Setting up the Register/Predication tables (interpreting the\\
      CSR key-value stores) might be a bit complex to optimise
      (any change to a CSR key-value entry needs to redo the table)
\end{itemize}
}


\frame{\frametitle{Is this OK (low latency)? Detect scalar-ops (only)}
\begin{center}
\includegraphics[height=2.5in]{scalardetect.png}\\
{\bf \red Detect when all registers are scalar for a given op}
\end{center}
}


\frame{\frametitle{Summary}

\begin{itemize}
\item Actually about parallelism, not Vectors (or SIMD) per se\\
      and NOT about adding new ALU/logic/functionality.
\item Only needs 2 actual instructions (plus the CSRs).\\
      RVV - and "standard" SIMD - require ISA duplication
\item Designed for flexibility (graded levels of complexity)
\item Huge range of implementor freedom
\item Fits RISC-V ethos: achieve more with less
\item Reduces SIMD ISA proliferation by 3-4 orders of magnitude\\
      (without SIMD downsides or sacrificing the speed trade-off)
\item Covers 98\% of RVV, allows RVV to fit "on top"
\item Byproduct of SV is a reduction in code size and power usage\\
      (increased efficiency, just like Compressed)
\end{itemize}
}


\frame{
\begin{center}
{\Huge The end\vspace{20pt}\\
Thank you\vspace{20pt}\\
Questions?\vspace{20pt}
}
\end{center}

\begin{itemize}
\item Discussion: ISA-DEV mailing list
\item http://libre-riscv.org/simple\_v\_extension/
\end{itemize}
}


\end{document}