\documentclass[slidestop]{beamer}
\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{pstricks}

\title{Simple-V RISC-V Extension for Vectorisation and SIMD}
\author{Luke Kenneth Casson Leighton}


\begin{document}

\frame{
\begin{center}
\huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
\vspace{32pt}
\Large{Flexible Vectorisation}\\
\Large{(aka not so Simple-V?)}\\
\Large{(aka How to Parallelise the RISC-V ISA)}\\
\vspace{24pt}
\Large{[proposed for] Chennai 9th RISC-V Workshop}\\
\vspace{16pt}
\large{\today}
\end{center}
}


\frame{\frametitle{Credits and Acknowledgements}

\begin{itemize}
\item The Designers of RISC-V\vspace{15pt}
\item The RVV Working Group and contributors\vspace{15pt}
\item Allen Baum, Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
Guy Lemurieux, Jonathan Neuschafer, Roger Brussee,
and others\vspace{15pt}
\item ISA-Dev Group Members\vspace{10pt}
\end{itemize}
}


\frame{\frametitle{Quick refresher on SIMD}

\begin{itemize}
\item SIMD very easy to implement (and very seductive)\vspace{8pt}
\item Parallelism is in the ALU\vspace{8pt}
\item Zero-to-negligible impact on the rest of the core\vspace{8pt}
\end{itemize}
Where SIMD Goes Wrong:\vspace{10pt}
\begin{itemize}
\item See "SIMD instructions considered harmful"
https://sigarch.org/simd-instructions-considered-harmful
\item Setup and corner-cases alone are extremely complex.\\
Hardware is easy, but software is hell.
\item O($N^{6}$) ISA opcode proliferation (worked through below)!\\
opcode, elwidth, veclen, src1-src2-dest hi/lo
\end{itemize}
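One (hedged) way to see where the $O(N^{6})$ comes from: treat each of the
six variant axes listed above as an independent multiplier,
\begin{center}
$opcodes \times elwidths \times veclens \times src1_{hi/lo} \times
src2_{hi/lo} \times dest_{hi/lo} \;\Rightarrow\; O(N^{6})$
\end{center}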
}

\frame{\frametitle{Quick refresher on RVV}

\begin{itemize}
\item Extremely powerful (extensible to 256 registers)\vspace{10pt}
\item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
\item Requires a separate Register File (32 w/ext to 256)\vspace{10pt}
\item Implemented as a separate pipeline (no impact on scalar)\vspace{10pt}
\end{itemize}
However...\vspace{10pt}
\begin{itemize}
\item 98 percent opcode duplication with the rest of RV (CLIP)
\item Extending RVV requires customisation not just of h/w:\\
gcc and binutils also need customisation (and maintenance)
\end{itemize}
}


\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}

\begin{itemize}
\item Why?
Implementors need flexibility in vectorisation to optimise for
area or performance depending on the scope:
embedded DSP, Mobile GPUs, Server CPUs and more.\vspace{4pt}\\
Compilers also need flexibility in vectorisation to optimise for the cost
of pipeline setup, the amount of state to context-switch,
and software portability\vspace{4pt}
\item How?
By marking INT/FP regs as "Vectorised" and
adding a level of indirection,
SV expresses how existing instructions should act
on [contiguous] blocks of registers, in parallel.\vspace{4pt}
\item What?
Simple-V is an "API" that implicitly extends
existing (scalar) instructions with explicit parallelisation\\
(i.e. SV is actually about parallelism, NOT vectors per se)
\end{itemize}
}


\frame{\frametitle{What's the value of SV? Why adopt it even in non-V?}

\begin{itemize}
\item memcpy becomes much smaller (higher bang-per-buck)
\item context-switch (LOAD/STORE multiple): 1-2 instructions
\item Compressed instrs further reduce I-cache usage (etc.)
\item Greatly-reduced I-cache load (and fewer reads)
\item Amazingly, SIMD becomes (more) tolerable\\
(corner-cases for setup and teardown are gone)
\item Modularity/Abstraction in both the h/w and the toolchain.
\end{itemize}
Note:
\begin{itemize}
\item It's not just about Vectors: it's about instruction effectiveness
\item Anything that makes SIMD tolerable has to be a good thing
\item Anything an implementor is not interested in HW-optimising\\
can fall through to exceptions (i.e. implement it as a trap).
\end{itemize}
}


\frame{\frametitle{How does Simple-V relate to RVV? What's different?}

\begin{itemize}
\item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
\item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
\item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
\item Even Compressed instructions become vectorised (RVV can't)\vspace{10pt}
\end{itemize}
What Simple-V is not:\vspace{10pt}
\begin{itemize}
\item A full supercomputer-level Vector Proposal
\item A replacement for RVV (SV is designed to be over-ridden\\
by - or augmented to become - RVV)
\end{itemize}
}


\frame{\frametitle{How is Parallelism abstracted in Simple-V?}

\begin{itemize}
\item Register "typing" turns any op into an implicit Vector op:\\
registers are reinterpreted through a level of indirection
\item Primarily at the Instruction issue phase (except SIMD)\\
Note: it's ok to pass predication through to ALU (like SIMD)
\item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
\end{itemize}
Note: EVERYTHING is parallelised:
\begin{itemize}
\item All LOAD/STORE (inc. Compressed, Int/FP versions)
\item All ALU ops (Int, FP, SIMD, DSP, everything)
\item All branches become predication targets (C.FNE added?)
\item C.MV of particular interest (s/v, v/v, v/s)
\item FCVT, FMV, FSGNJ etc. very similar to C.MV
\end{itemize}
}


\frame{\frametitle{Implementation Options}

\begin{itemize}
\item Absolute minimum: Exceptions: if CSRs indicate "V", trap.\\
(Requires, as an absolute minimum, that the CSRs be in H/W;\\
a conceptual sketch follows)
\item Hardware loop, single-instruction issue\\
(Do / Don't send through predication to ALU)
\item Hardware loop, parallel (multi-instruction) issue\\
(Do / Don't send through predication to ALU)
\item Hardware loop, full parallel ALU (not recommended)
\end{itemize}
Notes:\vspace{4pt}
\begin{itemize}
\item 4 (or more?) options above may be deployed on a per-op basis
\item SIMD always sends predication bits through to ALU
\item Minimum MVL MUST be sufficient to cover regfile LD/ST
\item Instr. FIFO may repeatedly split off N scalar ops at a time
\end{itemize}
}
% Instr. FIFO may need its own slide. Basically, the vectorised op
% gets pushed into the FIFO, where it is then "processed". Processing
% will remove the first set of ops from its vector numbering (taking
% predication into account), shove them **BACK** into the FIFO,
% and MODIFY the remaining "vectorised" op, subtracting the now-scalar
% ops from it.
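
\begin{frame}[fragile]
\frametitle{Absolute minimum option: trap and emulate (sketch)}

A hedged, runnable toy model of the "absolute minimum" option: the trap
handler runs the element loop in software. All names and state below
(ireg, isvec, VL, trap\_add) are illustrative only, not from the spec.

\begin{semiverbatim}
VL    = 4
ireg  = list(range(32))            # toy integer regfile
isvec = [False]*32                 # "reg is vectorised" CSR state
isvec[8] = isvec[16] = isvec[24] = True

def trap_add(rd, rs1, rs2, predval):   # software element loop
    id = irs1 = irs2 = 0
    for i in range(VL):
        if predval & (1 << i):
            ireg[rd+id] = ireg[rs1+irs1] + ireg[rs2+irs2]
        if isvec[rd]:  id   += 1
        if isvec[rs1]: irs1 += 1
        if isvec[rs2]: irs2 += 1

trap_add(8, 16, 24, 0b1111)        # x8-11 = x16-19 + x24-27
print(ireg[8:12])                  # [40, 42, 44, 46]
\end{semiverbatim}

A real handler would also decode the trapping instruction and advance
the PC past it before returning; that part is omitted here.
\end{frame}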

\frame{\frametitle{Predicated 8-parallel ADD: 1-wide ALU}
\begin{center}
\includegraphics[height=2.5in]{padd9_alu1.png}\\
{\bf \red Predicated adds are shuffled down: 6 cycles in total}
\end{center}
}


\frame{\frametitle{Predicated 8-parallel ADD: 4-wide ALU}
\begin{center}
\includegraphics[height=2.5in]{padd9_alu4.png}\\
{\bf \red Predicated adds are shuffled down: 4 in 1st cycle, 2 in 2nd}
\end{center}
}


\frame{\frametitle{Predicated 8-parallel ADD: 3 phase FIFO expansion}
\begin{center}
\includegraphics[height=2.5in]{padd9_fifo.png}\\
{\bf \red First cycle takes first four 1s; second takes the rest}
\end{center}
}


\frame{\frametitle{How are SIMD Instructions Vectorised?}

\begin{itemize}
\item SIMD ALU(s) primarily unchanged\vspace{6pt}
\item Predication is added to each SIMD element\vspace{6pt}
\item Predication bits sent in groups to the ALU\\
(sketch on the next slide)\vspace{6pt}
\item End of Vector enables (additional) predication\\
(completely nullifies need for end-case code)
\end{itemize}
Considerations:\vspace{4pt}
\begin{itemize}
\item Many SIMD ALUs possible (parallel execution)
\item Implementor free to choose (API remains the same)
\item Unused ALU units are wasted, but s/w is DRASTICALLY simpler
\item Very long SIMD ALUs could waste significant die area
\end{itemize}
}
% Multiple SIMD ALUs of, for example, 32-bit width can either be used
% to issue 64-bit or 128-bit or 256-bit wide SIMD operations,
% or they can be used to cover several operations on totally different
% vectors / registers.
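
\begin{frame}[fragile]
\frametitle{Predicate bits in groups: a runnable toy}

A hedged sketch of "predication bits sent in groups": for a 4-wide SIMD ALU,
4 predicate bits accompany each group of elements, and the final partial
group shows how end-of-vector becomes just more predication. All names
(including the stand-in issue function) are illustrative only.

\begin{semiverbatim}
VL, SIMD_WIDTH = 9, 4
predval = 0b101101101                 # 9 per-element predicate bits

def issue_to_simd_alu(first, nelems, predbits):   # stand-in for the ALU
    print("elements", first, "count", nelems, "pred", bin(predbits))

for group in range(0, VL, SIMD_WIDTH):
    nelems   = min(SIMD_WIDTH, VL - group)        # end-of-vector case
    predbits = (predval >> group) & ((1 << nelems) - 1)
    issue_to_simd_alu(group, nelems, predbits)
\end{semiverbatim}
\end{frame}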

\frame{\frametitle{Predicated 9-parallel SIMD ADD}
\begin{center}
\includegraphics[height=2.5in]{padd9_simd.png}\\
{\bf \red 4-wide 8-bit SIMD, 4 bits of predicate passed to ALU}
\end{center}
}


\frame{\frametitle{What's the deal / juice / score?}

\begin{itemize}
\item Standard Register File(s) overloaded with CSR "reg is vector"\\
(see pseudocode slides for examples)
\item Element width (and type?) concepts remain the same as in RVV\\
(CSRs give new size (and meaning?) to elements in registers)
\item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
\end{itemize}
Key differences from RVV:\vspace{10pt}
\begin{itemize}
\item Predication in INT regs as a BIT field (max VL=XLEN)
\item Minimum VL must be Num Regs - 1 (all regs single LD/ST)
\item SV may condense sparse Vecs: RVV lets ALU do predication
\item Choice to Zero or skip non-predicated elements
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{ADD pseudocode (or trap, or actual hardware loop)}

\begin{semiverbatim}
function op\_add(rd, rs1, rs2, predr) # add not VADD!
 int i, id=0, irs1=0, irs2=0;
 for (i = 0; i < VL; i++)
  if (ireg[predr] & 1<<i) # predication uses intregs
   ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
  if (reg\_is\_vectorised[rd]) \{ id += 1; \}
  if (reg\_is\_vectorised[rs1]) \{ irs1 += 1; \}
  if (reg\_is\_vectorised[rs2]) \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item Above is oversimplified: Reg. indirection left out (for clarity).
\item SIMD slightly more complex (case above is elwidth = default)
\item Scalar-scalar and scalar-vector and vector-vector now all in one
\item OoO may choose to push ADDs into instr. queue (v. busy!)
\end{itemize}
\end{frame}

% yes it really *is* ADD not VADD. that's the entire point of
% this proposal, that *standard* operations are overloaded to
% become vectorised-on-demand


\begin{frame}[fragile]
\frametitle{Predication-Branch (or trap, or actual hardware loop)}

\begin{semiverbatim}
s1 = reg\_is\_vectorised(src1);
s2 = reg\_is\_vectorised(src2);
if (!s2 && !s1) goto branch;
for (int i = 0; i < VL; ++i)
 if (cmp(s1 ? reg[src1+i] : reg[src1],
         s2 ? reg[src2+i] : reg[src2]))
  ireg[rs3] |= 1<<i;
\end{semiverbatim}

\begin{itemize}
\item SIMD slightly more complex (case above is elwidth = default)
\item If s1 and s2 are both scalars, a Standard branch occurs
\item Predication stored in integer regfile as a bitfield\\
(toy example on the next slide)
\item Scalar-vector and vector-vector supported
\item Overload Branch immediate to be predication target rs3
\end{itemize}
\end{frame}
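
\begin{frame}[fragile]
\frametitle{Predication-Branch: runnable toy example}

A minimal runnable sketch (toy state; all names are illustrative) of a
vector-vector compare writing its result as a predicate bitfield into an
integer register, ready for use by later predicated ops.

\begin{semiverbatim}
VL   = 4
ireg = [0]*32
ireg[4:8]  = [1, 5, 3, 7]        # "vector" starting at x4
ireg[8:12] = [2, 5, 3, 9]        # "vector" starting at x8
rs3  = 1                         # predication target register

ireg[rs3] = 0
for i in range(VL):
    if ireg[4+i] == ireg[8+i]:   # BEQ-style compare, both vectorised
        ireg[rs3] |= 1 << i

print(bin(ireg[rs3]))            # 0b110: elements 1 and 2 matched
\end{semiverbatim}
\end{frame}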

\begin{frame}[fragile]
\frametitle{VLD/VLD.S/VLD.X (or trap, or actual hardware loop)}

\begin{semiverbatim}
if (unit-strided) stride = elsize;
else stride = areg[as2]; // constant-strided
for (int i = 0; i < VL; ++i)
 if (preg\_enabled[rd] && ([!]preg[rd] & 1<<i))
  for (int j = 0; j < seglen+1; j++)
   if (reg\_is\_vectorised[rs2]) offs = vreg[rs2+i];
   else offs = i*(seglen+1)*stride;
   vreg[rd+j][i] = mem[sreg[base] + offs + j*stride];
\end{semiverbatim}

\begin{itemize}
\item Again: elwidth != default slightly more complex
\item rs2 vectorised taken to implicitly indicate VLD.X\\
(address-generation sketch on the next slide)
\end{itemize}
\end{frame}
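
\begin{frame}[fragile]
\frametitle{VLD addressing: a hedged sketch}

A toy illustration of the effective addresses generated by the three forms,
assuming byte-granularity strides and offsets, elsize = 8, seglen = 0 and
VL = 4 (all of these values and units are assumptions for illustration only).

\begin{semiverbatim}
base, elsize, VL, seglen = 0x1000, 8, 4, 0

unit    = [base + i*(seglen+1)*elsize for i in range(VL)]  # VLD
strided = [base + i*(seglen+1)*24     for i in range(VL)]  # VLD.S, areg[as2]=24
indexed = [base + offs for offs in (0, 64, 8, 32)]         # VLD.X, offs=vreg[rs2+i]

print([hex(a) for a in unit])     # ['0x1000', '0x1008', '0x1010', '0x1018']
print([hex(a) for a in strided])  # ['0x1000', '0x1018', '0x1030', '0x1048']
\end{semiverbatim}
\end{frame}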


\frame{\frametitle{Why are overlaps allowed in Regfiles?}

\begin{itemize}
\item Same register(s) can have multiple "interpretations"
\item Set "real" register (scalar) without needing to set/unset CSRs.
\item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops
\item (32-bit GREV plus 4x8-bit SIMD plus 32-bit GREV:\\
GREV @ VL=N,wid=32; SIMD @ VL=Nx4,wid=8)
\item RGB 565 (video): BEXTW plus 4x8-bit SIMD plus BDEPW\\
(BEXT/BDEP @ VL=N,wid=32; SIMD @ VL=Nx4,wid=8)
\item Same register(s) can be offset (no need for VSLIDE)\vspace{6pt}
\end{itemize}
Note:
\begin{itemize}
\item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$)
\item Hi-Performance: Macro-op fusion (more pipeline stages?)
\end{itemize}
}


\frame{\frametitle{To Zero or not to place zeros in non-predicated elements?}

\begin{itemize}
\item Zeroing is an implementation optimisation favouring OoO
\item Simple implementations may skip non-predicated operations
\item With Zeroing, simple implementations have to explicitly destroy data
\item Complex implementations may use reg-renames to save power\\
Zeroing on predication chains makes optimisation harder
\item Compromise: REQUIRE both (specified in predication CSRs);\\
a sketch of the two behaviours follows
\end{itemize}
Considerations:
\begin{itemize}
\item Complex implementations not really impacted; simple ones impacted a LOT\\
by Zeroing... however it is useful (memzero)
\item Non-zero'd overlapping "Vectors" may issue overlapping ops\\
(2nd op's predicated elements slot in 1st's non-predicated ops)
\item Please don't use Vectors for "security" (use Sec-Ext)
\end{itemize}
}
% with overlapping "vectors" - bearing in mind that "vectors" are
% just a remap onto the standard register file, if the top bits of
% predication are zero, and there happens to be a second vector
% that uses some of the same register file that happens to be
% predicated out, the second vector op may be issued *at the same time*
% if there are available parallel ALUs to do so.
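
\begin{frame}[fragile]
\frametitle{Zeroing vs skipping: a runnable toy}

A minimal runnable sketch of the two behaviours for non-predicated elements
(toy state only; vs1, vs2, predval and padd are illustrative names): with
zeroing, stale destination values are explicitly destroyed; with skipping,
they survive.

\begin{semiverbatim}
VL      = 4
predval = 0b0101                  # elements 0 and 2 are active
vs1     = [1, 2, 3, 4]
vs2     = [10, 20, 30, 40]

def padd(zeroing):
    out = [99, 99, 99, 99]        # stale destination contents
    for i in range(VL):
        if predval & (1 << i):
            out[i] = vs1[i] + vs2[i]
        elif zeroing:
            out[i] = 0            # explicitly destroy the element
        # else: skip entirely; the stale value survives
    return out

print(padd(zeroing=False))        # [11, 99, 33, 99]
print(padd(zeroing=True))         # [11, 0, 33, 0]
\end{semiverbatim}
\end{frame}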


\frame{\frametitle{Predication key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{6pt}
\item register to be predicated if referred to (5 bits, key)\vspace{6pt}
\item register to store actual predication in (5 bits, value)\vspace{6pt}
\item predication is inverted Y/N (1 bit)\vspace{6pt}
\item non-predicated elements are to be zero'd Y/N (1 bit)\vspace{6pt}
\end{itemize}
Notes:\vspace{10pt}
\begin{itemize}
\item Table should be expanded out for high-speed implementations\\
(packed-entry decode sketch on the next slide)
\item Multiple "keys" (and values) theoretically permitted
\item RVV rules about deleting higher-indexed CSRs are followed
\end{itemize}
}
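
\begin{frame}[fragile]
\frametitle{Predication CSR entry: packed-field sketch}

A hedged sketch of one predication CSR entry packed into 13 bits. The field
order and bit positions are assumptions made purely for illustration; only
the field list and widths come from the previous slide.

\begin{semiverbatim}
def decode_pred_entry(entry):           # 13-bit entry, assumed layout
    typ     = (entry >> 12) & 0x1       # 0 = int regfile key, 1 = FP
    regkey  = (entry >> 7)  & 0x1f      # register to be predicated (key)
    predidx = (entry >> 2)  & 0x1f      # int reg holding the predicate (value)
    inv     = (entry >> 1)  & 0x1       # invert the predicate?
    zero    = (entry >> 0)  & 0x1       # zero non-predicated elements?
    return typ, regkey, predidx, inv, zero

print(decode_pred_entry(0b0_00101_00011_1_0))   # (0, 5, 3, 1, 0)
\end{semiverbatim}
\end{frame}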


\begin{frame}[fragile]
\frametitle{Predication key-value CSR table decoding pseudocode}

\begin{semiverbatim}
struct pred fp\_pred[32];
struct pred int\_pred[32];

for (i = 0; i < 16; i++) // 16 CSRs?
   tb = int\_pred if CSRpred[i].type == 0 else fp\_pred
   idx = CSRpred[i].regidx
   tb[idx].zero = CSRpred[i].zero
   tb[idx].inv = CSRpred[i].inv
   tb[idx].predidx = CSRpred[i].predidx
   tb[idx].enabled = true
\end{semiverbatim}

\begin{itemize}
\item All 64 (int and FP) entries are zero'd before setting
\item Might be a bit complex to set up (TBD)
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{Get Predication value pseudocode}

\begin{semiverbatim}
def get\_pred\_val(bool is\_fp\_op, int reg):
   tb = fp\_pred if is\_fp\_op else int\_pred
   if (!tb[reg].enabled):
      return ~0x0 // all ops enabled
   predidx = tb[reg].predidx // redirection occurs HERE
   predicate = intreg[predidx] // actual predicate HERE
   if (tb[reg].inv):
      predicate = ~predicate
   return predicate
\end{semiverbatim}

\begin{itemize}
\item References a different (internal) mapping table for INT or FP
\item Actual predicate bitmask ALWAYS comes from the INT regfile
\end{itemize}

\end{frame}


\frame{\frametitle{Register key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{6pt}
\item treated as vector if referred to in op (5 bits, key)\vspace{6pt}
\item starting register to actually be used (5 bits, value)\vspace{6pt}
\item element bitwidth: default/8/16/32/64/rsvd (3 bits)\vspace{6pt}
\item element type: still under consideration\vspace{6pt}
\end{itemize}
Notes:\vspace{10pt}
\begin{itemize}
\item Same notes apply as for the predication CSR table
\item Level of indirection has implications for pipeline latency\\
(toy lookup on the next slide)
\end{itemize}
}
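
\begin{frame}[fragile]
\frametitle{Register redirection: a runnable toy lookup}

A minimal runnable sketch of the issue-time lookup in isolation (toy state;
isvector, regidx and resolve are illustrative names): a register marked as a
key is redirected to the "starting register to actually be used".

\begin{semiverbatim}
isvector = [False]*32
regidx   = list(range(32))       # identity unless remapped
isvector[3] = True
regidx[3]   = 10                 # "x3 is a vector starting at x10"

def resolve(r):                  # the level of indirection, per operand
    return regidx[r] if isvector[r] else r

print(resolve(3), resolve(5))    # 10 5
\end{semiverbatim}
\end{frame}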


\begin{frame}[fragile]
\frametitle{Register key-value CSR table decoding pseudocode}

\begin{semiverbatim}
struct vectorised fp\_vec[32];
struct vectorised int\_vec[32];

for (i = 0; i < 16; i++) // 16 CSRs?
   tb = int\_vec if CSRvectortb[i].type == 0 else fp\_vec
   idx = CSRvectortb[i].regidx
   tb[idx].elwidth = CSRvectortb[i].elwidth
   tb[idx].regidx = CSRvectortb[i].regidx
   tb[idx].isvector = true
\end{semiverbatim}

\begin{itemize}
\item All 64 (int and FP) entries are zero'd before setting
\item Might be a bit complex to set up (TBD)
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{ADD pseudocode with redirection, this time}

\begin{semiverbatim}
function op\_add(rd, rs1, rs2) # add not VADD!
 int i, id=0, irs1=0, irs2=0;
 rd = int\_vec[rd ].isvector ? int\_vec[rd ].regidx : rd;
 rs1 = int\_vec[rs1].isvector ? int\_vec[rs1].regidx : rs1;
 rs2 = int\_vec[rs2].isvector ? int\_vec[rs2].regidx : rs2;
 predval = get\_pred\_val(FALSE, rd);
 for (i = 0; i < VL; i++)
  if (predval \& 1<<i) # predication uses intregs
   ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
  if (int\_vec[rd ].isvector)  \{ id += 1; \}
  if (int\_vec[rs1].isvector)  \{ irs1 += 1; \}
  if (int\_vec[rs2].isvector)  \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item SIMD (elwidth != default) not covered above
\end{itemize}
\end{frame}


\frame{\frametitle{C.MV extremely flexible!}

\begin{itemize}
\item scalar-to-vector (w/ no pred): VSPLAT
\item scalar-to-vector (w/ dest-pred): Sparse VSPLAT
\item scalar-to-vector (w/ 1-bit dest-pred): VINSERT
\item vector-to-scalar (w/ [1-bit?] src-pred): VEXTRACT
\item vector-to-vector (w/ no pred): Vector Copy
\item vector-to-vector (w/ src pred): Vector Gather
\item vector-to-vector (w/ dest pred): Vector Scatter
\item vector-to-vector (w/ src \& dest pred): Vector Gather/Scatter
\end{itemize}
\vspace{4pt}
Notes:
\begin{itemize}
\item Surprisingly powerful! Zero-predication even more so
\item Same arrangement for FCVT, FMV, FSGNJ etc.
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{MV pseudocode with predication}

\begin{semiverbatim}
function op\_mv(rd, rs) # MV not VMV!
 rd = int\_vec[rd].isvector ? int\_vec[rd].regidx : rd;
 rs = int\_vec[rs].isvector ? int\_vec[rs].regidx : rs;
 ps = get\_pred\_val(FALSE, rs); # predication on src
 pd = get\_pred\_val(FALSE, rd); # ... AND on dest
 for (int i = 0, int j = 0; i < VL && j < VL;)
  if (int\_vec[rs].isvector) while (!(ps \& 1<<i)) i++;
  if (int\_vec[rd].isvector) while (!(pd \& 1<<j)) j++;
  ireg[rd+j] <= ireg[rs+i];
  if (int\_vec[rs].isvector) i++;
  if (int\_vec[rd].isvector) j++;
\end{semiverbatim}

\begin{itemize}
\item elwidth != default not covered above (might be a bit hairy)
\item Ending early with 1-bit predication not included (VINSERT)\\
(a VSPLAT walk-through follows on the next slide)
\end{itemize}
\end{frame}
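
\begin{frame}[fragile]
\frametitle{C.MV as VSPLAT: runnable walk-through}

A minimal runnable sketch (toy state, no predication; regfile, rd\_isvec and
rs\_isvec are illustrative names) showing how the op\_mv loop above
degenerates into VSPLAT when the destination is marked vectorised and the
source is left scalar.

\begin{semiverbatim}
VL      = 4
regfile = [0]*32
regfile[1] = 99                   # scalar source x1
rd, rs  = 4, 1
rd_isvec, rs_isvec = True, False  # per the register key-value CSR

i = j = 0
for _ in range(VL):
    regfile[rd + j] = regfile[rs + i]
    if rs_isvec: i += 1           # (False here: scalar src, index stays 0)
    if rd_isvec: j += 1           # (True here: vector dest, index advances)

print(regfile[4:8])               # [99, 99, 99, 99]  i.e. VSPLAT
\end{semiverbatim}

Swapping the flags gives the vector-to-scalar case; adding 1-bit source
predication then selects which element is extracted (VEXTRACT), and src/dest
predication together give the Gather/Scatter cases from the C.MV slide.
\end{frame}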


\begin{frame}[fragile]
\frametitle{VSELECT: stays or goes? Stays if MV.X exists...}

\begin{semiverbatim}
def op_mv_x(rd, rs): # (hypothetical) RV MV.X
    rs = regfile[rs]            # level of indirection (MV.X)
    regfile[rd] = regfile[rs]   # straight regcopy
\end{semiverbatim}

Vectorised version aka "VSELECT":

\begin{semiverbatim}
def op_mv_x(rd, rs): # SV version of MV.X
    for i in range(VL):
        rs1 = regfile[rs+i]           # indirection
        regfile[rd+i] = regfile[rs1]  # straight regcopy
\end{semiverbatim}

\begin{itemize}
\item However MV.X does not exist in RV, so VSELECT cannot exist either
\item \red SV is not about adding new functionality, only parallelism
\end{itemize}


\end{frame}


\frame{\frametitle{Opcodes, compared to RVV}

\begin{itemize}
\item All integer and FP opcodes removed (no CLIP, no FNE)
\item VMPOP, VFIRST etc. all removed (use xBitManip)
\item VSLIDE removed (use regfile overlaps)
\item C.MV covers VEXTRACT, VINSERT and VSPLAT (and more)
\item Vector (or scalar-vector) copy: use C.MV (MV is a pseudo-op)
\item VMERGE: twin predicated C.MVs (one inverted, macro-op'd)
\item VSETVL, VGETVL stay (the only ops that do!)
\end{itemize}
Issues:
\begin{itemize}
\item VSELECT stays? no MV.X, so no (add with custom ext?)
\item VSNE exists, but no FNE (use predication inversion?)
\item VCLIP is not in RV* (add with custom ext?)
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{Example C code: DAXPY}

\begin{semiverbatim}
void daxpy(size_t n, double a,
           const double x[], double y[])
\{
    for (size_t i = 0; i < n; i++) \{
        y[i] = a*x[i] + y[i];
    \}
\}
\end{semiverbatim}

\begin{itemize}
\item See "SIMD Considered Harmful" for SIMD/RVV analysis\\
https://sigarch.org/simd-instructions-considered-harmful/
\end{itemize}


\end{frame}


\begin{frame}[fragile]
\frametitle{RVV DAXPY assembly (RV32V)}

\begin{semiverbatim}
# a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a
   li t0, 2<<25
   vsetdcfg t0 # enable 2 64b Fl.Pt. registers
loop:
   setvl t0, a0 # vl = t0 = min(mvl, n)
   vld v0, a1 # load vector x
   slli t1, t0, 3 # t1 = vl * 8 (in bytes)
   vld v1, a2 # load vector y
   add a1, a1, t1 # increment pointer to x by vl*8
   vfmadd v1, v0, fa0, v1 # v1 += v0 * fa0 (y = a * x + y)
   sub a0, a0, t0 # n -= vl (t0)
   vst v1, a2 # store Y
   add a2, a2, t1 # increment pointer to y by vl*8
   bnez a0, loop # repeat if n != 0
\end{semiverbatim}
\end{frame}


\begin{frame}[fragile]
\frametitle{SV DAXPY assembly (RV64D)}

\begin{semiverbatim}
# a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a
   CSRvect1 = \{type: F, key: a3, val: a3, elwidth: dflt\}
   CSRvect2 = \{type: F, key: a7, val: a7, elwidth: dflt\}
loop:
   setvl t0, a0, 4 # vl = t0 = min(4, n)
   ld a3, a1 # load 4 registers a3-6 from x
   slli t1, t0, 3 # t1 = vl * 8 (in bytes)
   ld a7, a2 # load 4 registers a7-10 from y
   add a1, a1, t1 # increment pointer to x by vl*8
   fmadd a7, a3, fa0, a7 # v1 += v0 * fa0 (y = a * x + y)
   sub a0, a0, t0 # n -= vl (t0)
   st a7, a2 # store 4 registers a7-10 to y
   add a2, a2, t1 # increment pointer to y by vl*8
   bnez a0, loop # repeat if n != 0
\end{semiverbatim}
\end{frame}


\frame{\frametitle{Under consideration}

\begin{itemize}
\item Is C.FNE actually needed? Should it be added if it is?
\item Element type implies polymorphism. Should it be in SV?
\item Should use of registers be allowed to "wrap" (x30 x31 x1 x2)?
\item Is detection of all-scalar ops ok (without slowing pipeline)?
\item Can VSELECT be removed? (it's really complex)
\item Can CLIP be done as a CSR (mode, like elwidth)?
\item SIMD saturation (etc.) also set as a mode?
\item Include src1/src2 predication on Comparison Ops?\\
(same arrangement as C.MV, with same flexibility/power)
\item For 8/16-bit ops, is it worthwhile adding a "start offset"?\\
(a bit like misaligned addressing... for registers)\\
or just use predication to skip start?
\end{itemize}
}


\frame{\frametitle{What's the downside(s) of SV?}
\begin{itemize}
\item EVERY register operation is inherently parallelised\\
(scalar ops are just vectors of length 1)\vspace{4pt}
\item Tightly coupled with the core (instruction issue)\\
could be disabled through a MISA switch\vspace{4pt}
\item An extra pipeline phase is pretty much essential\\
for fast low-latency implementations\vspace{4pt}
\item With zeroing off, skipping non-predicated elements is hard:\\
it is however an optimisation (and could be skipped).\vspace{4pt}
\item Setting up the Register/Predication tables (interpreting the\\
CSR key-value stores) might be a bit complex to optimise
(any change to a CSR key-value entry requires the table to be redone)
\end{itemize}
}


\frame{\frametitle{Is this OK (low latency)? Detect scalar-ops (only)}
\begin{center}
\includegraphics[height=2.5in]{scalardetect.png}\\
{\bf \red Detect when all registers are scalar for a given op}
\end{center}
}


\frame{\frametitle{Summary}

\begin{itemize}
\item Actually about parallelism, not Vectors (or SIMD) per se,\\
and NOT about adding new ALU/logic/functionality.
\item Only needs 2 actual instructions (plus the CSRs).\\
RVV - and "standard" SIMD - require ISA duplication
\item Designed for flexibility (graded levels of complexity)
\item Huge range of implementor freedom
\item Fits RISC-V ethos: achieve more with less
\item Reduces SIMD ISA proliferation by 3-4 orders of magnitude\\
(without the SIMD downsides and without sacrificing speed)
\item Covers 98\% of RVV, allows RVV to fit "on top"
\item A byproduct of SV is a reduction in code size, power usage
etc. (increased efficiency, just like Compressed)
\end{itemize}
}


\frame{
\begin{center}
{\Huge The end\vspace{20pt}\\
Thank you\vspace{20pt}\\
Questions?\vspace{20pt}
}
\end{center}

\begin{itemize}
\item Discussion: ISA-DEV mailing list
\item http://libre-riscv.org/simple\_v\_extension/
\end{itemize}
}


\end{document}