\documentclass[slidestop]{beamer}
\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{pstricks}

\title{Simple-V RISC-V Extension for Vectorisation and SIMD}
\author{Luke Kenneth Casson Leighton}


\begin{document}

\frame{
\begin{center}
\huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
\vspace{32pt}
\Large{Flexible Vectorisation}\\
\Large{(aka not so Simple-V?)}\\
\Large{(aka How to Parallelise the RISC-V ISA)}\\
\vspace{24pt}
\Large{[proposed for] Chennai 9th RISC-V Workshop}\\
\vspace{16pt}
\large{\today}
\end{center}
}


\frame{\frametitle{Credits and Acknowledgements}

\begin{itemize}
\item The Designers of RISC-V\vspace{15pt}
\item The RVV Working Group and contributors\vspace{15pt}
\item Allen Baum, Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
Guy Lemurieux, Jonathan Neuschafer, Roger Brussee,
and others\vspace{15pt}
\item ISA-Dev Group Members\vspace{10pt}
\end{itemize}
}

\frame{\frametitle{Quick refresher on SIMD}

\begin{itemize}
\item SIMD very easy to implement (and very seductive)\vspace{8pt}
\item Parallelism is in the ALU\vspace{8pt}
\item Zero-to-negligible impact on rest of core\vspace{8pt}
\end{itemize}
Where SIMD Goes Wrong:\vspace{10pt}
\begin{itemize}
\item See "SIMD instructions considered harmful"
https://sigarch.org/simd-instructions-considered-harmful
\item Setup and corner-cases alone are extremely complex.\\
Hardware is easy, but software is hell.
\item O($N^{6}$) ISA opcode proliferation!\\
opcode, elwidth, veclen, src1-src2-dest hi/lo
\end{itemize}
}

\frame{\frametitle{Quick refresher on RVV}

\begin{itemize}
\item Extremely powerful (extensible to 256 registers)\vspace{10pt}
\item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
\item Requires a separate Register File (16 w/ext to 256)\vspace{10pt}
\item Implemented as a separate pipeline (no impact on scalar)\vspace{10pt}
\end{itemize}
However...\vspace{10pt}
\begin{itemize}
\item 98 percent opcode duplication with rest of RV (CLIP)
\item Extending RVV requires customisation not just of h/w:\\
gcc, binutils also need customisation (and maintenance)
\end{itemize}
}


\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}

\begin{itemize}
\item Why?
Implementors need flexibility in vectorisation to optimise for
area or performance depending on the scope:
embedded DSP, Mobile GPUs, Server CPUs and more.\vspace{4pt}\\
Compilers also need flexibility in vectorisation to optimise for cost
of pipeline setup, amount of state to context switch
and software portability.\vspace{4pt}
\item How?
By marking INT/FP regs as "Vectorised" and
adding a level of indirection,
SV expresses how existing instructions should act
on [contiguous] blocks of registers, in parallel.\vspace{4pt}
\item What?
Simple-V is an "API" that implicitly extends
existing (scalar) instructions with explicit parallelisation\\
(i.e. SV is actually about parallelism NOT vectors per se)
\end{itemize}
}


\frame{\frametitle{What's the value of SV? Why adopt it even in non-V?}

\begin{itemize}
\item memcpy becomes much smaller (higher bang-per-buck)
\item context-switch (LOAD/STORE multiple): 1-2 instructions
\item Compressed instrs further reduce I-cache usage (etc.)
\item Greatly-reduced I-cache load (and fewer reads)
\item Amazingly, SIMD becomes (more) tolerable\\
(corner-cases for setup and teardown are gone)
\item Modularity/Abstraction in both the h/w and the toolchain.
\end{itemize}
Note:
\begin{itemize}
\item It's not just about Vectors: it's about instruction effectiveness
\item Anything that makes SIMD tolerable has to be a good thing
\item Anything an implementor is not interested in HW-optimising\\
can fall through to exceptions (implement as a trap).
\end{itemize}
}


\frame{\frametitle{How does Simple-V relate to RVV? What's different?}

\begin{itemize}
\item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
\item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
\item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
\item Even Compressed instructions become vectorised (RVV can't)\vspace{10pt}
\end{itemize}
What Simple-V is not:\vspace{10pt}
\begin{itemize}
\item A full supercomputer-level Vector Proposal
\item A replacement for RVV (SV is designed to be overridden\\
by - or augmented to become - RVV)
\end{itemize}
}

\frame{\frametitle{How is Parallelism abstracted in Simple-V?}

\begin{itemize}
\item Register "typing" turns any op into an implicit Vector op:\\
registers are reinterpreted through a level of indirection
\item Primarily at the Instruction issue phase (except SIMD)\\
Note: it's ok to pass predication through to ALU (like SIMD)
\item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
\end{itemize}
Note: EVERYTHING is parallelised:
\begin{itemize}
\item All LOAD/STORE (inc. Compressed, Int/FP versions)
\item All ALU ops (soft / hybrid / full HW, on per-op basis)
\item All branches become predication targets (C.FNE added?)
\item C.MV of particular interest (s/v, v/v, v/s)
\item FCVT, FMV, FSGNJ etc. very similar to C.MV
\end{itemize}
}

\frame{\frametitle{Implementation Options}

\begin{itemize}
\item Absolute minimum: Exceptions (if CSRs indicate "V", trap)
\item Hardware loop, single-instruction issue\\
(Do / Don't send through predication to ALU)
\item Hardware loop, parallel (multi-instruction) issue\\
(Do / Don't send through predication to ALU)
\item Hardware loop, full parallel ALU (not recommended)
\end{itemize}
Notes:\vspace{6pt}
\begin{itemize}
\item 4 (or more?) options above may be deployed on a per-op basis
\item SIMD always sends predication bits through to ALU
\item Minimum MVL MUST be sufficient to cover regfile LD/ST
\item Instr. FIFO may repeatedly split off N scalar ops at a time
\end{itemize}
}
% Instr. FIFO may need its own slide. Basically, the vectorised op
% gets pushed into the FIFO, where it is then "processed". Processing
% will remove the first set of ops from its vector numbering (taking
% predication into account) and shove them **BACK** into the FIFO,
% MODIFYING the remaining "vectorised" op, subtracting the now-scalar
% ops from it.
% (an illustrative C sketch of this splitting follows the FIFO diagram slide)

\frame{\frametitle{Predicated 8-parallel ADD: 1-wide ALU}
\begin{center}
\includegraphics[height=2.5in]{padd9_alu1.png}\\
{\bf \red Predicated adds are shuffled down: 6 cycles in total}
\end{center}
}


\frame{\frametitle{Predicated 8-parallel ADD: 4-wide ALU}
\begin{center}
\includegraphics[height=2.5in]{padd9_alu4.png}\\
{\bf \red Predicated adds are shuffled down: 4 in 1st cycle, 2 in 2nd}
\end{center}
}


\frame{\frametitle{Predicated 8-parallel ADD: 3 phase FIFO expansion}
\begin{center}
\includegraphics[height=2.5in]{padd9_fifo.png}\\
{\bf \red First cycle takes first four 1s; second takes the rest}
\end{center}
}
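

\begin{frame}[fragile]
\frametitle{FIFO expansion: illustrative C sketch}

A minimal, compilable C model of the Instr. FIFO splitting above
(sketch only: the struct fields, ISSUE\_WIDTH and the test predicate
are assumptions, not part of the proposal):

\begin{semiverbatim}
#include <stdint.h>
#include <assert.h>
#define ISSUE_WIDTH 4   // assumption: 4 scalar ops per cycle
typedef struct \{ int rd, rs1, rs2, next, vl;
                 uint64_t pred; \} vec_op;
static int ops_issued = 0;
static void issue_scalar_add(int rd, int rs1, int rs2) \{
  (void)rd; (void)rs1; (void)rs2; // stand-in for real issue stage
  ops_issued++;
\}
static int fifo_step(vec_op *op) \{  // one "processing" pass
  int issued = 0;
  while (op->next < op->vl && issued < ISSUE_WIDTH) \{
    if (op->pred & (1ULL << op->next)) \{ // skip masked elems
      issue_scalar_add(op->rd + op->next,
                       op->rs1 + op->next, op->rs2 + op->next);
      issued++;
    \}
    op->next++;             // remaining vector op is MODIFIED
  \}
  return op->next < op->vl; // non-zero: re-queue in the FIFO
\}
int main(void) \{
  vec_op add = \{ 8, 16, 24, 0, 8, 0xb7 \}; // VL=8, sparse pred
  int cycles = 0;
  do \{ cycles++; \} while (fifo_step(&add));
  assert(cycles == 2 && ops_issued == 6);
  return 0;
\}
\end{semiverbatim}

\begin{itemize}
\item Test values mirror the diagram: four predicated adds issue in
cycle 1, the remaining two in cycle 2
\end{itemize}
\end{frame}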


\frame{\frametitle{How are SIMD Instructions Vectorised?}

\begin{itemize}
\item SIMD ALU(s) primarily unchanged\vspace{6pt}
\item Predication is added to each SIMD element\vspace{6pt}
\item Predication bits sent in groups to the ALU\vspace{6pt}
\item End of Vector enables (additional) predication\vspace{10pt}
\end{itemize}
Considerations:\vspace{4pt}
\begin{itemize}
\item Many SIMD ALUs possible (parallel execution)
\item Implementor free to choose (API remains the same)
\item Unused ALU units wasted, but s/w DRASTICALLY simpler
\item Very long SIMD ALUs could waste significant die area
\end{itemize}
}
% With multiple SIMD ALUs (at, for example, 32-bit wide) they can be
% used either to issue 64-bit or 128-bit or 256-bit wide SIMD operations,
% or to cover several operations on totally different
% vectors / registers.

\frame{\frametitle{Predicated 9-parallel SIMD ADD}
\begin{center}
\includegraphics[height=2.5in]{padd9_simd.png}\\
{\bf \red 4-wide 8-bit SIMD, 4 bits of predicate passed to ALU}
\end{center}
}


\frame{\frametitle{What's the deal / juice / score?}

\begin{itemize}
\item Standard Register File(s) overloaded with CSR "reg is vector"\\
(see pseudocode slides for examples)
\item Element width (and type?) concepts remain the same as in RVV\\
(CSRs give new size (and meaning?) to elements in registers)
\item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
\end{itemize}
Key differences from RVV:\vspace{10pt}
\begin{itemize}
\item Predication in INT regs as a BIT field (max VL=XLEN)
\item Minimum VL must be Num Regs - 1 (all regs single LD/ST)
\item SV may condense sparse Vecs: RVV lets ALU do predication
\item Choice to Zero or skip non-predicated elements
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{ADD pseudocode (or trap, or actual hardware loop)}

\begin{semiverbatim}
function op\_add(rd, rs1, rs2, predr) # add not VADD!
  int i, id=0, irs1=0, irs2=0;
  for (i = 0; i < VL; i++)
    if (ireg[predr] & 1<<i) # predication uses intregs
       ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
    if (reg\_is\_vectorised[rd])  \{ id += 1; \}
    if (reg\_is\_vectorised[rs1]) \{ irs1 += 1; \}
    if (reg\_is\_vectorised[rs2]) \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item Above is oversimplified: Reg. indirection left out (for clarity).
\item SIMD slightly more complex (case above is elwidth = default)
\item Scalar-scalar and scalar-vector and vector-vector now all in one
\item OoO may choose to push ADDs into instr. queue (v. busy!)
\end{itemize}
\end{frame}

% yes it really *is* ADD not VADD. that's the entire point of
% this proposal, that *standard* operations are overloaded to
% become vectorised-on-demand

\begin{frame}[fragile]
\frametitle{Predication-Branch (or trap, or actual hardware loop)}

\begin{semiverbatim}
s1 = reg\_is\_vectorised(src1);
s2 = reg\_is\_vectorised(src2);
if (!s2 && !s1) goto branch;
for (int i = 0; i < VL; ++i)
  if (cmp(s1 ? reg[src1+i] : reg[src1],
          s2 ? reg[src2+i] : reg[src2]))
    ireg[rs3] |= 1<<i;
\end{semiverbatim}

\begin{itemize}
\item SIMD slightly more complex (case above is elwidth = default)
\item If s1 and s2 are both scalars, a standard branch occurs
\item Predication stored in integer regfile as a bitfield
\item Scalar-vector and vector-vector supported
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{VLD/VLD.S/VLD.X (or trap, or actual hardware loop)}

\begin{semiverbatim}
if (unit-strided) stride = elsize;
else stride = areg[as2]; // constant-strided
for (int i = 0; i < VL; ++i)
  if (preg\_enabled[rd] && ([!]preg[rd] & 1<<i))
    for (int j = 0; j < seglen+1; j++)
      if (reg\_is\_vectorised[rs2]) offs = vreg[rs2+i]
      else offs = i*(seglen+1)*stride;
      vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
\end{semiverbatim}

\begin{itemize}
\item Again: elwidth != default is slightly more complex
\item rs2 being vectorised is taken to implicitly indicate VLD.X
\end{itemize}
\end{frame}


\frame{\frametitle{Why are overlaps allowed in Regfiles?}

\begin{itemize}
\item Same register(s) can have multiple "interpretations"
\item Set "real" register (scalar) without needing to set/unset CSRs.
\item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops
\item (32-bit GREV plus 4x8-bit SIMD plus 32-bit GREV:\\
GREV @ VL=N,wid=32; SIMD @ VL=Nx4,wid=8)
\item RGB 565 (video): BEXTW plus 4x8-bit SIMD plus BDEPW\\
(BEXT/BDEP @ VL=N,wid=32; SIMD @ VL=Nx4,wid=8)
\item Same register(s) can be offset (no need for VSLIDE)\vspace{6pt}
\end{itemize}
Note:
\begin{itemize}
\item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$)
\item Hi-Performance: Macro-op fusion (more pipeline stages?)
\end{itemize}
}


\frame{\frametitle{To Zero or not to place zeros in non-predicated elements?}

\begin{itemize}
\item Zeroing is an implementation optimisation favouring OoO
\item Simple implementations may skip non-predicated operations
\item With zeroing, simple implementations explicitly have to destroy data
\item Complex implementations may use reg-renames to save power\\
Zeroing on predication chains makes optimisation harder
\item Compromise: REQUIRE both (specified in predication CSRs).
\end{itemize}
Considerations:
\begin{itemize}
\item Complex implementations not really impacted; simple ones impacted\\
a LOT by Zeroing... however it's useful (memzero)
\item Non-zero'd overlapping "Vectors" may issue overlapping ops\\
(2nd op's predicated elements slot in 1st's non-predicated ops)
\item Please don't use Vectors for "security" (use Sec-Ext)
\end{itemize}
}
% with overlapping "vectors" - bearing in mind that "vectors" are
% just a remap onto the standard register file, if the top bits of
% predication are zero, and there happens to be a second vector
% that uses some of the same register file that happens to be
% predicated out, the second vector op may be issued *at the same time*
% if there are available parallel ALUs to do so.


\frame{\frametitle{Predication key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{6pt}
\item register to be predicated if referred to (5 bits, key)\vspace{6pt}
\item register to store actual predication in (5 bits, value)\vspace{6pt}
\item predication is inverted Y/N (1 bit)\vspace{6pt}
\item non-predicated elements are to be zero'd Y/N (1 bit)\vspace{6pt}
\end{itemize}
Notes:\vspace{10pt}
\begin{itemize}
\item Table should be expanded out for high-speed implementations
\item Multiple "keys" (and values) theoretically permitted
\item RVV rules about deleting higher-indexed CSRs are followed
\end{itemize}
}
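

\begin{frame}[fragile]
\frametitle{Predication CSR entry: one possible C packing (sketch)}

The slide above only fixes the field widths (1+5+5+1+1 = 13 bits);
the bit positions below are an assumption, purely for illustration:

\begin{semiverbatim}
#include <stdint.h>
#include <stdbool.h>
typedef struct \{
  bool    type;    // 1 bit: 0 = INT regfile, 1 = FP regfile
  uint8_t regidx;  // 5 bits: register to be predicated (key)
  uint8_t predidx; // 5 bits: register holding predicate (value)
  bool    inv;     // 1 bit: invert the predicate
  bool    zero;    // 1 bit: zero non-predicated elements
\} pred_entry;
pred_entry unpack_pred_entry(uint16_t csr) \{
  pred_entry e;
  e.type    = (csr >> 0)  & 0x1;
  e.regidx  = (csr >> 1)  & 0x1f;
  e.predidx = (csr >> 6)  & 0x1f;
  e.inv     = (csr >> 11) & 0x1;
  e.zero    = (csr >> 12) & 0x1;
  return e;
\}
\end{semiverbatim}

\begin{itemize}
\item Field names chosen to line up with the decoding pseudocode that follows
\end{itemize}
\end{frame}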


\begin{frame}[fragile]
\frametitle{Predication key-value CSR table decoding pseudocode}

\begin{semiverbatim}
struct pred fp\_pred[32];
struct pred int\_pred[32];

for (i = 0; i < 16; i++) // 16 CSRs?
  tb = int\_pred if CSRpred[i].type == 0 else fp\_pred
  idx = CSRpred[i].regidx
  tb[idx].zero = CSRpred[i].zero
  tb[idx].inv = CSRpred[i].inv
  tb[idx].predidx = CSRpred[i].predidx
  tb[idx].enabled = true
\end{semiverbatim}

\begin{itemize}
\item All 64 (int and FP) entries are zero'd before setting
\item Might be a bit complex to set up (TBD)
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{Get Predication value pseudocode}

\begin{semiverbatim}
def get\_pred\_val(bool is\_fp\_op, int reg):
   tb = fp\_pred if is\_fp\_op else int\_pred
   if (!tb[reg].enabled):
      return ~0x0 // all ops enabled
   predidx = tb[reg].predidx // redirection occurs HERE
   predicate = intreg[predidx] // actual predicate HERE
   if (tb[reg].inv):
      predicate = ~predicate
   return predicate
\end{semiverbatim}

\begin{itemize}
\item References a different (internal) mapping table for INT vs FP
\item Actual predicate bitmask ALWAYS from the INT regfile
\end{itemize}

\end{frame}


\frame{\frametitle{Register key-value CSR store}

\begin{itemize}
\item key is int regfile number or FP regfile number (1 bit)\vspace{6pt}
\item treated as vector if referred to in op (5 bits, key)\vspace{6pt}
\item starting register to actually be used (5 bits, value)\vspace{6pt}
\item element bitwidth: default/8/16/32/64/rsvd (3 bits)\vspace{6pt}
\item element type: still under consideration\vspace{6pt}
\end{itemize}
Notes:\vspace{10pt}
\begin{itemize}
\item Same notes apply (previous slide) as for predication CSR table
\item Level of indirection has implications for pipeline latency
\end{itemize}
}
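

\begin{frame}[fragile]
\frametitle{Register CSR entry: one possible C packing (sketch)}

Companion to the predication-entry sketch: only the field widths
(1+5+5+3 = 14 bits) come from the slide above; the bit positions
are again an assumption for illustration:

\begin{semiverbatim}
#include <stdint.h>
#include <stdbool.h>
typedef struct \{
  bool    type;    // 1 bit: 0 = INT regfile, 1 = FP regfile
  uint8_t key;     // 5 bits: register treated as a vector (key)
  uint8_t regidx;  // 5 bits: starting register actually used (value)
  uint8_t elwidth; // 3 bits: default/8/16/32/64/rsvd
\} vec_entry;
vec_entry unpack_vec_entry(uint16_t csr) \{
  vec_entry e;
  e.type    = (csr >> 0)  & 0x1;
  e.key     = (csr >> 1)  & 0x1f;
  e.regidx  = (csr >> 6)  & 0x1f;
  e.elwidth = (csr >> 11) & 0x7;
  return e;
\}
\end{semiverbatim}

\begin{itemize}
\item element type (still under consideration) is left out of the sketch
\end{itemize}
\end{frame}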


\begin{frame}[fragile]
\frametitle{Register key-value CSR table decoding pseudocode}

\begin{semiverbatim}
struct vectorised fp\_vec[32];
struct vectorised int\_vec[32];

for (i = 0; i < 16; i++) // 16 CSRs?
  tb = int\_vec if CSRvectortb[i].type == 0 else fp\_vec
  idx = CSRvectortb[i].regidx
  tb[idx].elwidth = CSRvectortb[i].elwidth
  tb[idx].regidx = CSRvectortb[i].regidx
  tb[idx].isvector = true
\end{semiverbatim}

\begin{itemize}
\item All 64 (int and FP) entries are zero'd before setting
\item Might be a bit complex to set up (TBD)
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{ADD pseudocode with redirection, this time}

\begin{semiverbatim}
function op\_add(rd, rs1, rs2) # add not VADD!
  int i, id=0, irs1=0, irs2=0;
  rd  = int\_vec[rd ].isvector ? int\_vec[rd ].regidx : rd;
  rs1 = int\_vec[rs1].isvector ? int\_vec[rs1].regidx : rs1;
  rs2 = int\_vec[rs2].isvector ? int\_vec[rs2].regidx : rs2;
  predval = get\_pred\_val(FALSE, rd);
  for (i = 0; i < VL; i++)
    if (predval & 1<<i) # predication uses intregs
       ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
    if (int\_vec[rd ].isvector)  \{ id += 1; \}
    if (int\_vec[rs1].isvector)  \{ irs1 += 1; \}
    if (int\_vec[rs2].isvector)  \{ irs2 += 1; \}
\end{semiverbatim}

\begin{itemize}
\item SIMD (elwidth != default) not covered above
\end{itemize}
\end{frame}
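

\begin{frame}[fragile]
\frametitle{ADD reference model: runnable C sketch}

The previous pseudocode slides collapsed into one compilable C model
(sketch only: table sizes, the 128-entry regfile, VL=4 and the test
values are assumptions; FP tables are omitted):

\begin{semiverbatim}
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#define VL 4
typedef struct \{ bool isvector; uint8_t regidx; \} vec_entry;
typedef struct \{ bool enabled, inv; uint8_t predidx; \} pred_entry;
static uint64_t   ireg[128];    // extended (redirected) regfile
static vec_entry  int_vec[32];
static pred_entry int_pred[32];
static uint64_t get_pred_val(bool is_fp_op, int reg) \{
  (void)is_fp_op;               // FP table left out of the sketch
  if (!int_pred[reg].enabled) return ~0x0ULL; // all elems active
  uint64_t p = ireg[int_pred[reg].predidx];
  return int_pred[reg].inv ? ~p : p;
\}
static void op_add(int rd, int rs1, int rs2) \{ // ADD, not VADD
  int id = 0, irs1 = 0, irs2 = 0;
  rd  = int_vec[rd ].isvector ? int_vec[rd ].regidx : rd;
  rs1 = int_vec[rs1].isvector ? int_vec[rs1].regidx : rs1;
  rs2 = int_vec[rs2].isvector ? int_vec[rs2].regidx : rs2;
  uint64_t predval = get_pred_val(false, rd);
  for (int i = 0; i < VL; i++) \{
    if (predval & (1ULL << i))
      ireg[rd+id] = ireg[rs1+irs1] + ireg[rs2+irs2];
    if (int_vec[rd ].isvector) id   += 1;
    if (int_vec[rs1].isvector) irs1 += 1;
    if (int_vec[rs2].isvector) irs2 += 1;
  \}
\}
int main(void) \{
  int_vec[8]  = (vec_entry)\{ true, 8 \};  // x8  = vector of 4
  int_vec[16] = (vec_entry)\{ true, 16 \}; // x16 = vector of 4
  for (int i = 0; i < VL; i++) ireg[16+i] = 10*(i+1);
  ireg[3] = 5;                  // x3 stays scalar
  op_add(8, 16, 3);             // x8..x11 = x16..x19 + x3
  for (int i = 0; i < VL; i++) assert(ireg[8+i] == 10*(i+1)+5);
  return 0;
\}
\end{semiverbatim}

\begin{itemize}
\item Mirrors the pseudocode, so (like the DAXPY example later) it
assumes the CSR key equals the start register
\end{itemize}
\end{frame}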


\frame{\frametitle{C.MV extremely flexible!}

\begin{itemize}
\item scalar-to-vector (w/ no pred): VSPLAT
\item scalar-to-vector (w/ dest-pred): Sparse VSPLAT
\item scalar-to-vector (w/ 1-bit dest-pred): VINSERT
\item vector-to-scalar (w/ [1-bit?] src-pred): VEXTRACT
\item vector-to-vector (w/ no pred): Vector Copy
\item vector-to-vector (w/ src pred): Vector Gather
\item vector-to-vector (w/ dest pred): Vector Scatter
\item vector-to-vector (w/ src \& dest pred): Vector Gather/Scatter
\end{itemize}
\vspace{4pt}
Notes:
\begin{itemize}
\item Surprisingly powerful!
\item Same arrangement for FCVT, FMV, FSGNJ etc.
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{MV pseudocode with predication}

\begin{semiverbatim}
function op\_mv(rd, rs) # MV not VMV!
  rd = int\_vec[rd].isvector ? int\_vec[rd].regidx : rd;
  rs = int\_vec[rs].isvector ? int\_vec[rs].regidx : rs;
  ps = get\_pred\_val(FALSE, rs); # predication on src
  pd = get\_pred\_val(FALSE, rd); # ... AND on dest
  for (int i = 0, int j = 0; i < VL && j < VL;):
    if (int\_vec[rs].isvec) while (!(ps & 1<<i)) i++;
    if (int\_vec[rd].isvec) while (!(pd & 1<<j)) j++;
    ireg[rd+j] <= ireg[rs+i];
    if (int\_vec[rs].isvec) i++;
    if (int\_vec[rd].isvec) j++;
\end{semiverbatim}

\begin{itemize}
\item elwidth != default not covered above (might be a bit hairy)
\item Ending early with 1-bit predication not included (VINSERT)
\end{itemize}
\end{frame}
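

\begin{frame}[fragile]
\frametitle{C.MV as VSPLAT / VEXTRACT: worked C sketch}

Stand-alone C model of the loop above (sketch only: the vector tables
are replaced by explicit flags and masks, VL=4, and the register
numbers/values are assumptions):

\begin{semiverbatim}
#include <stdint.h>
#include <stdbool.h>
#include <assert.h>
#define VL 4
static uint64_t reg[32];
void op_mv(int rd, bool rdvec, uint64_t pd,
           int rs, bool rsvec, uint64_t ps) \{
  for (int i = 0, j = 0; i < VL && j < VL; ) \{
    if (rsvec) while (i < VL && !(ps & (1ULL<<i))) i++;
    if (rdvec) while (j < VL && !(pd & (1ULL<<j))) j++;
    if (i >= VL || j >= VL) break; // predicate exhausted
    reg[rd+j] = reg[rs+i];
    if (rsvec) i++;
    if (rdvec) j++;
    if (!rsvec && !rdvec) break;   // plain scalar-scalar MV
  \}
\}
int main(void) \{
  reg[5] = 99;
  op_mv(8, true, ~0ULL, 5, false, ~0ULL);   // VSPLAT: x8..x11=99
  for (int i = 0; i < VL; i++) assert(reg[8+i] == 99);
  op_mv(3, false, ~0ULL, 8, true, 1ULL<<2); // VEXTRACT elem 2
  assert(reg[3] == reg[10]);
  return 0;
\}
\end{semiverbatim}
\end{frame}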


\begin{frame}[fragile]
\frametitle{VSELECT: stays or goes? Stays if MV.X exists...}

\begin{semiverbatim}
def op_mv_x(rd, rs):          # (hypothetical) RV MV.X
    rs = regfile[rs]          # level of indirection (MV.X)
    regfile[rd] = regfile[rs] # straight regcopy
\end{semiverbatim}

Vectorised version aka "VSELECT":

\begin{semiverbatim}
def op_mv_x(rd, rs):          # SV version of MV.X
    for i in range(VL):
        rs1 = regfile[rs+i]           # indirection
        regfile[rd+i] = regfile[rs1]  # straight regcopy
\end{semiverbatim}

\begin{itemize}
\item However MV.X does not exist in RV, so neither can VSELECT
\item SV is not about adding new functionality, only parallelism
\end{itemize}

\end{frame}


\frame{\frametitle{Opcodes, compared to RVV}

\begin{itemize}
\item All integer and FP opcodes removed (no CLIP, FNE)
\item VMPOP, VFIRST etc. all removed (use xBitManip)
\item VSLIDE removed (use regfile overlaps)
\item C.MV covers VEXTRACT, VINSERT and VSPLAT (and more)
\item VSETVL, VGETVL stay (the only ones that do!)
\item VSELECT stays? no MV.X (add with custom ext?)
\item VSNE exists, but no FNE (use predication inversion?)
\item Issue: VCLIP is not in RV* (add with custom ext?)
\item Vector (or scalar-vector) moves use C.MV (MV is a pseudo-op)
\item VMERGE: twin predicated C.MVs (one inverted, macro-op'd)
\end{itemize}
}


\begin{frame}[fragile]
\frametitle{Example C code: DAXPY}

\begin{semiverbatim}
void daxpy(size_t n, double a,
           const double x[], double y[])
\{
    for (size_t i = 0; i < n; i++) \{
        y[i] = a*x[i] + y[i];
    \}
\}
\end{semiverbatim}

\begin{itemize}
\item See "SIMD Considered Harmful" for SIMD/RVV analysis\\
https://sigarch.org/simd-instructions-considered-harmful/
\end{itemize}

\end{frame}


\begin{frame}[fragile]
\frametitle{RVV DAXPY assembly (RV32V)}

\begin{semiverbatim}
# a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a
  li t0, 2<<25
  vsetdcfg t0            # enable 2 64b Fl.Pt. registers
loop:
  setvl t0, a0           # vl = t0 = min(mvl, n)
  vld v0, a1             # load vector x
  slli t1, t0, 3         # t1 = vl * 8 (in bytes)
  vld v1, a2             # load vector y
  add a1, a1, t1         # increment pointer to x by vl*8
  vfmadd v1, v0, fa0, v1 # v1 += v0 * fa0 (y = a * x + y)
  sub a0, a0, t0         # n -= vl (t0)
  vst v1, a2             # store Y
  add a2, a2, t1         # increment pointer to y by vl*8
  bnez a0, loop          # repeat if n != 0
\end{semiverbatim}
\end{frame}


\begin{frame}[fragile]
\frametitle{SV DAXPY assembly (RV64D)}

\begin{semiverbatim}
# a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a
  CSRvect1 = \{type: F, key: a3, val: a3, elwidth: dflt\}
  CSRvect2 = \{type: F, key: a7, val: a7, elwidth: dflt\}
loop:
  setvl t0, a0, 4       # vl = t0 = min(4, n)
  ld a3, a1             # load 4 registers a3-6 from x
  slli t1, t0, 3        # t1 = vl * 8 (in bytes)
  ld a7, a2             # load 4 registers a7-10 from y
  add a1, a1, t1        # increment pointer to x by vl*8
  fmadd a7, a3, fa0, a7 # v1 += v0 * fa0 (y = a * x + y)
  sub a0, a0, t0        # n -= vl (t0)
  st a7, a2             # store 4 registers a7-10 to y
  add a2, a2, t1        # increment pointer to y by vl*8
  bnez a0, loop         # repeat if n != 0
\end{semiverbatim}
\end{frame}


\frame{\frametitle{Under consideration}

\begin{itemize}
\item Is C.FNE actually needed? Should it be added if it is?
\item Element type implies polymorphism. Should it be in SV?
\item Should use of registers be allowed to "wrap" (x30 x31 x1 x2)?
\item Is detection of all-scalar ops ok (without slowing pipeline)?
\item Can VSELECT be removed? (it's really complex)
\item Can CLIP be done as a CSR (mode, like elwidth)?
\item SIMD saturation (etc.) also set as a mode?
\item Include src1/src2 predication on Comparison Ops?\\
(same arrangement as C.MV, with same flexibility/power)
\item For 8/16-bit ops, is it worthwhile adding a "start offset"?\\
(a bit like misaligned addressing... for registers)\\
or just use predication to skip the start?
\end{itemize}
}


\frame{\frametitle{What's the downside(s) of SV?}
\begin{itemize}
\item EVERY register operation is inherently parallelised\\
(scalar ops are just vectors of length 1)\vspace{4pt}
\item Tightly coupled with the core (instruction issue);\\
could be disabled through a MISA switch\vspace{4pt}
\item An extra pipeline phase is pretty much essential\\
for fast low-latency implementations\vspace{4pt}
\item With zeroing off, skipping non-predicated elements is hard:\\
it is however only an optimisation (and could be omitted).\vspace{4pt}
\item Setting up the Register/Predication tables (interpreting the\\
CSR key-value stores) might be a bit complex to optimise
(any change to a CSR key-value entry needs to redo the table)
\end{itemize}
}


\frame{\frametitle{Is this OK (low latency)? Detect scalar-ops (only)}
\begin{center}
\includegraphics[height=2.5in]{scalardetect.png}\\
{\bf \red Detect when all registers are scalar for a given op}
\end{center}
}


\frame{\frametitle{Summary}

\begin{itemize}
\item Actually about parallelism, not Vectors (or SIMD) per se\\
and NOT about adding new ALU/logic/functionality.
\item Only needs 2 actual instructions (plus the CSRs).\\
RVV - and "standard" SIMD - require ISA duplication
\item Designed for flexibility (graded levels of complexity)
\item Huge range of implementor freedom
\item Fits RISC-V ethos: achieve more with less
\item Reduces SIMD ISA proliferation by 3-4 orders of magnitude\\
(without the SIMD downsides, and without sacrificing speed)
\item Covers 98\% of RVV, allows RVV to fit "on top"
\item A byproduct of SV is a reduction in code size, power usage
etc. (increased efficiency, just like Compressed)
\end{itemize}
}


\frame{
\begin{center}
{\Huge The end\vspace{20pt}\\
Thank you\vspace{20pt}\\
Questions?\vspace{20pt}
}
\end{center}

\begin{itemize}
\item Discussion: ISA-DEV mailing list
\item http://libre-riscv.org/simple\_v\_extension/
\end{itemize}
}


\end{document}