simple_v_extension/simple_v_chennai_2018.tex

   1 \documentclass[slidestop]{beamer}
   2 \usepackage{beamerthemesplit}
   3 \usepackage{graphics}
   4 \usepackage{pstricks}
   5
   6 \title{Simple-V RISC-V Extension for Vectorisation and SIMD}
   7 \author{Luke Kenneth Casson Leighton}
   8
   9
  10 \begin{document}
  11
  12 \frame{
  13    \begin{center}
  14     \huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
  15     \vspace{32pt}
  16     \Large{Flexible Vectorisation}\\
  17     \Large{(aka not so Simple-V?)}\\
  18     \vspace{24pt}
  19     \Large{[proposed for] Chennai 9th RISC-V Workshop}\\
  20     \vspace{24pt}
  21     \large{\today}
  22   \end{center}
  23 }
  24
  25
  26 \frame{\frametitle{Credits and Acknowledgements}
  27
  28  \begin{itemize}
  29    \item The Designers of RISC-V\vspace{15pt}
  30    \item The RVV Working Group and contributors\vspace{15pt}
  31    \item Jacob Bachmeyer, Xan Phung, Chuanhua Chang and others\vspace{15pt}
  32    \item ISA-Dev Group Members\vspace{10pt}
  33   \end{itemize}
  34 }
  35
  36
  37 \frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}
  38
  39  \begin{itemize}
  40    \item Vectorisation needs to fit (be useful within) an implementor's\\
  41              scope: RV32E, Embedded/Mobile, DSP, Servers and more.\vspace{15pt}
  42    \item By implicitly marking INT/FP regs as ``Vectorised'',\\
  43              everything else follows from there.\vspace{15pt}
  44    \item A Standard Vector ``API'' with flexibility for implementors:\\
  45              choice to optimise for area or performance as desired\vspace{10pt}
  46   \end{itemize}
  47 }
  48
  49
  50 \frame{\frametitle{Why another Vector Extension?}
  51
  52  \begin{itemize}
  53    \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
  54    \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
  55    \item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
  56    \item Even Compressed instructions become vectorised\vspace{10pt}
  57   \end{itemize}
  58   What Simple-V is not:\vspace{10pt}
  59    \begin{itemize}
  60    \item A full supercomputer-level Vector Proposal\vspace{10pt}
  61    \item A replacement for RVV (designed to be augmented)\vspace{10pt}
  62   \end{itemize}
  63 }
  64
  65
  66 \frame{\frametitle{Quick refresher on SIMD}
  67
  68  \begin{itemize}
  69    \item SIMD very easy to implement (and very seductive)\vspace{10pt}
  70    \item Parallelism is in the ALU\vspace{10pt}
  71    \item Zero-to-Negligible impact for rest of core\vspace{10pt}
  72   \end{itemize}
  73   Where SIMD Goes Wrong:\vspace{10pt}
  74    \begin{itemize}
  75    \item See ``SIMD instructions considered harmful''
  76    \url{https://www.sigarch.org/simd-instructions-considered-harmful}
  77    \item Corner-cases alone are extremely complex.\\
  78              Hardware is easy, but software is hell.
  79    \item O($N^{6}$) ISA opcode proliferation!\\
  80              opcode, elwidth, veclen, src1-src2-dest hi/lo
  81   \end{itemize}
  82 }
  83
  84 \frame{\frametitle{Quick refresher on RVV}
  85
  86  \begin{itemize}
  87    \item Extremely powerful (extensible to 256 registers)\vspace{10pt}
  88    \item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
  89    \item Requires a separate Register File\vspace{10pt}
  90    \item Can be implemented as a separate pipeline\vspace{10pt}
  91   \end{itemize}
  92   However...\vspace{10pt}
  93    \begin{itemize}
  94    \item 98 percent opcode duplication with rest of RV (CLIP)
  95    \item Extending RVV requires customisation not just of h/w:\\
  96              gcc and s/w also need customisation (and maintenance)
  97   \end{itemize}
  98 }
  99
 100
 101 \frame{\frametitle{How is Parallelism abstracted?}
 102
 103  \begin{itemize}
 104    \item Register ``typing'' turns any op into an implicit Vector op\vspace{10pt}
 105    \item Primarily at the Instruction issue phase (except SIMD)\vspace{10pt}
 106    \item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
 107   \end{itemize}
 108   Notes:\vspace{10pt}
 109    \begin{itemize}
 110    \item LOAD/STORE (inc. C.LD and C.ST, LD.X: everything)
 111    \item All ALU ops (soft / hybrid / full HW, on per-op basis)
 112    \item All branches become predication targets (C.FNE added)
 113    \item C.MV of particular interest (s/v, v/v, v/s)
 114   \end{itemize}
 115 }
 116
 117
 118 \frame{\frametitle{Implementation Options}
 119
 120  \begin{itemize}
 121    \item Absolute minimum: Exceptions (if CSRs indicate ``V'', trap)\vspace{10pt}
 122    \item Hardware loop, single-instruction issue\vspace{10pt}
 123    \item Hardware loop, parallel (multi-instruction) issue\vspace{10pt}
 124    \item Hardware loop, full parallel ALU (not recommended)\vspace{10pt}
 125   \end{itemize}
 126   Notes:\vspace{10pt}
 127   \begin{itemize}
 128    \item 4 (or more?) options above may be deployed on per-op basis
 129    \item Minimum MVL MUST be sufficient to cover regfile LD/ST
 130    \item Instr. FIFO may repeatedly split off N scalar ops at a time
 131   \end{itemize}
 132 }
 133 % Instr. FIFO may need its own slide.  Basically, the vectorised op
 134 % gets pushed into the FIFO, where it is then "processed".  Processing
 135 % will remove the first set of ops from its vector numbering (taking
 136 % predication into account) and shove them **BACK** into the FIFO,
 137 % while MODIFYING the remaining "vectorised" op, subtracting the now
 138 % scalar ops from it.
 139
 140 \frame{\frametitle{How are SIMD Instructions Vectorised?}
 141
 142  \begin{itemize}
 143    \item SIMD ALU(s) primarily unchanged\vspace{10pt}
 144    \item Predication is added to each SIMD element (NO ZEROING!)\vspace{10pt}
 145    \item End of Vector enables predication (NO ZEROING!)\vspace{10pt}
 146   \end{itemize}
 147   Considerations:\vspace{10pt}
 148    \begin{itemize}
 149    \item Many SIMD ALUs possible (parallel execution)\vspace{10pt}
 150    \item Very long SIMD ALUs could waste die area (short vectors)\vspace{10pt}
 151    \item Implementor free to choose (API remains the same)\vspace{10pt}
 152   \end{itemize}
 153 }
 154 % With multiple SIMD ALUs at for example 32-bit wide they can be used
 155 % to either issue 64-bit or 128-bit or 256-bit wide SIMD operations
 156 % or they can be used to cover several operations on totally different
 157 % vectors / registers.
 158
 159 \frame{\frametitle{What's the deal / juice / score?}
 160
 161  \begin{itemize}
 162    \item Standard Register File(s) overloaded with ``vector span''\vspace{10pt}
 163    \item Element width and type concepts remain same as RVV\vspace{10pt}
 164    \item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
 165   \end{itemize}
 166   Key differences from RVV:\vspace{10pt}
 167    \begin{itemize}
 168    \item Predication in INT regs as a BIT field (max VL=XLEN)\vspace{10pt}
 169    \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)\vspace{10pt}
 170    \item NO ZEROING: non-predicated elements are skipped\vspace{10pt}
 171   \end{itemize}
 172 }
 173
 174
 175 \frame{\frametitle{Why are overlaps allowed in Regfiles?}
 176
 177  \begin{itemize}
 178    \item Same register(s) can have multiple ``interpretations''\vspace{10pt}
 179    \item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops\vspace{10pt}
 180    \item (32-bit GREV plus 4x8-bit SIMD plus 32-bit GREV)\vspace{10pt}
 181    \item Same register(s) can be offset (no need for VSLIDE)\vspace{10pt}
 182   \end{itemize}
 183   Note:\vspace{10pt}
 184    \begin{itemize}
 185    \item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$) \vspace{10pt}
 186    \item Hi-Performance: Macro-op fusion (more pipeline stages?)\vspace{10pt}
 187   \end{itemize}
 188 }
 189
 190
 191 \frame{\frametitle{Why no Zeroing (place zeros in non-predicated elements)?}
 192
 193  \begin{itemize}
 194    \item Zeroing is an implementation optimisation favouring OoO\vspace{8pt}
 195    \item Simple implementations may skip non-predicated operations\vspace{8pt}
 196    \item Simple implementations explicitly have to destroy data\vspace{8pt}
 197    \item Complex implementations may use reg-renames to save power\\
 198              Zeroing on predication chains makes optimisation harder
 199   \end{itemize}
 200   Considerations:\vspace{10pt}
 201   \begin{itemize}
 202    \item Complex not really impacted, Simple impacted a LOT
 203    \item Overlapping ``Vectors'' may issue overlapping ops
 204    \item Please don't use Vectors for ``security'' (use Sec-Ext)
 205   \end{itemize}
 206 }
 207 % with overlapping "vectors" - bearing in mind that "vectors" are
 208 % just a remap onto the standard register file, if the top bits of
 209 % predication are zero, and there happens to be a second vector
 210 % that uses some of the same register file that happens to be
 211 % predicated out, the second vector op may be issued *at the same time*
 212 % if there are available parallel ALUs to do so.
 213
 214
 215 \frame{\frametitle{Predication key-value CSR store}
 216
 217  \begin{itemize}
 218    \item key is int regfile number or FP regfile number (1 bit)\vspace{10pt}
 219    \item register to be predicated if referred to (5 bits, key)\vspace{10pt}
 220    \item register to store actual predication in (5 bits, value)\vspace{10pt}
 221    \item predication is inverted (1 bit)\vspace{10pt}
 222   \end{itemize}
 223   Notes:\vspace{10pt}
 224    \begin{itemize}
 225    \item Table should be expanded out for high-speed implementations
 226    \item Multiple ``keys'' (and values) theoretically permitted
 227    \item RVV rules about deleting higher-indexed CSRs followed
 228   \end{itemize}
 229 }
 230
 231
 232 \frame{\frametitle{Register key-value CSR store}
 233
 234  \begin{itemize}
 235    \item key is int regfile number or FP regfile number (1 bit)\vspace{10pt}
 236    \item register to be predicated if referred to (5 bits, key)\vspace{10pt}
 237    \item register to store actual predication in (5 bits, value)\vspace{10pt}
 238    \item TODO\vspace{10pt}
 239   \end{itemize}
 240   Notes:\vspace{10pt}
 241    \begin{itemize}
 242    \item Table should be expanded out for high-speed implementations
 243    \item Multiple ``keys'' (and values) theoretically permitted
 244    \item RVV rules about deleting higher-indexed CSRs followed
 245   \end{itemize}
 246 }
 247
 248
 249 \begin{frame}[fragile]
 250 \frametitle{ADD pseudocode (or trap, or actual hardware loop)}
 251
 252 \begin{semiverbatim}
 253 function op_add(rd, rs1, rs2, predr) # add not VADD!
 254   int i, id=0, irs1=0, irs2=0;
 255   for (i=0; i < MIN(VL, vectorlen[rd]); i++)
 256     if (ireg[predr] & 1<<i) # predication uses intregs
 257        ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
 258     if (reg_is_vectorised[rd]) \{ id += 1; \}
 259     if (reg_is_vectorised[rs1]) \{ irs1 += 1; \}
 260     if (reg_is_vectorised[rs2]) \{ irs2 += 1; \}
 261 \end{semiverbatim}
 262
 263   \begin{itemize}
 264    \item SIMD slightly more complex (case above is elwidth = default)
 265    \item Scalar-scalar and scalar-vector and vector-vector now all in one
 266    \item OoO may choose to push ADDs into instr. queue (v. busy!)
 267   \end{itemize}
 268 \end{frame}
 269
 270 \begin{frame}[fragile]
 271 \frametitle{Predication-Branch (or trap, or actual hardware loop)}
 272
 273 \begin{semiverbatim}
 274 s1 = vectorlen[src1] > 1;
 275 s2 = vectorlen[src2] > 1;
 276 for (int i = 0; i < VL; ++i)
 277    preg[rs3] |= cmp(s1 ? reg[src1+i] : reg[src1],
 278                     s2 ? reg[src2+i] : reg[src2]) << i;
 279 \end{semiverbatim}
 280
 281   \begin{itemize}
 282    \item SIMD slightly more complex (case above is elwidth = default)
 283    \item If s1 and s2 both scalars, Standard branch occurs
 284    \item Predication stored in integer regfile as a bitfield
 285    \item Scalar-vector and vector-vector supported
 286   \end{itemize}
 287 \end{frame}
 288
 289 \begin{frame}[fragile]
 290 \frametitle{LD/LD.S/LD.X (or trap, or actual hardware loop)}
 291
 292 \begin{semiverbatim}
 293 if (unit-strided) stride = elsize;
 294 else stride = areg[as2]; // constant-strided
 295 for (int i = 0; i < VL; ++i)
 296   if (preg_enabled[rd] && ([!]preg[rd] & 1<<i))
 297     for (int j = 0; j < seglen+1; j++)
 298       if (vectorised[rs2]) offs = vreg[rs2][i]
 299       else offs = i*(seglen+1)*stride;
 300       vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
 301 \end{semiverbatim}
 302
 303   \begin{itemize}
 304    \item Again: SIMD slightly more complex
 305    \item rs2 vectorised taken to implicitly indicate LD.X
 306   \end{itemize}
 307 \end{frame}
 308
 309
 310 \frame{\frametitle{C.MV extremely flexible!}
 311
 312  \begin{itemize}
 313    \item scalar-to-vector (w/no pred): VSPLAT
 314    \item scalar-to-vector (w/dest-pred): Sparse VSPLAT
 315    \item scalar-to-vector (w/single dest-pred): VINSERT
 316    \item vector-to-scalar (w/src-pred): VEXTRACT
 317    \item vector-to-vector (w/no pred): Vector Copy
 318    \item vector-to-vector (w/src xor dest pred): Sparse Vector Copy
 319    \item vector-to-vector (w/src and dest pred): Vector Shuffle
 320   \end{itemize}
 321   \vspace{8pt}
 322   Notes:\vspace{10pt}
 323    \begin{itemize}
 324    \item Really powerful!
 325    \item Any other options?
 326   \end{itemize}
 327 }
 328
 329
 330 \frame{\frametitle{Opcodes, compared to RVV}
 331
 332  \begin{itemize}
 333    \item All integer and FP opcodes removed (no CLIP!)\vspace{8pt}
 334    \item VMPOP, VFIRST etc. all removed (use xBitManip)\vspace{8pt}
 335    \item VSLIDE removed (use regfile overlaps)\vspace{8pt}
 336    \item C.MV covers VEXTRACT VINSERT and VSPLAT (and more)\vspace{8pt}
 337    \item VSETVL, VGETVL, VSELECT stay\vspace{8pt}
 338    \item Issue: VCLIP is not in RV* (add with custom ext?)\vspace{8pt}
 339    \item Vector (or scalar-vector) use C.MV (MV is a pseudo-op)\vspace{8pt}
 340    \item VMERGE: twin predicated C.MVs (one inverted, macro-op'd)\vspace{8pt}
 341   \end{itemize}
 342 }
 343
 344
 345 \frame{\frametitle{Under consideration}
 346
 347  \begin{itemize}
 348    \item Can VSELECT be removed? (it's really complex)\vspace{10pt}
 349    \item Can CLIP be done as a CSR (mode, like elwidth)?\vspace{10pt}
 350    \item SIMD saturation (etc.) also set as a mode?\vspace{10pt}
 351    \item C.MV src predication no different from dest predication\\
 352          What to do? Make one have different meaning?\vspace{10pt}
 353    \item 8/16-bit ops: is it worthwhile adding a ``start offset''? \\
 354          (a bit like misaligned addressing... for registers)\\
 355          or just use predication to skip start?\vspace{10pt}
 356   \end{itemize}
 357 }
 358
 359
 360 \frame{\frametitle{Summary}
 361
 362  \begin{itemize}
 363    \item Designed for simplicity (graded levels of complexity)\vspace{10pt}
 364    \item Fits RISC-V ethos: do more with less\vspace{10pt}
 365    \item Reduces SIMD ISA proliferation by 3-4 orders of magnitude \\
 366              (without SIMD downsides or sacrificing speed trade-off)\vspace{10pt}
 367    \item Covers 98\% of RVV, allows RVV to fit ``on top''\vspace{10pt}
 368    \item Huge range of implementor freedom and flexibility\vspace{10pt}
 369    \item Not designed for supercomputing (that's RVV), designed for
 370          in between: DSPs, RV32E, Embedded 3D GPUs etc.\vspace{10pt}
 371   \end{itemize}
 372 }
 373
 374
 375 \frame{\frametitle{slide}
 376
 377  \begin{itemize}
 378    \item \vspace{10pt}
 379   \end{itemize}
 380   Considerations:\vspace{10pt}
 381   \begin{itemize}
 382    \item \vspace{10pt}
 383   \end{itemize}
 384 }
 385
 386
 387 \frame{\frametitle{Including a plot}
 388  \begin{center}
 389 %  \includegraphics[height=2in]{dental.ps}\\
 390   {\bf \red Dental trajectories for 27 children:}
 391  \end{center}
 392 }
 393
 394 \frame{\frametitle{Creating .pdf slides in WinEdt}
 395
 396  \begin{itemize}
 397    \item LaTeX [Shift-Control-L]\vspace{10pt}
 398    \item dvi2pdf [click the button]\vspace{24pt}
 399   \end{itemize}
 400   To print 4 slides per page in acrobat click\vspace{10pt}
 401    \begin{itemize}
 402    \item File/print/properties\vspace{10pt}
 403    \item Change ``pages per sheet'' to 4\vspace{10pt}
 404   \end{itemize}
 405 }
 406
 407 \frame{
 408   \begin{center}
 409     {\Huge \red The end}
 410   \end{center}
 411 }
 412
 413
 414 \end{document}