simple_v_extension/simple_v_chennai_2018.tex

   1 \documentclass[slidestop]{beamer}
   2 \usepackage{beamerthemesplit}
   3 \usepackage{graphics}
   4 \usepackage{pstricks}
   5
   6 \title{Simple-V RISC-V Extension for Vectorisation and SIMD}
   7 \author{Luke Kenneth Casson Leighton}
   8
   9
  10 \begin{document}
  11
  12 \frame{
  13    \begin{center}
  14     \huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
  15     \vspace{32pt}
  16     \Large{Flexible Vectorisation}\\
  17     \Large{(aka not so Simple-V?)}\\
  18     \vspace{24pt}
  19     \Large{[proposed for] Chennai 9th RISC-V Workshop}\\
  20     \vspace{24pt}
  21     \large{\today}
  22   \end{center}
  23 }
  24
  25
  26 \frame{\frametitle{Credits and Acknowledgements}
  27
  28  \begin{itemize}
  29    \item The Designers of RISC-V\vspace{15pt}
  30    \item The RVV Working Group and contributors\vspace{15pt}
  31    \item Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
  32              Guy Lemurieux, Jonathan Neuschafer, Roger Bruisse,
  33              and others\vspace{15pt}
  34    \item ISA-Dev Group Members\vspace{10pt}
  35   \end{itemize}
  36 }
  37
  38
  39 \frame{\frametitle{Quick refresher on SIMD}
  40
  41  \begin{itemize}
  42    \item SIMD very easy to implement (and very seductive)\vspace{10pt}
  43    \item Parallelism is in the ALU\vspace{10pt}
  44    \item Zero-to-Negligible impact for rest of core\vspace{10pt}
  45   \end{itemize}
  46   Where SIMD Goes Wrong:\vspace{10pt}
  47    \begin{itemize}
  48    \item See ``SIMD instructions considered harmful''
  49    \url{https://www.sigarch.org/simd-instructions-considered-harmful}
  50    \item Corner-cases alone are extremely complex.\\
  51              Hardware is easy, but software is hell.
  52    \item O($N^{6}$) ISA opcode proliferation!\\
  53              opcode, elwidth, veclen, src1-src2-dest hi/lo
  54   \end{itemize}
  55 }
  56
  57 \frame{\frametitle{Quick refresher on RVV}
  58
  59  \begin{itemize}
  60    \item Extremely powerful (extensible to 256 registers)\vspace{10pt}
  61    \item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
  62    \item Requires a separate Register File\vspace{10pt}
  63    \item Can be implemented as a separate pipeline\vspace{10pt}
  64   \end{itemize}
  65   However...\vspace{10pt}
  66    \begin{itemize}
  67    \item 98 percent opcode duplication with rest of RV (CLIP)
  68    \item Extending RVV requires customisation not just of h/w:\\
  69              gcc and s/w also need customisation (and maintenance)
  70   \end{itemize}
  71 }
  72
  73
  74 \frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}
  75
  76  \begin{itemize}
  77    \item Why?
  78          Implementors need flexibility in vectorisation to optimise for
  79          area or performance depending on the scope:
  80              embedded DSP, Mobile GPUs, Server CPUs and more.\vspace{4pt}\\
  81                  Compilers also need flexibility in vectorisation to optimise for cost
  82                  of pipeline setup, amount of state to context switch
  83                  and software portability\vspace{4pt}
  84    \item How?
  85              By implicitly marking INT/FP regs as ``Vectorised'',\\
  86              SV expresses how existing instructions should act
  87              on [contiguous] blocks of registers, in parallel.\vspace{4pt}
  88    \item What?
  89                  Simple-V is an ``API'' that implicitly extends
  90                  existing (scalar) instructions with explicit parallelisation.
  91   \end{itemize}
  92 }
  93
  94
  95 \frame{\frametitle{What's the value of SV? Why adopt it even in non-V?}
  96
  97  \begin{itemize}
  98    \item memcpy becomes much smaller (higher bang-per-buck)\vspace{10pt}
  99    \item context-switch (LOAD/STORE multiple): 1-2 instructions\vspace{10pt}
 100    \item greatly-reduced I-cache load (and fewer reads)\vspace{10pt}
 101    \item parallelisation of C further reduces I-cache (etc.)\vspace{10pt}
 102   \end{itemize}
 103   Note:\vspace{10pt}
 104    \begin{itemize}
 105    \item It's not just about Vectors: it's about instruction effectiveness
 106    \item Anything implementor is not interested in HW-optimising,\\
 107              let it fall through to exceptions (implement as a trap).
 108   \end{itemize}
 109 }
 110
 111
 112 \frame{\frametitle{How does Simple-V relate to RVV?}
 113
 114  \begin{itemize}
 115    \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
 116    \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
 117    \item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
 118    \item Even Compressed instructions become vectorised\vspace{10pt}
 119   \end{itemize}
 120   What Simple-V is not:\vspace{10pt}
 121    \begin{itemize}
 122    \item A full supercomputer-level Vector Proposal
 123    \item A replacement for RVV (SV is designed to be over-ridden\\
 124              by -- or augmented to become, or just be replaced by -- RVV)
 125   \end{itemize}
 126 }
 127
 128
 129 \frame{\frametitle{How is Parallelism abstracted in Simple-V?}
 130
 131  \begin{itemize}
 132    \item Register ``typing'' turns any op into an implicit Vector op\vspace{10pt}
 133    \item Primarily at the Instruction issue phase (except SIMD)\\
 134          Note: it's ok to pass predication through to ALU (like SIMD)
 135    \item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
 136   \end{itemize}
 137   Notes:\vspace{6pt}
 138    \begin{itemize}
 139    \item All LOAD/STORE (inc. Compressed, Int/FP versions)
 140    \item All ALU ops (soft / hybrid / full HW, on per-op basis)
 141    \item All branches become predication targets (C.FNE added)
 142    \item C.MV of particular interest (s/v, v/v, v/s)
 143   \end{itemize}
 144 }
 145
 146
 147 \frame{\frametitle{Implementation Options}
 148
 149  \begin{itemize}
 150    \item Absolute minimum: Exceptions (if CSRs indicate ``V'', trap)
 151    \item Hardware loop, single-instruction issue\\
 152                  (Do / Don't send through predication to ALU)
 153    \item Hardware loop, parallel (multi-instruction) issue\\
 154                  (Do / Don't send through predication to ALU)
 155    \item Hardware loop, full parallel ALU (not recommended)
 156   \end{itemize}
 157   Notes:\vspace{6pt}
 158   \begin{itemize}
 159    \item 4 (or more?) options above may be deployed on per-op basis
 160    \item SIMD always sends predication bits through to ALU
 161    \item Minimum MVL MUST be sufficient to cover regfile LD/ST
 162    \item Instr. FIFO may repeatedly split off N scalar ops at a time
 163   \end{itemize}
 164 }
 165 % Instr. FIFO may need its own slide.  Basically, the vectorised op
 166 % gets pushed into the FIFO, where it is then "processed".  Processing
 167 % will remove the first set of ops from its vector numbering (taking
 168 % predication into account) and shove them **BACK** into the FIFO,
 169 % while MODIFYING the remaining "vectorised" op, subtracting the now
 170 % scalar ops from it.
 171
 172 \frame{\frametitle{How are SIMD Instructions Vectorised?}
 173
 174  \begin{itemize}
 175    \item SIMD ALU(s) primarily unchanged\vspace{10pt}
 176    \item Predication is added to each SIMD element (NO ZEROING!)\vspace{10pt}
 177    \item End of Vector enables predication (NO ZEROING!)\vspace{10pt}
 178   \end{itemize}
 179   Considerations:\vspace{10pt}
 180    \begin{itemize}
 181    \item Many SIMD ALUs possible (parallel execution)\vspace{10pt}
 182    \item Very long SIMD ALUs could waste die area (short vectors)\vspace{10pt}
 183    \item Implementor free to choose (API remains the same)\vspace{10pt}
 184   \end{itemize}
 185 }
 186 % With multiple SIMD ALUs at for example 32-bit wide they can be used
 187 % to either issue 64-bit or 128-bit or 256-bit wide SIMD operations
 188 % or they can be used to cover several operations on totally different
 189 % vectors / registers.
 190
 191 \frame{\frametitle{What's the deal / juice / score?}
 192
 193  \begin{itemize}
 194    \item Standard Register File(s) overloaded with ``vector span''\vspace{10pt}
 195    \item Element width and type concepts remain same as RVV\vspace{10pt}
 196    \item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
 197   \end{itemize}
 198   Key differences from RVV:\vspace{10pt}
 199    \begin{itemize}
 200    \item Predication in INT regs as a BIT field (max VL=XLEN)
 201    \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)
 202    \item SV may condense sparse Vecs: RVV lets ALU do predication
 203    \item NO ZEROING: non-predicated elements are skipped
 204   \end{itemize}
 205 }
 206
 207
 208 \frame{\frametitle{Why are overlaps allowed in Regfiles?}
 209
 210  \begin{itemize}
 211    \item Same register(s) can have multiple ``interpretations''\vspace{10pt}
 212    \item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops\vspace{10pt}
 213    \item (32-bit GREV plus 4x8-bit SIMD plus 32-bit GREV)\vspace{10pt}
 214    \item Same register(s) can be offset (no need for VSLIDE)\vspace{10pt}
 215   \end{itemize}
 216   Note:\vspace{10pt}
 217    \begin{itemize}
 218    \item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$) \vspace{10pt}
 219    \item Hi-Performance: Macro-op fusion (more pipeline stages?)\vspace{10pt}
 220   \end{itemize}
 221 }
 222
 223
 224 \frame{\frametitle{Why no Zeroing (place zeros in non-predicated elements)?}
 225
 226  \begin{itemize}
 227    \item Zeroing is an implementation optimisation favouring OoO\vspace{8pt}
 228    \item Simple implementations may skip non-predicated operations\vspace{8pt}
 229    \item Simple implementations explicitly have to destroy data\vspace{8pt}
 230    \item Complex implementations may use reg-renames to save power\\
 231              Zeroing on predication chains makes optimisation harder
 232   \end{itemize}
 233   Considerations:\vspace{10pt}
 234   \begin{itemize}
 235    \item Complex not really impacted, Simple impacted a LOT
 236    \item Overlapping ``Vectors'' may issue overlapping ops
 237    \item Please don't use Vectors for ``security'' (use Sec-Ext)
 238   \end{itemize}
 239 }
 240 % with overlapping "vectors" - bearing in mind that "vectors" are
 241 % just a remap onto the standard register file, if the top bits of
 242 % predication are zero, and there happens to be a second vector
 243 % that uses some of the same register file that happens to be
 244 % predicated out, the second vector op may be issued *at the same time*
 245 % if there are available parallel ALUs to do so.
 246
 247
 248 \frame{\frametitle{Predication key-value CSR store}
 249
 250  \begin{itemize}
 251    \item key is int regfile number or FP regfile number (1 bit)\vspace{10pt}
 252    \item register to be predicated if referred to (5 bits, key)\vspace{10pt}
 253    \item register to store actual predication in (5 bits, value)\vspace{10pt}
 254    \item predication is inverted (1 bit)\vspace{10pt}
 255   \end{itemize}
 256   Notes:\vspace{10pt}
 257    \begin{itemize}
 258    \item Table should be expanded out for high-speed implementations
 259    \item Multiple ``keys'' (and values) theoretically permitted
 260    \item RVV rules about deleting higher-indexed CSRs followed
 261   \end{itemize}
 262 }
 263
 264
 265 \frame{\frametitle{Register key-value CSR store}
 266
 267  \begin{itemize}
 268    \item key is int regfile number or FP regfile number (1 bit)\vspace{10pt}
 269    \item register to be predicated if referred to (5 bits, key)\vspace{10pt}
 270    \item register to store actual predication in (5 bits, value)\vspace{10pt}
 271    \item TODO\vspace{10pt}
 272   \end{itemize}
 273   Notes:\vspace{10pt}
 274    \begin{itemize}
 275    \item Table should be expanded out for high-speed implementations
 276    \item Multiple ``keys'' (and values) theoretically permitted
 277    \item RVV rules about deleting higher-indexed CSRs followed
 278   \end{itemize}
 279 }
 280
 281
 282 \begin{frame}[fragile]
 283 \frametitle{ADD pseudocode (or trap, or actual hardware loop)}
 284
 285 \begin{semiverbatim}
 286 function op_add(rd, rs1, rs2, predr) # add not VADD!
 287   int i, id=0, irs1=0, irs2=0;
 288   for (i=0; i < MIN(VL, vectorlen[rd]); i++)
 289     if (ireg[predr] & 1<<i) # predication uses intregs
 290        ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
 291     if (reg_is_vectorised[rd]) \{ id += 1; \}
 292     if (reg_is_vectorised[rs1]) \{ irs1 += 1; \}
 293     if (reg_is_vectorised[rs2]) \{ irs2 += 1; \}
 294 \end{semiverbatim}
 295
 296   \begin{itemize}
 297    \item SIMD slightly more complex (case above is elwidth = default)
 298    \item Scalar-scalar and scalar-vector and vector-vector now all in one
 299    \item OoO may choose to push ADDs into instr. queue (v. busy!)
 300   \end{itemize}
 301 \end{frame}
 302
 303 \begin{frame}[fragile]
 304 \frametitle{Predication-Branch (or trap, or actual hardware loop)}
 305
 306 \begin{semiverbatim}
 307 s1 = vectorlen[src1] > 1;
 308 s2 = vectorlen[src2] > 1;
 309 for (int i = 0; i < VL; ++i)
 310    preg[rs3] |= cmp(s1 ? reg[src1+i] : reg[src1],
 311                     s2 ? reg[src2+i] : reg[src2]) << i;
 312 \end{semiverbatim}
 313
 314   \begin{itemize}
 315    \item SIMD slightly more complex (case above is elwidth = default)
 316    \item If s1 and s2 both scalars, Standard branch occurs
 317    \item Predication stored in integer regfile as a bitfield
 318    \item Scalar-vector and vector-vector supported
 319   \end{itemize}
 320 \end{frame}
 321
 322 \begin{frame}[fragile]
 323 \frametitle{VLD/VLD.S/VLD.X (or trap, or actual hardware loop)}
 324
 325 \begin{semiverbatim}
 326 if (unit-strided) stride = elsize;
 327 else stride = areg[as2]; // constant-strided
 328 for (int i = 0; i < VL; ++i)
 329   if (preg_enabled[rd] && ([!]preg[rd] & 1<<i))
 330     for (int j = 0; j < seglen+1; j++)
 331       if (vectorised[rs2]) offs = vreg[rs2][i]
 332       else offs = i*(seglen+1)*stride;
 333       vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
 334 \end{semiverbatim}
 335
 336   \begin{itemize}
 337    \item Again: SIMD slightly more complex
 338    \item rs2 vectorised taken to implicitly indicate VLD.X
 339   \end{itemize}
 340 \end{frame}
 341
 342
 343 \frame{\frametitle{C.MV extremely flexible!}
 344
 345  \begin{itemize}
 346    \item scalar-to-vector (w/no pred): VSPLAT
 347    \item scalar-to-vector (w/dest-pred): Sparse VSPLAT
 348    \item scalar-to-vector (w/single dest-pred): VINSERT
 349    \item vector-to-scalar (w/src-pred): VEXTRACT
 350    \item vector-to-vector (w/no pred): Vector Copy
 351    \item vector-to-vector (w/src xor dest pred): Sparse Vector Copy
 352    \item vector-to-vector (w/src and dest pred): Vector Gather/Scatter
 353   \end{itemize}
 354   \vspace{8pt}
 355   Notes:\vspace{10pt}
 356    \begin{itemize}
 357    \item Really powerful!
 358    \item Any other options?
 359   \end{itemize}
 360 }
 361
 362
 363 \frame{\frametitle{Opcodes, compared to RVV}
 364
 365  \begin{itemize}
 366    \item All integer and FP opcodes removed (no CLIP!)\vspace{8pt}
 367    \item VMPOP, VFIRST etc. all removed (use xBitManip)\vspace{8pt}
 368    \item VSLIDE removed (use regfile overlaps)\vspace{8pt}
 369    \item C.MV covers VEXTRACT VINSERT and VSPLAT (and more)\vspace{8pt}
 370    \item VSETVL, VGETVL, VSELECT stay\vspace{8pt}
 371    \item Issue: VCLIP is not in RV* (add with custom ext?)\vspace{8pt}
 372    \item Vector (or scalar-vector) use C.MV (MV is a pseudo-op)\vspace{8pt}
 373    \item VMERGE: twin predicated C.MVs (one inverted. macro-op'd)\vspace{8pt}
 374   \end{itemize}
 375 }
 376
 377
 378 \frame{\frametitle{Under consideration}
 379
 380  \begin{itemize}
 381    \item Is C.FNE actually needed? Should it be added if it is?
 382    \item Is detection of all-scalar ops ok (without slowing pipeline)?
 383    \item Can VSELECT be removed? (it's really complex)
 384    \item Can CLIP be done as a CSR (mode, like elwidth)?
 385    \item SIMD saturation (etc.) also set as a mode?
 386    \item C.MV src predication no different from dest predication\\
 387          What to do? Make one have different meaning?
 388    \item 8/16-bit ops: is it worthwhile adding a ``start offset''?\\
 389          (a bit like misaligned addressing... for registers)\\
 390          or just use predication to skip start?
 391   \end{itemize}
 392 }
 393
 394
 395 \frame{\frametitle{Summary}
 396
 397  \begin{itemize}
 398    \item Designed for simplicity (graded levels of complexity)\vspace{10pt}
 399    \item Fits RISC-V ethos: do more with less\vspace{10pt}
 400    \item Reduces SIMD ISA proliferation by 3-4 orders of magnitude \\
 401              (without SIMD downsides or sacrificing speed trade-off)\vspace{10pt}
 402    \item Covers 98\% of RVV, allows RVV to fit ``on top''\vspace{10pt}
 403    \item Huge range of implementor freedom and flexibility\vspace{10pt}
 404    \item Not designed for supercomputing (that's RVV), designed for
 405          in between: DSPs, RV32E, Embedded 3D GPUs etc.\vspace{10pt}
 406   \end{itemize}
 407 }
 408
 409
 410 \frame{\frametitle{slide}
 411
 412  \begin{itemize}
 413    \item \vspace{10pt}
 414   \end{itemize}
 415   Considerations:\vspace{10pt}
 416   \begin{itemize}
 417    \item \vspace{10pt}
 418   \end{itemize}
 419 }
 420
 421
 422 \frame{\frametitle{Including a plot}
 423  \begin{center}
 424 %  \includegraphics[height=2in]{dental.ps}\\
 425   {\bf \red Dental trajectories for 27 children:}
 426  \end{center}
 427 }
 428
 429 \frame{\frametitle{Creating .pdf slides in WinEdt}
 430
 431  \begin{itemize}
 432    \item LaTeX [Shift-Control-L]\vspace{10pt}
 433    \item dvi2pdf [click the button]\vspace{24pt}
 434   \end{itemize}
 435   To print 4 slides per page in acrobat click\vspace{10pt}
 436    \begin{itemize}
 437    \item File/print/properties\vspace{10pt}
 438    \item Change ``pages per sheet'' to 4\vspace{10pt}
 439   \end{itemize}
 440 }
 441
 442 \frame{
 443   \begin{center}
 444     {\Huge \red The end}
 445   \end{center}
 446 }
 447
 448
 449 \end{document}