(no commit message)

[libreriscv.git] / simple_v_extension.mdwn
diff --git a/simple_v_extension.mdwn b/simple_v_extension.mdwn

index 99ae03031cdf81bf80a89a6d9e58dd98027effcb..e422e11f7c8b38c33b20642e31586ee7dc8b652b 100644 (file)
--- a/simple_v_extension.mdwn
+++ b/simple_v_extension.mdwn
@@ -1,14 +1,5 @@
  # Variable-width Variable-packed SIMD / Simple-V / Parallelism Extension Proposal
  
  # Variable-width Variable-packed SIMD / Simple-V / Parallelism Extension Proposal
  
-* TODO 23may2018: CSR-CAM-ify regfile tables
-* TODO 23may2018: zero-mark predication CSR
-* TODO 28may2018: sort out VSETVL: CSR length to be removed?
-* TODO 09jun2018: Chennai Presentation more up-to-date
-* TODO 09jun2019: elwidth only 4 values (dflt, dflt/2, 8, 16)
-* TODO 09jun2019: extra register banks (future option)
-* TODO 09jun2019: new Reg CSR table (incl. packed=Y/N)
-
-
  Key insight: Simple-V is intended as an abstraction layer to provide
  a consistent "API" to parallelisation of existing *and future* operations.
  *Actual* internal hardware-level parallelism is *not* required, such
  Key insight: Simple-V is intended as an abstraction layer to provide
  a consistent "API" to parallelisation of existing *and future* operations.
  *Actual* internal hardware-level parallelism is *not* required, such
@@ -18,8 +9,10 @@ instruction queue (FIFO), pending execution.
  
  *Actual* parallelism, if added independently of Simple-V in the form
  of Out-of-order restructuring (including parallel ALU lanes) or VLIW
  
  *Actual* parallelism, if added independently of Simple-V in the form
  of Out-of-order restructuring (including parallel ALU lanes) or VLIW
-implementations, or SIMD, or anything else, would then benefit *if*
-Simple-V was added on top.
+implementations, or SIMD, or anything else, would then benefit from
+the uniformity of a consistent API.
+
+Talk slides: <http://hands.com/~lkcl/simple_v_chennai_2018.pdf>
  
  [[!toc ]]
  
  
  [[!toc ]]
  
@@ -1068,7 +1061,7 @@ Similar rules apply to the destination register.
  * Throw an exception.  Whether that actually results in spawning threads
    as part of the trap-handling remains to be seen.
  
  * Throw an exception.  Whether that actually results in spawning threads
    as part of the trap-handling remains to be seen.
  
-# Under consideration
+# Under consideration <a name="issues"></a>
  
  From the Chennai 2018 slides the following issues were raised.
  Efforts to analyse and answer these questions are below.
  
  From the Chennai 2018 slides the following issues were raised.
  Efforts to analyse and answer these questions are below.
@@ -1191,7 +1184,44 @@ It is quite complex, in other words, and needs careful consideration.
  
  ## 8/16-bit ops is it worthwhile adding a "start offset"?
  
  
  ## 8/16-bit ops is it worthwhile adding a "start offset"?
  
-TBD
+The idea here is to make it possible, particularly in a "Packed SIMD"
+case, to avoid unaligned Load/Store operations by specifying that
+operations, instead of being carried out element-for-element, are
+offset by a fixed amount *even* in 8 and 16-bit element Packed SIMD
+cases.
+
+For example, rather than take two 32-bit registers, each divided into four
+8-bit elements, and have them ADDed element-for-element as follows:
+
+    r3[0] = add r4[0], r6[0]
+    r3[1] = add r4[1], r6[1]
+    r3[2] = add r4[2], r6[2]
+    r3[3] = add r4[3], r6[3]
+
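+an offset of 1 (applied here to the first source operand) would instead
+result in the following four operations, with the last element spilling
+over into the next register: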
+
+    r3[0] = add r4[1], r6[0]
+    r3[1] = add r4[2], r6[1]
+    r3[2] = add r4[3], r6[2]
+    r3[3] = add r5[0], r6[3]
+
+In non-packed-SIMD mode there is no benefit at all, as a vector may
+be created using a different CSR that has the offset built-in.  So this
+leaves just the packed-SIMD case to consider.
+
+Two ways in which this could be implemented / emulated (without special
+hardware):
+
+* bit-manipulation that shuffles the data along by one byte (or one word)
+  either prior to or as part of the operation requiring the offset.
+* just use an unaligned Load/Store sequence, even if there are performance
+  penalties for doing so.
+
+The question then is whether avoiding that performance hit is worth the extra
+hardware needed to byte-shuffle/shift the data by an arbitrary offset.  On
+balance, given that there are two reasonable instruction-based options, the
+hardware-offset option should be left out of the initial version of SV,
+with the option to consider it in an "advanced" version of the specification.
  
  # Implementing V on top of Simple-V
  
  
  # Implementing V on top of Simple-V
  
@@ -1474,37 +1504,35 @@ the question is asked "How can each of the proposals effectively implement
  
  ### Example Instruction translation: <a name="example_translation"></a>
  
  
  ### Example Instruction translation: <a name="example_translation"></a>
  
-Instructions "ADD r2 r4 r4" would result in three instructions being
-generated and placed into the FIFO:
+The instruction "ADD r7 r4 r4" would result in three instructions being
+generated and placed into the FIFO.  r7 and r4 are marked as "vectorised":
  
  
-* ADD r2 r4 r4
-* ADD r2 r5 r5
-* ADD r2 r6 r6
+* ADD r7 r4 r4
+* ADD r8 r5 r5
+* ADD r9 r6 r6
+
+The instruction "ADD r7 r4 r1" would result in three instructions being
+generated and placed into the FIFO.  r7 and r1 are marked as "vectorised"
+whilst r4 is not:
+
+* ADD r7 r4 r1
+* ADD r8 r4 r2
+* ADD r9 r4 r3
  
  ## Example of vector / vector, vector / scalar, scalar / scalar => vector add
  
  
  ## Example of vector / vector, vector / scalar, scalar / scalar => vector add
  
-    register CSRvectorlen[XLEN][4]; # not quite decided yet about this one...
-    register CSRpredicate[XLEN][4]; # 2^4 is max vector length
-    register CSRreg_is_vectorised[XLEN]; # just for fun support scalars as well
-    register x[32][XLEN];
-
-    function op_add(rd, rs1, rs2, predr)
-    {
-       /* note that this is ADD, not PADD */
-       int i, id, irs1, irs2;
-       # checks CSRvectorlen[rd] == CSRvectorlen[rs] etc. ignored
-       # also destination makes no sense as a scalar but what the hell...
-       for (i = 0, id=0, irs1=0, irs2=0; i<CSRvectorlen[rd]; i++)
-          if (CSRpredicate[predr][i]) # i *think* this is right...
-             x[rd+id] <= x[rs1+irs1] + x[rs2+irs2];
-          # now increment the idxs
-          if (CSRreg_is_vectorised[rd]) # bitfield check rd, scalar/vector?
-             id += 1;
-          if (CSRreg_is_vectorised[rs1]) # bitfield check rs1, scalar/vector?
-             irs1 += 1;
-          if (CSRreg_is_vectorised[rs2]) # bitfield check rs2, scalar/vector?
-             irs2 += 1;
-    }
+    function op_add(rd, rs1, rs2) # add not VADD!
+      int i, id=0, irs1=0, irs2=0;
+      rd  = int_vec[rd ].isvector ? int_vec[rd ].regidx : rd;
+      rs1 = int_vec[rs1].isvector ? int_vec[rs1].regidx : rs1;
+      rs2 = int_vec[rs2].isvector ? int_vec[rs2].regidx : rs2;
+      predval = get_pred_val(FALSE, rd);
+      for (i = 0; i < VL; i++)
+        if (predval & 1<<i) # predication uses intregs
+           ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
+        if (int_vec[rd ].isvector)  { id += 1; }
+        if (int_vec[rs1].isvector)  { irs1 += 1; }
+        if (int_vec[rs2].isvector)  { irs2 += 1; }
  
  ## Retro-fitting Predication into branch-explicit ISA <a name="predication_retrofit"></a>
  
  
  ## Retro-fitting Predication into branch-explicit ISA <a name="predication_retrofit"></a>
  
@@ -2316,3 +2344,5 @@ TBD: floating-point compare and other exception handling
  * <http://www.ece.ubc.ca/~lemieux/publications/severance-fpga2015.pdf>
  * Full Description (last page) of RVV instructions
    <https://inst.eecs.berkeley.edu/~cs152/sp18/handouts/lab4-1.0.pdf>
  * <http://www.ece.ubc.ca/~lemieux/publications/severance-fpga2015.pdf>
  * Full Description (last page) of RVV instructions
    <https://inst.eecs.berkeley.edu/~cs152/sp18/handouts/lab4-1.0.pdf>
+* PULP Low-energy Cluster Vector Processor
+  <http://iis-projects.ee.ethz.ch/index.php/Low-Energy_Cluster-Coupled_Vector_Coprocessor_for_Special-Purpose_PULP_Acceleration>