X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=simple_v_extension.mdwn;h=e422e11f7c8b38c33b20642e31586ee7dc8b652b;hb=3c2af6f3a70d20b0cb777cf75f8294511de0037e;hp=423f49fead1def110e988aef5aa2a511e8ead7fc;hpb=2d1ef4f0b3b22b1662d07cc068201e5c1b0aff48;p=libreriscv.git

diff --git a/simple_v_extension.mdwn b/simple_v_extension.mdwn
index 423f49fea..e422e11f7 100644
--- a/simple_v_extension.mdwn
+++ b/simple_v_extension.mdwn
@@ -1,14 +1,5 @@
 # Variable-width Variable-packed SIMD / Simple-V / Parallelism Extension Proposal
 
-* TODO 23may2018: CSR-CAM-ify regfile tables
-* TODO 23may2018: zero-mark predication CSR
-* TODO 28may2018: sort out VSETVL: CSR length to be removed?
-* TODO 09jun2018: Chennai Presentation more up-to-date
-* TODO 09jun2019: elwidth only 4 values (dflt, dflt/2, 8, 16)
-* TODO 09jun2019: extra register banks (future option)
-* TODO 09jun2019: new Reg CSR table (incl. packed=Y/N)
-
-
 Key insight: Simple-V is intended as an abstraction layer to provide
 a consistent "API" to parallelisation of existing *and future* operations.
 *Actual* internal hardware-level parallelism is *not* required, such
@@ -18,8 +9,10 @@ instruction queue (FIFO), pending execution.
 
 *Actual* parallelism, if added independently of Simple-V in the form
 of Out-of-order restructuring (including parallel ALU lanes) or VLIW
-implementations, or SIMD, or anything else, would then benefit *if*
-Simple-V was added on top.
+implementations, or SIMD, or anything else, would then benefit from
+the uniformity of a consistent API.
+
+Talk slides: <http://hands.com/~lkcl/simple_v_chennai_2018.pdf>
 
 [[!toc ]]
 
@@ -1068,7 +1061,7 @@ Similar rules apply to the destination register.
 * Throw an exception.  Whether that actually results in spawning threads
   as part of the trap-handling remains to be seen.
 
-# Under consideration
+# Under consideration <a name="issues"></a>
 
 From the Chennai 2018 slides the following issues were raised.
 Efforts to analyse and answer these questions are below.
@@ -1511,37 +1504,35 @@ the question is asked "How can each of the proposals effectively implement
 
 ### Example Instruction translation: <a name="example_translation"></a>
 
-Instructions "ADD r2 r4 r4" would result in three instructions being
-generated and placed into the FIFO:
+With VL set to 3, the instruction "ADD r7 r4 r4" would result in three instructions
+being generated and placed into the FIFO.  r7 and r4 are marked as "vectorised":
 
-* ADD r2 r4 r4
-* ADD r2 r5 r5
-* ADD r2 r6 r6
+* ADD r7 r4 r4
+* ADD r8 r5 r5
+* ADD r9 r6 r6
+
+With VL set to 3, the instruction "ADD r7 r4 r1" would result in three instructions
+being generated and placed into the FIFO.  r7 and r1 are marked as "vectorised"
+whilst r4 is not:
+
+* ADD r7 r4 r1
+* ADD r8 r4 r2
+* ADD r9 r4 r3
 
 ## Example of vector / vector, vector / scalar, scalar / scalar => vector add
 
-    register CSRvectorlen[XLEN][4]; # not quite decided yet about this one...
-    register CSRpredicate[XLEN][4]; # 2^4 is max vector length
-    register CSRreg_is_vectorised[XLEN]; # just for fun support scalars as well
-    register x[32][XLEN];
-
-    function op_add(rd, rs1, rs2, predr)
-    {
-    Â  Â /* note that this is ADD, not PADD */
-    Â  Â int i, id, irs1, irs2;
-    Â  Â # checks CSRvectorlen[rd] == CSRvectorlen[rs] etc. ignored
-    Â  Â # also destination makes no sense as a scalar but what the hell...
-    Â  Â for (i = 0, id=0, irs1=0, irs2=0; i<CSRvectorlen[rd]; i++)
-    Â  Â  Â  if (CSRpredicate[predr][i]) # i *think* this is right...
-    Â  Â  Â  Â  Â x[rd+id] <= x[rs1+irs1] + x[rs2+irs2];
-    Â  Â  Â  # now increment the idxs
-    Â  Â  Â  if (CSRreg_is_vectorised[rd]) # bitfield check rd, scalar/vector?
-    Â  Â  Â  Â  Â id += 1;
-    Â  Â  Â  if (CSRreg_is_vectorised[rs1]) # bitfield check rs1, scalar/vector?
-    Â  Â  Â  Â  Â irs1 += 1;
-    Â  Â  Â  if (CSRreg_is_vectorised[rs2]) # bitfield check rs2, scalar/vector?
-    Â  Â  Â  Â  Â irs2 += 1;
-    }
+    function op_add(rd, rs1, rs2) # add not VADD!
+      int i, id=0, irs1=0, irs2=0;
+      rd  = int_vec[rd ].isvector ? int_vec[rd ].regidx : rd;
+      rs1 = int_vec[rs1].isvector ? int_vec[rs1].regidx : rs1;
+      rs2 = int_vec[rs2].isvector ? int_vec[rs2].regidx : rs2;
+      predval = get_pred_val(FALSE, rd);
+      for (i = 0; i < VL; i++)
+        if (predval & 1<<i) # predication uses intregs
+           ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
+        if (int_vec[rd ].isvector)  { id += 1; }
+        if (int_vec[rs1].isvector)  { irs1 += 1; }
+        if (int_vec[rs2].isvector)  { irs2 += 1; }
 
 ## Retro-fitting Predication into branch-explicit ISA <a name="predication_retrofit"></a>
 
@@ -2353,3 +2344,5 @@ TBD: floating-point compare and other exception handling
 * <http://www.ece.ubc.ca/~lemieux/publications/severance-fpga2015.pdf>
 * Full Description (last page) of RVV instructions
   <https://inst.eecs.berkeley.edu/~cs152/sp18/handouts/lab4-1.0.pdf>
+* PULP Low-energy Cluster Vector Processor
+  <http://iis-projects.ee.ethz.ch/index.php/Low-Energy_Cluster-Coupled_Vector_Coprocessor_for_Special-Purpose_PULP_Acceleration>