From bc1a22b23bcba46db75c4a4dc7a2fea69973961f Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Tue, 22 May 2018 22:23:58 +0100
Subject: [PATCH] feedback from rogier bruisse

---
 simple_v_extension/simple_v_chennai_2018.tex | 92 +++++++++++---------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/simple_v_extension/simple_v_chennai_2018.tex b/simple_v_extension/simple_v_chennai_2018.tex
index 114b67912..045c8713c 100644
--- a/simple_v_extension/simple_v_chennai_2018.tex
+++ b/simple_v_extension/simple_v_chennai_2018.tex
@@ -29,41 +29,13 @@
    \item The Designers of RISC-V\vspace{15pt}
    \item The RVV Working Group and contributors\vspace{15pt}
    \item Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
-	     Guy Lemurieux and others\vspace{15pt}
+	     Guy Lemurieux, Jonathan Neusch\"afer, Rogier Bruisse,
+	     and others\vspace{15pt}
    \item ISA-Dev Group Members\vspace{10pt}
   \end{itemize}
 }
 
 
-\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}
-
- \begin{itemize}
-   \item Vectorisation needs to fit (be useful within) an implementor's\\ 
-	     scope: RV32E, Embedded/Mobile, DSP, Servers and more.\vspace{15pt}
-   \item By implicitly marking INT/FP regs as "Vectorised",\\
-	     everything else follows from there.\vspace{15pt}
-   \item A Standard Vector "API" with flexibility for implementors:\\
-	     choice to optimise for area or performance as desired\vspace{10pt}
-  \end{itemize}
-}
-
-
-\frame{\frametitle{Why another Vector Extension?}
-
- \begin{itemize}
-   \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
-   \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
-   \item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
-   \item Even Compressed instructions become vectorised\vspace{10pt}
-  \end{itemize}
-  What Simple-V is not:\vspace{10pt}
-   \begin{itemize}
-   \item A full supercomputer-level Vector Proposal\vspace{10pt}
-   \item A replacement for RVV (designed to be augmented)\vspace{10pt}
-  \end{itemize}
-}
-
-
 \frame{\frametitle{Quick refresher on SIMD}
 
  \begin{itemize}
@@ -99,14 +71,53 @@
 }
 
 
-\frame{\frametitle{How is Parallelism abstracted?}
+\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}
+
+ \begin{itemize}
+   \item Why?
+         Implementors need flexibility in vectorisation to optimise for
+         area or performance depending on the scope:
+	     embedded DSP, Mobile GPUs, Server CPUs and more.\vspace{4pt}\\
+		 Compilers also need flexibility in vectorisation to optimise for cost 
+		 of pipeline setup, amount of state to context switch
+		 and software portability\vspace{4pt}
+   \item How?
+	     By implicitly marking INT/FP regs as ``Vectorised'':\\
+	     it expresses how existing instructions should act 
+	     on (contiguous) blocks of registers, in parallel.\vspace{4pt}
+   \item What?
+		 Simple-V is a vectorisation ``API'' that extends existing
+		 (scalar) instructions with explicit parallelisation. 
+  \end{itemize}
+}
+
+
+\frame{\frametitle{How does Simple-V relate to RVV?}
+
+ \begin{itemize}
+   \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
+   \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
+   \item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
+   \item Even Compressed instructions become vectorised\vspace{10pt}
+  \end{itemize}
+  What Simple-V is not:\vspace{10pt}
+   \begin{itemize}
+   \item A full supercomputer-level Vector Proposal
+   \item A replacement for RVV (SV is designed to be overridden\\
+	     by---or augmented to become---RVV)
+  \end{itemize}
+}
+
+
+\frame{\frametitle{How is Parallelism abstracted in Simple-V?}
 
  \begin{itemize}
    \item Register "typing" turns any op into an implicit Vector op\vspace{10pt}
-   \item Primarily at the Instruction issue phase (except SIMD)\vspace{10pt}
+   \item Primarily at the Instruction issue phase (except SIMD)\\
+         Note: it's OK to pass predication through to ALU (like SIMD)
    \item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
   \end{itemize}
-  Notes:\vspace{10pt}
+  Notes:\vspace{6pt}
    \begin{itemize}
    \item LOAD/STORE (inc. C.LD and C.ST, LD.X: everything)
    \item All ALU ops (soft / hybrid / full HW, on per-op basis)
@@ -119,14 +130,17 @@
 \frame{\frametitle{Implementation Options}
 
  \begin{itemize}
-   \item Absolute minimum: Exceptions (if CSRs indicate "V", trap)\vspace{10pt}
-   \item Hardware loop, single-instruction issue\vspace{10pt}
-   \item Hardware loop, parallel (multi-instruction) issue\vspace{10pt}
-   \item Hardware loop, full parallel ALU (not recommended)\vspace{10pt}
-  \end{itemize}
-  Notes:\vspace{10pt}
+   \item Absolute minimum: Exceptions (if CSRs indicate ``V'', trap)
+   \item Hardware loop, single-instruction issue\\
+		 (Do / Don't send through predication to ALU)
+   \item Hardware loop, parallel (multi-instruction) issue\\
+   		 (Do / Don't send through predication to ALU)
+   \item Hardware loop, full parallel ALU (not recommended)
+  \end{itemize}
+  Notes:\vspace{6pt}
   \begin{itemize}
    \item 4 (or more?) options above may be deployed on per-op basis
+   \item SIMD always sends predication bits through to ALU
    \item Minimum MVL MUST be sufficient to cover regfile LD/ST
    \item Instr. FIFO may repeatedly split off N scalar ops at a time
   \end{itemize}
-- 
2.30.2