add slide

author Luke Kenneth Casson Leighton <lkcl@lkcl.net>

Tue, 22 May 2018 08:48:44 +0000 (09:48 +0100)

committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>

Tue, 22 May 2018 08:48:44 +0000 (09:48 +0100)
author Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Tue, 22 May 2018 08:48:44 +0000 (09:48 +0100)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Tue, 22 May 2018 08:48:44 +0000 (09:48 +0100)
diff --git a/simple_v_extension.mdwn b/simple_v_extension.mdwn

index 53f75f490cc9650b8fe575b60c41ca261d290bbf..51fe43d14b4a3629fbc9c51898ab63d0c94de1af 100644 (file)
--- a/simple_v_extension.mdwn
+++ b/simple_v_extension.mdwn
@@ -1774,6 +1774,36 @@ discussion then led to the question of OoO architectures
  > relevant, is that the imprecise model increases the size of the context
  > structure, as the microarchitectural guts have to be spilled to memory.)
  
+-----
+
+>> >  it just occurred to me that there's another reason why the data
+>> > should be left instead of zeroed.  if the standard register file is
+>> > used, such that vectorised operations are translated to mean "please
+>> > insert multiple register-contiguous operations into the instruction
+>> > FIFO" and predication is used to *skip* some of those, then if the
+>> > next "vector" operation uses the (standard) registers that were masked
+>> > *out* of the previous operation it may proceed without blocking.
+>> >
+>> >  if however zeroing is made mandatory then that optimisation becomes
+>> > flat-out impossible to deploy.
+>> >
+>> >  whilst i haven't thought through the full implications, i
+>> > suspect RVV might also be able to benefit by being able to fit more
+>> > overlapping operations into the available SRAM by doing something
+>> > similar.
+>
+>
+> Luke, this is called density time masking. It doesn’t apply only to your
+> model where the “standard register file” is used. It applies to any
+> architecture that attempts to speed up by skipping computation and writeback
+> of masked elements.
+>
+> That said, the writing of zeros need not be explicit. It is possible to add
+> a “zero bit” per element that, when set, forces a zero to be read from the
+> vector (although the underlying storage may have old data). In this case,
+> there may be a way to implement DTM as well.
+
+
  
  ## Implementation Paradigms <a name="implementation_paradigms"></a>
  
diff --git a/simple_v_extension/simple_v_chennai_2018.tex b/simple_v_extension/simple_v_chennai_2018.tex

index 0d0a47759c378d8bfd8bed2a4e43e0012b8afdba..114b67912ebe590702f7b59ce0437cb540544f36 100644 (file)
--- a/simple_v_extension/simple_v_chennai_2018.tex
+++ b/simple_v_extension/simple_v_chennai_2018.tex
@@ -28,7 +28,8 @@
   \begin{itemize}
     \item The Designers of RISC-V\vspace{15pt}
     \item The RVV Working Group and contributors\vspace{15pt}
-   \item Jacob Bachmeyer, Xan Phung, Chuanhua Chang and others\vspace{15pt}
+   \item Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
+            Guy Lemieux and others\vspace{15pt}
     \item ISA-Dev Group Members\vspace{10pt}
    \end{itemize}
  }
@@ -165,9 +166,10 @@
    \end{itemize}
    Key differences from RVV:\vspace{10pt}
     \begin{itemize}
-   \item Predication in INT regs as a BIT field (max VL=XLEN)\vspace{10pt}
-   \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)\vspace{10pt}
-   \item NO ZEROING: non-predicated elements are skipped\vspace{10pt}
+   \item Predication in INT regs as a BIT field (max VL=XLEN)
+   \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)
+   \item SV may condense sparse Vecs: RVV lets ALU do predication
+   \item NO ZEROING: non-predicated elements are skipped
    \end{itemize}
  }
author	Luke Kenneth Casson Leighton <lkcl@lkcl.net>
	Tue, 22 May 2018 08:48:44 +0000 (09:48 +0100)
committer	Luke Kenneth Casson Leighton <lkcl@lkcl.net>
	Tue, 22 May 2018 08:48:44 +0000 (09:48 +0100)
simple_v_extension.mdwn		patch \| blob \| history
simple_v_extension/simple_v_chennai_2018.tex		patch \| blob \| history