From 2fa2b6dabc706b5dfbd56336304a0382ad258698 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Wed, 13 Jun 2018 05:27:35 +0100 Subject: [PATCH] update --- simple_v_extension.mdwn | 83 ++++++++++++++++++++ simple_v_extension/simple_v_chennai_2018.tex | 18 ++--- 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/simple_v_extension.mdwn b/simple_v_extension.mdwn index 92b616e5c..0bfa39ee5 100644 --- a/simple_v_extension.mdwn +++ b/simple_v_extension.mdwn @@ -1068,6 +1068,89 @@ Similar rules apply to the destination register. * Throw an exception. Whether that actually results in spawning threads as part of the trap-handling remains to be seen. +# Under consideration + +From the Chennai 2018 slides the following issues were raised. +Efforts to analyse and answer these questions are below. + +* Should future extra bank be included now? +* How many Register and Predication CSRs should there be? + (and how many in RV32E) +* How many in M-Mode (for doing context-switch)? +* Should use of registers be allowed to "wrap" (x30 x31 x1 x2)? +* Can CLIP be done as a CSR (mode, like elwidth) +* SIMD saturation (etc.) also set as a mode? +* Include src1/src2 predication on Comparison Ops? + (same arrangement as C.MV, with same flexibility/power) +* 8/16-bit ops is it worthwhile adding a "start offset"? + (a bit like misaligned addressing... for registers) + or just use predication to skip start? + +## Future (extra) bank be included (made mandatory) + +The implications of expanding the *standard* register file from +32 entries per bank to 64 per bank is quite an extensive architectural +change. Also it has implications for context-switching. + +Therefore, on balance, it is not recommended and certainly should +not be made a *mandatory* requirement for the use of SV. SV's design +ethos is to be minimally-disruptive for implementors to shoe-horn +into an existing design. + +## How large should the Register and Predication CSR key-value stores be? 
+ +This is something that definitely needs actual evaluation and for +code to be run and the results analysed. At the time of writing +(12jun2018) that is too early to tell. An approximate best-guess +however would be 16 entries. + +RV32E however is a special case, given that it is highly unlikely +(but not outside the realm of possibility) that it would be used +for performance reasons but instead for reducing instruction count. +The number of CSR entries therefore has to be considered extremely +carefully. + +## How many CSR entries in M-Mode or S-Mode (for context-switching)? + +The minimum required CSR entries would be 1 for each register-bank: +one for integer and one for floating-point. However, as shown +in the "Context Switch Example" section, for optimal efficiency +(minimal instructions in a low-latency situation) the CSRs for +the context-switch should be set up *and left alone*. + +This means that it is not really a good idea to touch the CSRs +used for context-switching in the M-Mode (or S-Mode) trap, so +if there is ever demonstrated a need for vectors then there would +need to be *at least* one more free. However just one does not make +much sense (as one only covers scalar-vector ops) so it is more +likely that at least two extra would be needed. + +This is *in addition* - in the RV32E case - if an RV32E implementation +happens also to support U/S/M modes. This would be considered quite +rare but not outside of the realm of possibility. + +Conclusion: all needs careful analysis and future work. + +## Should use of registers be allowed to "wrap" (x30 x31 x1 x2)? + +TBD + +## Can CLIP be done as a CSR (mode, like elwidth) + +TBD + +## SIMD saturation (etc.) also set as a mode? + +TBD + +## Include src1/src2 predication on Comparison Ops? + +TBD + +## 8/16-bit ops is it worthwhile adding a "start offset"? 
+ +TBD + # Impementing V on top of Simple-V With Simple-V converting the original RVV draft concept-for-concept diff --git a/simple_v_extension/simple_v_chennai_2018.tex b/simple_v_extension/simple_v_chennai_2018.tex index 33b2a3fb8..cb10e9559 100644 --- a/simple_v_extension/simple_v_chennai_2018.tex +++ b/simple_v_extension/simple_v_chennai_2018.tex @@ -70,7 +70,7 @@ \item Extending RVV requires customisation not just of h/w:\\ gcc, binutils also need customisation (and maintenance) \item Fascinatingly, despite being a SIMD-variant, RVV only has - O(1) opcode proliferation! (extremely well designed) + O(N) opcode proliferation! (extremely well designed) \end{itemize} } @@ -301,7 +301,6 @@ for (int i = 0; i < VL; ++i) \begin{semiverbatim} struct vectorised fp\_vec[32], int\_vec[32]; // 64 in future - for (i = 0; i < 16; i++) // 16 CSRs? tb = int\_vec if CSRvec[i].type == 0 else fp\_vec idx = CSRvec[i].regkey // INT/FP src/dst reg in opcode @@ -310,6 +309,7 @@ for (i = 0; i < 16; i++) // 16 CSRs? tb[idx].isvector = CSRvec[i].isvector tb[idx].packed = CSRvec[i].packed // SIMD or not tb[idx].bank = CSRvec[i].bank // 0 (1=rsvd) + tb[idx].enabled = true \end{semiverbatim} \begin{itemize} @@ -344,7 +344,6 @@ for (i = 0; i < 16; i++) // 16 CSRs? \begin{semiverbatim} struct pred fp\_pred[32], int\_pred[32]; // 64 in future - for (i = 0; i < 16; i++) // 16 CSRs? tb = int\_pred if CSRpred[i].type == 0 else fp\_pred idx = CSRpred[i].regkey @@ -356,7 +355,8 @@ for (i = 0; i < 16; i++) // 16 CSRs? \end{semiverbatim} \begin{itemize} - \item All 32 int and 32 FP entries zero'd before setting + \item All 32 int and 32 FP entries zero'd before setting\\ + (predication disabled) \item Might be a bit complex to set up in hardware (keep as CAM?) \end{itemize} @@ -524,7 +524,7 @@ function op\_add(rd, rs1, rs2) # add not VADD! 
\frame{\frametitle{Why are overlaps allowed in Regfiles?} \begin{itemize} - \item Same register(s) can have multiple "interpretations" + \item Same target register(s) can have multiple "interpretations" \item CSRs are costly to write to (do it once) \item Set "real" register (scalar) without needing to set/unset CSRs. \item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops @@ -536,7 +536,7 @@ function op\_add(rd, rs1, rs2) # add not VADD! \end{itemize} Note: \begin{itemize} - \item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$) + \item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$) on its own. \item Hi-Performance: Macro-op fusion (more pipeline stages?) \end{itemize} } @@ -550,8 +550,8 @@ function op\_add(rd, rs1, rs2) # add not VADD! \item scalar-to-vector (w/ 1-bit dest-pred): VINSERT \item vector-to-scalar (w/ [1-bit?] src-pred): VEXTRACT \item vector-to-vector (w/ no pred): Vector Copy - \item vector-to-vector (w/ src pred): Vector Gather - \item vector-to-vector (w/ dest pred): Vector Scatter + \item vector-to-vector (w/ src pred): Vector Gather (inc. VSLIDE) + \item vector-to-vector (w/ dest pred): Vector Scatter (inc. VSLIDE) \item vector-to-vector (w/ src \& dest pred): Vector Gather/Scatter \end{itemize} \vspace{4pt} @@ -686,7 +686,7 @@ loop: CSRvect1 = \{type: F, key: a3, val: a3, elwidth: dflt\} CSRvect2 = \{type: F, key: a7, val: a7, elwidth: dflt\} loop: - setvl t0, a0, 4 # vl = t0 = min(4, n) + setvl t0, a0, 4 # vl = t0 = min(min(mvl, 4), n) ld a3, a1 # load 4 registers a3-6 from x slli t1, t0, 3 # t1 = vl * 8 (in bytes) ld a7, a2 # load 4 registers a7-10 from y -- 2.30.2