From 7f45d3b51b2ae9fb1a00ee756bf70c443381b30d Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Tue, 13 Feb 2024 12:09:46 +0000 Subject: [PATCH] bug 676: noted a way to reduce the number of instructions --- .../fosdem2024_ddffirst.tex | 15 ++++++++++-- .../fosdem2024/fosdem2024_ddffirst/maxloc.s | 2 +- openpower/sv/cookbook/fortran_maxloc.mdwn | 23 +++++++++++-------- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex index fdbc83369..f196881e6 100644 --- a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex +++ b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex @@ -220,11 +220,22 @@ for (i = 0; i < VL; i++) \lstinputlisting[language={}]{maxloc.py} \begin{itemize} - \item "TODO + \item FORTRAN MAXLOC - find the index of the largest number + \item notoriously difficult to optimally implement for SIMD + \item algorithms include \textit{depth-first} recursive + descent (!) mapreduce-style, offsetting the + locally-computed largest index (plus value) which + are then tested in upper level(s) + \item SVP64 through Data-Dependent Fail-First can perform + each of the two key while-loop tests with + \textit{single instructions}. + \item There is however quite a bit of ``housekeeping''. + Full analysis: \\ + https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc \end{itemize} } -\frame{\frametitle{maxlocassembler} +\frame{\frametitle{maxloc assembler} \lstinputlisting[language={}]{maxloc.s} diff --git a/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s b/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s index f72de742a..2639c112d 100644 --- a/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s +++ b/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s @@ -8,7 +8,7 @@ mtcrf 128,0 # clear CR0 (in case VL=0?) 
# while (im): sv.minmax./ff=le/m=ge/mr 4,*10,4,1 # r4 accumulate crternlogi 0,1,2,127 # test >= (or VL=0) -sv.crand *19,*16,0 # clear if CR0.eq=0 +sv.crnand/m=lt/zz *19,*16,0 # SO=~LT, if CR0.eq=0 # nm = i: count masked bits. could use crweirds sv.svstep/mr/m=so 1,0,6,1 # get vector dststep sv.creqv *16,*16,*16 # set mask on already-tested diff --git a/openpower/sv/cookbook/fortran_maxloc.mdwn b/openpower/sv/cookbook/fortran_maxloc.mdwn index 5bfb537e6..232008ac9 100644 --- a/openpower/sv/cookbook/fortran_maxloc.mdwn +++ b/openpower/sv/cookbook/fortran_maxloc.mdwn @@ -126,12 +126,11 @@ setvl 2,0,4,0,1,1 # set MVL=4, VL=MIN(MVL,CTR) mtcrf 128, 0 # clear CR0 (in case VL=0?) # while (im): sv.minmax./ff=le/m=ge/mr 4,*10,4,1 # uses r4 as accumulator -crternlogi 0,1,2,127 # test greater/equal or VL=0 -sv.crand *19,*16,0 # clear if CR0.eq=0 +sv.crnand/m=lt/zz *19,*16,0 # SO=~LT, if CR0.eq=0 # nm = i (count masked bits. could use crweirds here TODO) sv.svstep/mr/m=so 1, 0, 6, 1 # svstep: get vector dststep sv.creqv *16,*16,*16 # set mask on already-tested -bc 12,0, -0x40 # CR0 lt bit clear, branch back +bc 12,0, -0x3c # CR0 lt bit clear, branch back ``` `sv.cmp` can be used in the first while loop because m (r4, the current @@ -171,12 +170,16 @@ by Vector Length (VL) being truncated - potentially even to zero! that happens then CR0 would be left in its previous state: a very much undesirable behaviour!) -`crternlogi 0,1,2,127` will combine the setting of CR0.EQ and CR0.LT -to give us a true Greater-than-or-equal, including under the circumstance -where VL=0. The `sv.crand` will then take a copy of the `i-in-unary` -mask, but only when CR0.EQ is set. This is why the third operand `BB` -is a Scalar not a Vector (BT=16/Vector, BA=19/Vector, BB=0/Scalar) -which effectively performs a broadcast-splat-ANDing, as follows: +`sv.crnand/m=lt/zz` is quite sophisticated - a lot is going on behind +the scenes. 
The effect is (through the NAND) to invert the Less-than +to give us a Greater-than-or-equal, including under the circumstance +where VL=0, but only when CR0.EQ is set. Note that the third operand +`BB` (CR0.EQ) is a *scalar*, but that zeroing is used here. Therefore +whenever the Vector of `LT` bits is zero, a zero is put into the +Vector `SO` result. In effect, the predication is being exploited +as a way to combine a third operand into what would otherwise be a +2-in 1-out Condition Register operation, making it effectively 3-in 1-out. +Obscure but effective! ``` CR4.SO = CR4.EQ AND CR0.EQ (if VL >= 1) @@ -194,7 +197,7 @@ the `crweird` suite of instructions, combined with `popcnt`. However there is a very straightforward way to do it: use `sv.svstep`. ``` -crternlogi 0,1,2,127 +sv.crnand/m=lt/zz *19,*16,0 # Vector SO = Vector ~LT, if CR0.eq=0 i ----> 0 1 2 3 CR4.EQ CR5.EQ CR6.EQ CR7.EQ -- 2.30.2