bug 676: noted a way to reduce the number of instructions

author Luke Kenneth Casson Leighton <lkcl@lkcl.net>

Tue, 13 Feb 2024 12:09:46 +0000 (12:09 +0000)

committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>

Tue, 13 Feb 2024 12:09:46 +0000 (12:09 +0000)
author Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Tue, 13 Feb 2024 12:09:46 +0000 (12:09 +0000)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Tue, 13 Feb 2024 12:09:46 +0000 (12:09 +0000)
diff --git a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex

index fdbc83369f330f9c73983aa6705a336a584d619f..f196881e6a73e487ded1bfc6f9b52e46c03e1f37 100644 (file)
--- a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex
+++ b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex
@@ -220,11 +220,22 @@ for (i = 0; i < VL; i++)
      \lstinputlisting[language={}]{maxloc.py}
  
         \begin{itemize}
-               \item "TODO
+               \item FORTRAN MAXLOC - find the index of largest number
+               \item notoriously difficult to optimally implement for SIMD
+               \item algorithms include \textit{depth-first} recursive
+                     descent (!) mapreduce-style, offsetting the
+                     locally-computed largest index (plus value) which
+                     are then tested in upper level(s)
+               \item SVP64 through Data-Dependent Fail-First can perform
+                         each of the two key while-loop tests with
+                         \textit{single instructions}.
+               \item There is however quite a bit of "housekeeping".
+                       Full analysis: \\
+       https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
         \end{itemize}
  }
  
-\frame{\frametitle{maxlocassembler}
+\frame{\frametitle{maxloc assembler}
         
         \lstinputlisting[language={}]{maxloc.s}
         
diff --git a/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s b/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s

index f72de742ada525b7e148fb4dedffa11faf37d2fa..2639c112d8ffbe3de4ce8a1b17ea951d12342891 100644 (file)
--- a/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s
+++ b/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s
@@ -8,7 +8,7 @@ mtcrf 128,0           # clear CR0 (in case VL=0?)
  #  while (i<n and a[i]>m):
  sv.minmax./ff=le/m=ge/mr 4,*10,4,1 # r4 accumulate
  crternlogi 0,1,2,127  # test >= (or VL=0)
-sv.crand *19,*16,0    # clear if CR0.eq=0
+sv.crnand/m=lt/zz *19,*16,0 # SO=~LT, if CR0.eq=0
  #   nm = i: count masked bits. could use crweirds
  sv.svstep/mr/m=so 1,0,6,1 # get vector dststep
  sv.creqv *16,*16,*16  # set mask on already-tested
diff --git a/openpower/sv/cookbook/fortran_maxloc.mdwn b/openpower/sv/cookbook/fortran_maxloc.mdwn

index 5bfb537e6f4a9fecea99ff8c578e112dbdbe170d..232008ac9b9b2f8b30e29b50461260fc71688102 100644 (file)
--- a/openpower/sv/cookbook/fortran_maxloc.mdwn
+++ b/openpower/sv/cookbook/fortran_maxloc.mdwn
@@ -126,12 +126,11 @@ setvl 2,0,4,0,1,1                  # set MVL=4, VL=MIN(MVL,CTR)
  mtcrf 128, 0                       # clear CR0 (in case VL=0?)
  #    while (i<n and a[i]>m):
  sv.minmax./ff=le/m=ge/mr 4,*10,4,1 # uses r4 as accumulator
-crternlogi 0,1,2,127               # test greater/equal or VL=0
-sv.crand *19,*16,0                 # clear if CR0.eq=0
+sv.crnand/m=lt/zz *19,*16,0 # SO=~LT, if CR0.eq=0
  #      nm = i (count masked bits. could use crweirds here TODO)
  sv.svstep/mr/m=so 1, 0, 6, 1       # svstep: get vector dststep
  sv.creqv *16,*16,*16               # set mask on already-tested
-bc 12,0, -0x40                     # CR0 lt bit clear, branch back
+bc 12,0, -0x3c                     # CR0 lt bit clear, branch back
  ```
  
  `sv.cmp` can be used in the first while loop because m (r4, the current
@@ -171,12 +170,16 @@ by Vector Length (VL) being truncated - potentially even to zero!
  that happens then CR0 would be left it in its previous state: a
  very much undesirable behaviour!)
  
-`crternlogi 0,1,2,127` will combine the setting of CR0.EQ and CR0.LT
-to give us a true Greater-than-or-equal, including under the circumstance
-where VL=0. The `sv.crand` will then take a copy of the `i-in-unary`
-mask, but only when CR0.EQ is set. This is why the third operand `BB`
-is a Scalar not a Vector (BT=16/Vector, BA=19/Vector, BB=0/Scalar)
-which effectively performs a broadcast-splat-ANDing, as follows:
+`sv.crnand/m=lt/zz` is quite sophisticated - a lot is going on behind
+the scenes.  The effect is (through the NAND) to invert the Less-than
+to give us a Greater-than-or-equal, including under the circumstance
+where VL=0, but only when CR0.EQ is set. Note that the third operand
+`BB` (CR0.EQ) is a *scalar*, but that zeroing is used here. Therefore
+whenever the Vector of `LT` bits is zero, a zero is put into the
+Vector `SO` result. In effect, the predication is being exploited
+as a way to combine a third operand into what would otherwise be a
+2-in 1-out Condition Register operation, making it effectively 3-in 1-out.
+Obscure but effective!
  
  ```
      CR4.SO = CR4.EQ AND CR0.EQ (if VL >= 1)
@@ -194,7 +197,7 @@ the `crweird` suite of instructions, combined with `popcnt`. However
  there is a very straightforward way to it: use `sv.svstep`.
  
  ```
-crternlogi 0,1,2,127
+sv.crnand/m=lt/zz *19,*16,0 # Vector SO = Vector ~LT, if CR0.eq=0
  
      i ----> 0        1        2        3
              CR4.EQ   CR5.EQ   CR6.EQ   CR7.EQ
author	Luke Kenneth Casson Leighton <lkcl@lkcl.net>
	Tue, 13 Feb 2024 12:09:46 +0000 (12:09 +0000)
committer	Luke Kenneth Casson Leighton <lkcl@lkcl.net>
	Tue, 13 Feb 2024 12:09:46 +0000 (12:09 +0000)
conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex		patch \| blob \| history
conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s		patch \| blob \| history
openpower/sv/cookbook/fortran_maxloc.mdwn		patch \| blob \| history