bug 1244: update maxloc slides after finding shorter algorithm
author Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Tue, 13 Feb 2024 12:15:05 +0000 (12:15 +0000)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Tue, 13 Feb 2024 12:15:05 +0000 (12:15 +0000)
conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex
conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s

index f196881e6a73e487ded1bfc6f9b52e46c03e1f37..49e37798cf32501352e374b3a9d2fc2c08fdd6d5 100644 (file)
@@ -221,14 +221,15 @@ for (i = 0; i < VL; i++)
 
        \begin{itemize}
                \item FORTRAN MAXLOC - find the index of largest number
-               \item notoriously difficult to optimally implement for SIMD
+                     notoriously difficult to optimally implement for SIMD
                \item algorithms include \textit{depth-first} recursive
                      descent (!) mapreduce-style, offsetting the
                      locally-computed largest index (plus value) which
                      are then tested in upper level(s)
-               \item SVP64 through Data-Dependent Fail-First can perform
-                         each of the two key while-loop tests with
-                         \textit{single instructions}.
+               \item SVP64: note below the sv.cmp (first while-loop),
+               sv.minmax. (second while-loop) and the sv.crnand which
+               by Predicate masking is 3-in 1-out CR ops
+               not the usual 2-in 1-out
                \item There is however quite a bit of "housekeeping".
                        Full analysis: \\
        https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc
@@ -238,7 +239,6 @@ for (i = 0; i < VL; i++)
 \frame{\frametitle{maxloc assembler}
        
        \lstinputlisting[language={}]{maxloc.s}
-       
 }
 
 \frame{\frametitle{Summary}
index 2639c112d8ffbe3de4ce8a1b17ea951d12342891..a343ab93153415837a670f160c43ffe565c76c66 100644 (file)
@@ -12,4 +12,4 @@ sv.crnand/m=lt/zz *19,*16,0 # SO=~LT, if CR0.eq=0
 #   nm = i: count masked bits. could use crweirds
 sv.svstep/mr/m=so 1,0,6,1 # get vector dststep
 sv.creqv *16,*16,*16  # set mask on already-tested
-bc 12,0,-0x40         # CR0 lt clear, branch back
+bc 12,0,-0x3c         # CR0 lt clear, branch back