From 00404b7e25809a967600a88b23b3186f10aabf69 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Tue, 13 Feb 2024 12:15:05 +0000 Subject: [PATCH] bug 1244: update maxloc slides after finding shorter algorithm --- .../fosdem2024_ddffirst/fosdem2024_ddffirst.tex | 10 +++++----- conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex index f196881e6..49e37798c 100644 --- a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex +++ b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex @@ -221,14 +221,15 @@ for (i = 0; i < VL; i++) \begin{itemize} \item FORTRAN MAXLOC - find the index of largest number - \item notoriously difficult to optimally implement for SIMD + notoriously difficult to optimally implement for SIMD \item algorithms include \textit{depth-first} recursive descent (!) mapreduce-style, offsetting the locally-computed largest index (plus value) which are then tested in upper level(s) - \item SVP64 through Data-Dependent Fail-First can perform - each of the two key while-loop tests with - \textit{single instructions}. + \item SVP64: note below the sv.cmp (first while-loop), + sv.minmax. (second while-loop) and the sv.crnand which + by Predicate masking is 3-in 1-out CR ops + not the usual 2-in 1-out \item There is however quite a bit of "housekeeping". Full analysis: \\ https://libre-soc.org/openpower/sv/cookbook/fortran\_maxloc @@ -238,7 +239,6 @@ for (i = 0; i < VL; i++) \frame{\frametitle{maxloc assembler} \lstinputlisting[language={}]{maxloc.s} - } \frame{\frametitle{Summary} diff --git a/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s b/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s index 2639c112d..a343ab931 100644 --- a/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s +++ b/conferences/fosdem2024/fosdem2024_ddffirst/maxloc.s @@ -12,4 +12,4 @@ sv.crnand/m=lt/zz *19,*16,0 # SO=~LT, if CR0.eq=0 # nm = i: count masked bits. could use crweirds sv.svstep/mr/m=so 1,0,6,1 # get vector dststep sv.creqv *16,*16,*16 # set mask on already-tested -bc 12,0,-0x40 # CR0 lt clear, branch back +bc 12,0,-0x3c # CR0 lt clear, branch back -- 2.30.2