From b2e1c45b3913e10e3f24fadd10799df49246238d Mon Sep 17 00:00:00 2001
From: lkcl <lkcl@web>
Date: Sat, 20 May 2023 09:04:06 +0100
Subject: [PATCH]

---
 simple_v_extension/daxpy_example.mdwn | 39 +++++++++++----------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/simple_v_extension/daxpy_example.mdwn b/simple_v_extension/daxpy_example.mdwn
index d99c3b58f..902f02d88 100644
--- a/simple_v_extension/daxpy_example.mdwn
+++ b/simple_v_extension/daxpy_example.mdwn
@@ -9,40 +9,31 @@
 
 # SVP64 Power ISA version
 
-Summary: 9 instructions (see below, 8 instructions) 5 of which are 64-bit
+Summary: 9 instructions, 5 of which are 64-bit
 for a total of 14 "words".
 
+This relies on post-increment LD/ST Update, on x and y not
+overlapping in memory, and critically on y being overwritten in place.
+x is post-incremented when read, whereas y is post-incremented only
+when written. Element-Strided mode (`/els`) ensures that the Immediate
+(8) results in a contiguous LD (or ST) despite RA being marked Scalar.
+RA is Scalar so that only one LD/ST Update "wins": the last write to
+RA is the address of the next block.
+
 ```
     # r5: n count; r6: x ptr; r7: y ptr; fp1: a
-    1  mtctr 5                # move n to CTR
-    2  addi r10,r6,0          # copy y-ptr into r10 (y')
+    1  addi r3,r7,0           # r3 = y ptr (return value)
+    2  mtctr 5                # move n to CTR
     3  .L2
     4  setvl MAXVL=32,VL=CTR  # actually VL=MIN(MAXVL,CTR)
-    5  sv.lfdup *32,8(6)      # load x into fp32-63, inc x
-    6  sv.lfdup *64,8(7)      # load y into fp64-95, inc y
-    7  sv.fmadd *64,*64,1,*32 # (*y) = (*y) * (*x) + fp1
-    8  sv.stfdup *64,8(10)    # store at y-copy, inc y'
+    5  sv.lfdup/els *32,8(6)  # load x into fp32-63, incr x
+    6  sv.lfd/els *64,8(7)    # load y into fp64-95, NO INC
+    7  sv.fmadd *64,*32,1,*64 # (*y) = (*x) * a + (*y)
+    8  sv.stfdup/els *64,8(7) # store at y, incr y
     9  sv.bc/ctr .L2          # decr CTR by VL, jump !zero
     10 blr                    # return
 ```
 
-A refinement, reducing 1 instruction and register port usage.
-Relies on post-increment, relies on no overlap between x and y
-in memory, and critically relies on y overwrite.
-
-```
-    # r5: n count; r6: x ptr; r7: y ptr; fp1: a
-    1  mtctr 5                # move n to CTR
-    2  .L2
-    3  setvl MAXVL=32,VL=CTR  # actually VL=MIN(MAXVL,CTR)
-    4  sv.lfdup *32,8(6)      # load x into fp32-63, incr x
-    5  sv.lfd *64,8(7)        # load y into fp64-95, NO INC
-    6  sv.fmadd *64,*64,1,*32 # (*y) = (*y) * (*x) + fp1
-    7  sv.stfdup *64,8(7)     # store at y, incr y
-    8  sv.bc/ctr .L2          # decr CTR by VL, jump !zero
-    9  blr                    # return
-```
-
 # RVV version
 
 Summary: 12 instructions, 7 32-bit and 5 16-bit for a total of 9.5 "words"
-- 
2.30.2