From: lkcl <lkcl@web>
Date: Thu, 18 May 2023 14:11:55 +0000 (+0100)
Subject: (no commit message)
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=c3882020d2f0533ae20d20a638324a913d4ba547;p=libreriscv.git

---

diff --git a/simple_v_extension/daxpy_example.mdwn b/simple_v_extension/daxpy_example.mdwn
index 4e3163242..e9cacf006 100644
--- a/simple_v_extension/daxpy_example.mdwn
+++ b/simple_v_extension/daxpy_example.mdwn
@@ -11,22 +11,41 @@
 
 # SVP64 Power ISA version
 
-```  
+```
+    # r5: n count
+    # r6: x ptr
+    # r7: y ptr
+    # fp1: a mul-scalar
+    1  mtctr 5                # move n to CTR
+    2  addi r10,r7,0          # copy y-ptr into r10 (y')
+    3  .L2
+    4  setvl MAXVL=32,VL=CTR  # actually VL=MIN(MAXVL,CTR)
+    5  sv.lfdup *32,8(6)      # load x into fp32-63, inc x
+    6  sv.lfdup *64,8(7)      # load y into fp64-95, inc y
+    7  sv.fmadd *64,*32,1,*64 # (*y) = (*x) * fp1 + (*y)
+    8  sv.stfdup *64,8(10)    # store at y-copy, inc y'
+    9  sv.bc/ctr .L2          # decrement CTR by VL
+    10 blr                    # return
+```
 
+A refinement that removes one instruction and reduces register port usage.
+It relies on post-increment, on x and y not overlapping in memory,
+and, critically, on y being overwritten in place.
+
+```
     # r5: n count
     # r6: x ptr
     # r7: y ptr
     # fp1: a mul-scalar
-    mtctr 5                # move n to CTR
-    addi r10,r6,0          # copy y-ptr into r10
-.L2
-    setvl MAXVL=32,VL=CTR  # could do more
-    sv.lfdup/els *32,8(6)  # load from x into fp32-63
-    sv.lfdup/els *64,8(7)  # load from y into fp64-95
-    sv.fmadd *64,*64,1,*32 # (*y) = (*y) * (*x) + fp1
-    stfdup/els *64,8(10)   # store y-copy
-    sv.bc/ctr .L2          # decrement VL by CTR
-    blr                    # return
+    1  mtctr 5                # move n to CTR
+    2  .L2
+    3  setvl MAXVL=32,VL=CTR  # actually VL=MIN(MAXVL,CTR)
+    4  sv.lfdup *32,8(6)      # load x into fp32-63, incr x
+    5  sv.lfd *64,8(7)        # load y into fp64-95, NO INC
+    6  sv.fmadd *64,*32,1,*64 # (*y) = (*x) * fp1 + (*y)
+    7  sv.stfdup *64,8(7)     # store at y, incr y
+    8  sv.bc/ctr .L2          # decrement CTR by VL
+    9  blr                    # return
 ```
 
 # RVV version