From b2e1c45b3913e10e3f24fadd10799df49246238d Mon Sep 17 00:00:00 2001 From: lkcl Date: Sat, 20 May 2023 09:04:06 +0100 Subject: [PATCH] --- simple_v_extension/daxpy_example.mdwn | 39 +++++++++++---------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/simple_v_extension/daxpy_example.mdwn b/simple_v_extension/daxpy_example.mdwn index d99c3b58f..902f02d88 100644 --- a/simple_v_extension/daxpy_example.mdwn +++ b/simple_v_extension/daxpy_example.mdwn @@ -9,40 +9,31 @@ # SVP64 Power ISA version -Summary: 9 instructions (see below, 8 instructions) 5 of which are 64-bit +Summary: 9 instructions, 5 of which are 64-bit for a total of 14 "words". +Relies on post-increment, relies on no overlap between x and y +in memory, and critically relies on y overwrite. x is post-incremented +when read, but y is post-incremented on write. Element-Strided +ensures the Immediate (8) results in a contiguous LD (or store) +despite RA being marked Scalar. RA is Scalar so that only one +LD/ST Update "wins": the last write to RA is the address for +the next block. + ``` # r5: n count; r6: x ptr; r7: y ptr; fp1: a - 1 mtctr 5 # move n to CTR - 2 addi r10,r6,0 # copy y-ptr into r10 (y') + 1 addi r3,r7,0 # return result + 2 mtctr 5 # move n to CTR 3 .L2 4 setvl MAXVL=32,VL=CTR # actually VL=MIN(MAXVL,CTR) - 5 sv.lfdup *32,8(6) # load x into fp32-63, inc x - 6 sv.lfdup *64,8(7) # load y into fp64-95, inc y - 7 sv.fmadd *64,*64,1,*32 # (*y) = (*y) * (*x) + fp1 - 8 sv.stfdup *64,8(10) # store at y-copy, inc y' + 5 sv.lfdup/els *32,8(6) # load x into fp32-63, incr x + 6 sv.lfd/els *64,8(7) # load y into fp64-95, NO INC + 7 sv.fmadd *64,*64,1,*32 # (*y) = (*y) * a + (*x) + 8 sv.stfdup/els *64,8(7) # store at y, incr y 9 sv.bc/ctr .L2 # decr CTR by VL, jump !zero 10 blr # return ``` -A refinement, reducing 1 instruction and register port usage. -Relies on post-increment, relies on no overlap between x and y -in memory, and critically relies on y overwrite. 
- -``` - # r5: n count; r6: x ptr; r7: y ptr; fp1: a - 1 mtctr 5 # move n to CTR - 2 .L2 - 3 setvl MAXVL=32,VL=CTR # actually VL=MIN(MAXVL,CTR) - 4 sv.lfdup *32,8(6) # load x into fp32-63, incr x - 5 sv.lfd *64,8(7) # load y into fp64-95, NO INC - 6 sv.fmadd *64,*64,1,*32 # (*y) = (*y) * (*x) + fp1 - 7 sv.stfdup *64,8(7) # store at y, incr y - 8 sv.bc/ctr .L2 # decr CTR by VL, jump !zero - 9 blr # return -``` - # RVV version Summary: 12 instructions, 7 32-bit and 5 16-bit for a total of 9.5 "words" -- 2.30.2