simple_v_extension/daxpy_example.mdwn

   1 ```
   2     # c code
   3     void daxpy(size_t n, double a, const double x[], double y[])
   4     {
   5      /* y <- a*x + y, applied element by element to n entries */
   6      for (size_t k = 0; k != n; ++k)
   7        y[k] = a * x[k] + y[k];
   8     }
   9 ```
  10
  11 -----
  12
  13 ```
  14     # SVP64 Power ISA version
  15     # r5: n
  16     # r6: x
  17     # r7: y
  18     # fp1: a
  19     mtctr 5                 # move n to CTR
  20     addi r10,r7,0           # copy y-ptr (r7) into r10, used by the store
  21 .L2:
  22     setvl MAXVL=32,VL=CTR   # VL from CTR, capped at MAXVL (could do more)
  23     sv.lfdup/els *32,8(6)   # load from x
  24     sv.lfdup/els *64,8(7)   # load from y
  25     sv.fmadd *64,*32,1,*64  # y = x*a + y  (fmadd FRT,FRA,FRC,FRB: FRA*FRC+FRB)
  26     sv.stfdup/els *64,8(10) # store to y via the y-copy
  27     sv.bc/ctr .L2           # decrement CTR by VL, loop until CTR is zero
  28     blr                     # return
  29 ```
  30
  31 -----
  32
  33 ```
  34     # SV Version: daxpy (y = a*x + y), up to MVL=4 elements per loop pass
  35     # a0 is n, a1 is ptr to x[0], a2 is ptr to y[0], fa0 is a (scalar)
  36       VBLK.REG[0] = {type: F, isvec: 1, regkey: a3, regidx: a3, elwidth: dflt}  # a3: FP vector, holds x
  37       VBLK.REG[1] = {type: F, isvec: 1, regkey: a7, regidx: a7, elwidth: dflt}  # a7: FP vector, holds y
  38     loop:
  39       VBLK.SETVL  t0, a0, #4   # MVL=4, vl = t0 = min(a0, MVL)
  40       c.ld     a3, a1          # load vl (up to 4) registers a3-6 from x
  41       c.slli   t1, t0, 3       # t1 = vl * 8 (in bytes: FP is double)
  42       c.ld     a7, a2          # load vl (up to 4) registers a7-10 from y
  43       c.add    a1, a1, t1      # increment pointer to x by vl*8
  44       fmadd  a7, a3, fa0, a7   # a7-10 += a3-6 * fa0 (y = a * x + y)
  45       c.sub    a0, a0, t0      # n -= vl (t0)
  46       c.st     a7, a2          # store vl (up to 4) registers a7-10 to y
  47       c.add    a2, a2, t1      # increment pointer to y by vl*8
  48       c.bnez   a0, loop        # repeat if n != 0
  49       c.ret                    # return
  50 ```
  51
  52 -----
  53
  54 ```
  55     # RVV version: daxpy (y = a*x + y), vl elements per loop pass
  56     # a0 is n, a1 is pointer to x[0], a2 is pointer to y[0], fa0 is a
  57       li t0, 2<<25            # config word consumed by vsetdcfg below
  58       vsetdcfg t0             # enable 2 64b Fl.Pt. registers (used as v0, v1)
  59     loop:
  60       setvl  t0, a0           # vl = t0 = min(mvl, n)
  61       vld    v0, a1           # load vector x into v0
  62       c.slli   t1, t0, 3      # t1 = vl * 8 (byte count: 8 bytes per 64b element)
  63       vld    v1, a2           # load vector y into v1
  64       c.add    a1, a1, t1     # increment pointer to x by vl*8
  65       vfmadd v1, v0, fa0, v1  # v1 += v0 * fa0 (y = a * x + y)
  66       c.sub    a0, a0, t0     # n -= vl (t0)
  67       vst    v1, a2           # store updated y (v1) back to memory
  68       c.add    a2, a2, t1     # increment pointer to y by vl*8
  69       c.bnez   a0, loop       # repeat if n != 0
  70       c.ret                   # return
  71 ```