benchmarks/vec-cmplxmult/vec_cmplxmult_asm.S

   1 #*****************************************************************************
   2 # cmplxmult function (assembly version)
   3 #-----------------------------------------------------------------------------
   4
   5
   6 #--------------------------------------------------------------------------
   7 # Headers and Defines
   8 #--------------------------------------------------------------------------
   9
  10 # Here are some defines that make writing assembly code easier.
  11
  12 # These aliases rely on the calling convention: the first argument (n) arrives
  13 # in a0, the second (a) in a1, and so on through c in a3.
  14
  15
  16 #define rN      a0
  17 #define rA      a1
  18 #define rB      a2
  19 #define rC      a3
  20
  21 #define rVlen   a6
  22 #define rStride a7
  23
  24 #define rAI t0
  25 #define rBI t1
  26 #define rCI t2
  27
  28 # WARNING: do not write to the s0,...,s9 registers without first saving them to
  29 # the stack.
  30
  31 #--------------------------------------------------------------------------
  32 # void scalar_cmplxmult_asm( int n, float a[], float b[], float c[] )
  33 #--------------------------------------------------------------------------
  34
  35         .text
  36         .align 2
  37         .globl scalar_cmplxmult_asm
  38         .type  scalar_cmplxmult_asm,@function
  39
  40 scalar_cmplxmult_asm:
  41
  42         # *****   Scalar Example   *****
  43         # In: rN = element count, rA/rB = inputs, rC = output (all are modified).
  44         blez rN, done    # exit early if n <= 0
  45
  46 loop:
  47         # Each element is a (real, imaginary) float pair at offsets 0 and 4:
  48         #   c.re = a.re*b.re - a.im*b.im,   c.im = a.im*b.re + a.re*b.im
  49         # Only f2-f7 (ft2-ft7, caller-saved temporaries) are written; f8/f9 are
  50         # fs0/fs1, which the standard RISC-V calling convention makes callee-saved.
  51         # Still a naive schedule: reordering, fused multiply-add (fmadd.s/fmsub.s)
  52         # or unrolling could improve performance.
  53
  54         flw  f2, 0(rA)           # f2 = a.re
  55         flw  f3, 4(rA)           # f3 = a.im
  56         flw  f4, 0(rB)           # f4 = b.re
  57         flw  f5, 4(rB)           # f5 = b.im
  58         fmul.s f6, f2, f4        # f6 = a.re * b.re
  59         fmul.s f7, f3, f5        # f7 = a.im * b.im
  60         fmul.s f4, f3, f4        # f4 = a.im * b.re (b.re not needed afterwards)
  61         fmul.s f5, f2, f5        # f5 = a.re * b.im (b.im not needed afterwards)
  62         fsub.s f6, f6, f7        # f6 = c.re
  63         fadd.s f4, f4, f5        # f4 = c.im
  64         fsw  f6, 0(rC)           # stores come after all loads of this element
  65         fsw  f4, 4(rC)
  66         addi rN, rN, -1          # one fewer element to go
  67         addi rA, rA, 8           # step every pointer past one (re, im) pair
  68         addi rB, rB, 8
  69         addi rC, rC, 8
  70         bne  rN, zero, loop
  71 done:
  72         ret
  73
  74
  75 #--------------------------------------------------------------------------
  76 # void vt_cmplxmult_asm( int n, float a[], float b[], float c[] )
  77 #--------------------------------------------------------------------------
  78
  79
  80         # ***** Vector-Thread Example *****
  81
  82         .globl vt_cmplxmult_asm
  83         .type  vt_cmplxmult_asm,@function
  84
  85         # HINT: because you are dealing with an array of structures, a regular,
  86         # vanilla vector-load/vector-store won't work here!
  87
  88 vt_cmplxmult_asm:
  89         # Strip-mined vector version: rN = elements left, rA/rB/rC walk the arrays.
  90         blez rN, vt_done         # nothing to do when n <= 0
  91         li rStride, 8            # bytes between consecutive (re, im) pairs
  92         la a4, vtcode            # a4 = address of the vector-fetch block
  93
  94         vvcfgivl rVlen, rN, 1, 7
  95
  96 vt_stripmine:
  97
  98         # Request rN elements; rVlen receives the vector length actually granted.
  99         vsetvl rVlen, rN
 100
 101         # Addresses of the first imaginary component of a, b and c.
 102         addi rAI, rA, 4
 103         addi rBI, rB, 4
 104         addi rCI, rC, 4
 105
 106         # Strided loads split each input into a real and an imaginary vector.
 107         vflstw vf2, rA, rStride  # vf2 = real parts of a
 108         vflstw vf3, rAI, rStride # vf3 = imaginary parts of a
 109         vflstw vf4, rB, rStride  # vf4 = real parts of b
 110         vflstw vf5, rBI, rStride # vf5 = imaginary parts of b
 111
 112         vf 0(a4)                 # vector-fetch: run the code at vtcode
 113
 114         # Strided stores write the results back into c, interleaved again.
 115         vfsstw vf0, rC, rStride  # real parts of c
 116         vfsstw vf1, rCI, rStride # imaginary parts of c
 117
 118         # Book keeping: advance past the rVlen elements just handled (8 bytes each).
 119         slli a5, rVlen, 3
 120         add rA, rA, a5
 121         add rB, rB, a5
 122         add rC, rC, a5
 123         sub rN, rN, rVlen
 124         bne rN, zero, vt_stripmine
 125
 126         # NOTE(review): fence.v.l presumably orders the vector stores ahead of the
 127         # return to the caller - confirm against the vector unit documentation.
 128 vt_done:
 129         fence.v.l
 130         ret
 131
 132 vtcode:
 133         # Runs once per vector element.  In: f2/f3 = a (re, im), f4/f5 = b (re, im).
 134         # Out: f0 = c.re, f1 = c.im.  fmsub.s rd, rs1, rs2, rs3 is rs1*rs2 - rs3, so
 135         # the subtrahend a.im*b.im must be formed first to get the right sign.
 136         fmul.s  f0, f3, f5          # f0 = a.im * b.im
 137         fmsub.s f0, f2, f4, f0      # f0 = a.re*b.re - a.im*b.im
 138         fmul.s  f1, f2, f5          # f1 = a.re * b.im
 139         fmadd.s f1, f3, f4, f1      # f1 = a.im*b.re + a.re*b.im
 140         stop
 141
 142         # NOTE(review): this ret sits after stop, so it looks unreachable from the
 143         # vector-fetch path; kept from the original - confirm before removing.
 144
 145         ret
 146