f14d1869559efbf218f3ccdbd95f1ddf37dd6a41
[riscv-tests.git] / benchmarks / vec-matmul / vec_matmul_asm.S
1 #*****************************************************************************
2 # matmul function (assembly version)
3 #-----------------------------------------------------------------------------
4
5
#--------------------------------------------------------------------------
# Register Aliases
#--------------------------------------------------------------------------
#
# Readable names for the registers used by the routines in this file.
# The argument aliases rely on the function calling convention placing the
# first argument in a0, the second in a1, and so on.  rN and rLda are two
# names for the same register (a0).
#
# NOTE: keep comments off the alias lines themselves; anything after the
# register name would become part of the macro expansion.

# --- function arguments, in prototype order: (n, a, b, c) ---
#define rN       a0
#define rLda     a0
#define rA       a1
#define rB       a2
#define rC       a3

# --- vector control ---
# rNum    : desired app vector length (number of elements to vectorize)
# rVlen   : given vector-length
# rVTAddr : address of VT function
#define rNum     a9
#define rVlen    a7
#define rVTAddr  v1

# --- scalar scratch ---
#define rTemp0   a8

# --- element addresses, plus two secondary address temporaries ---
#define rATemp   a10
#define rBTemp   a11
#define rCTemp   a12
#define rATmp2   v0
#define rBTmp2   s0

# --- loop counters ---
#define rI       a13
#define rJ       s1
#define rK       s2

# --- values scaled by 4 ---
#define rLda4    a4
#define rK4      a5
#define rI4      a6

# WARNING: do not write to the s0,...,s9 registers (this includes rBTmp2,
# rJ and rK) without first saving them to the stack!
46
#--------------------------------------------------------------------------
# void scalar_matmul_asm( int n, float a[], float b[], float c[] )
#--------------------------------------------------------------------------
#
# Scalar reference kernel.  For every j, i, k in [0, n):
#
#     c[j*n + i] += a[j*n + k] * b[k*n + i]
#
# In:     a0 = n, a1 = a, a2 = b, a3 = c   (rLda, rA, rB, rC)
# Out:    c is updated in place; returns immediately when n <= 0
# Uses:   rTemp0, rATemp, rBTemp, rCTemp, rI, f2, f3, f4 as scratch.
#         rJ and rK are aliases of s1 and s2, so both are spilled to the
#         stack on entry and reloaded before returning.

    .text
    .align 2
    .globl scalar_matmul_asm
    .type scalar_matmul_asm,@function

scalar_matmul_asm:

    # ***** Scalar Example *****

    blez    rLda, done              # n <= 0: nothing to do, no frame built yet

    addi    sp, sp, -16             # 16-byte frame for the two s-registers
    sd      s1, 0(sp)               # s1 backs rJ
    sd      s2, 8(sp)               # s2 backs rK

    move    rJ, zero
loopj:
    move    rI, zero
loopi:
    move    rK, zero
loopk:
    mul     rTemp0, rJ, rLda        # j*n, shared by the a and c indices
    add     rATemp, rK, rTemp0
    slli    rATemp, rATemp, 2       # scale by sizeof(float)
    add     rATemp, rA, rATemp      # &a[j*n + k]
    add     rCTemp, rI, rTemp0
    slli    rCTemp, rCTemp, 2
    add     rCTemp, rC, rCTemp      # &c[j*n + i]

    mul     rTemp0, rK, rLda        # k*n
    add     rBTemp, rI, rTemp0
    slli    rBTemp, rBTemp, 2
    add     rBTemp, rB, rBTemp      # &b[k*n + i]

    flw     f2, 0(rATemp)
    flw     f3, 0(rBTemp)
    flw     f4, 0(rCTemp)           # c is re-read and re-written on every k
    fmul.s  f3, f2, f3
    fadd.s  f4, f4, f3
    fsw     f4, 0(rCTemp)
endk:
    addi    rK, rK, 1
    blt     rK, rLda, loopk
endi:
    addi    rI, rI, 1
    blt     rI, rLda, loopi
endj:
    addi    rJ, rJ, 1
    blt     rJ, rLda, loopj

    ld      s1, 0(sp)               # give the caller back its s1 and s2
    ld      s2, 8(sp)
    addi    sp, sp, 16
done:
    ret
101
102
103 #--------------------------------------------------------------------------
104 # void vt_matmul_asm( int n, float a[], float b[], float c[] )
105 #--------------------------------------------------------------------------
#
# Accumulates the product of a and b into c (4-byte floats, row pitch n*4
# bytes) using the vector unit.  Loop nest as written below:
#   j : rows of c, two per trip (vf2 holds row j, vf4 holds row j+1)
#   i : columns of c, advanced by the vector length returned in rVlen
#   k : reduction index, four per trip (the body is unrolled four times)
# Each step loads one a value per row into vf0 and vf3, a slice of one row
# of b into vf1, and then runs vtcode through vf.
#
# In:    a0 = n, a1 = a, a2 = b, a3 = c   (rN/rLda, rA, rB, rC)
# Saved: s0, s1, s2 (rBTmp2, rJ, rK) are stored on entry, reloaded at cpdone
#
# NOTE(review): no remainder handling is visible for j or k, so n apparently
# has to be even and a multiple of 4 - confirm with the callers.
# NOTE(review): remarks on the vector mnemonics describe how this code
# appears to use them; confirm against the vector ISA reference.
106
107
108 # ***** Vector-Thread Example *****
109
110 .globl vt_matmul_asm
111 .type vt_matmul_asm,@function
112
113 vt_matmul_asm:
114 addi sp, sp, -24 # room for s0-s2; NOTE(review): 24 is not a multiple of 16, confirm the ABI stack alignment rule
115 sd s0, 0(sp) # s0 backs rBTmp2
116 sd s1, 8(sp) # s1 backs rJ
117 sd s2, 16(sp) # s2 backs rK
118
119
120 # turn on vector unit
121 mfpcr a13,cr0
122 ori a13,a13,4 # OR the bit with value 4 into the cr0 copy
123 mtpcr x0,a13,cr0
124
125 blez rLda, cpdone # exit early if n <= 0 (cpdone still reloads s0-s2)
126
127
128 la rVTAddr, vtcode # target of every vf below
129 slli rLda4, rLda, 2 # row pitch in bytes: n * 4
130
131 # For starters ask for all the registers. We should not need this many,
132 # but we will trim it once we have correctness in hand.
133 vvcfgivl rVlen, rNum, 1, 5 # NOTE(review): rNum is first assigned in vec_loopi, after this point - confirm its value is not relied on here
134
135
136 move rJ, zero
137 vec_loopj:
138 move rI, zero
139 vec_loopi:
140 slli rI4, rI, 2 # byte offset of column i
141
142 sub rNum, rN, rI # elements left in this row: n - i
143 vsetvl rVlen, rNum # set the vector length
144 # rNum is the desired (application) vector length
145 # rVlen is what vector length we were given
146
147 #####################################
148 # LOADS FOR C #
149 #####################################
150 mul rTemp0, rJ, rLda4 # byte offset of row j
151 add rCTemp, rI4, rTemp0
152
153 add rCTemp, rC, rCTemp # &c[j*n + i]
154 vflw vf2, rCTemp # vf2 <- slice of c row j
155
156 add rCTemp, rCTemp, rLda4 # step to row j+1
157 vflw vf4, rCTemp # vf4 <- slice of c row j+1
158
159
160 #################################
161 # address calculation lifts #
162 #################################
163 mul rTemp0, rJ, rLda4 # same product as computed for c above
164 add rATmp2, rA, rTemp0 # &a[j*n]
165
166 add rBTmp2, rI4, rB # &b[i]
167 move rK, zero
168 vec_loopk:
169 slli rK4, rK, 2 # byte offset of column k within a row of a
170
171 #####################################
172 # LOADS FOR A #
173 #####################################
174
175 add rATemp, rK4, rATmp2 # &a[j*n + k]
176 vflstw vf0, rATemp, zero # stride operand is zero: presumably every element gets a[j*n + k] - confirm
177
178 add rATemp, rATemp, rLda4 # &a[(j+1)*n + k]
179 vflstw vf3, rATemp, zero
180
181
182 #####################################
183 # LOADS FOR B #
184 #####################################
185 mul rTemp0, rK, rLda4 # byte offset of row k of b
186 add rBTemp, rBTmp2, rTemp0 # &b[k*n + i]
187 vflw vf1, rBTemp
188 vf 0(rVTAddr) # run vtcode with vf0, vf1, vf3 as inputs
189
190 #####################################
191 # LOADS FOR A (k+1) #
192 #####################################
193 add rATemp, rK4, rATmp2
194 addi rATemp, rATemp, 4 # &a[j*n + k + 1]
195 vflstw vf0, rATemp, zero
196
197 add rATemp, rATemp, rLda4 # same column, row j+1
198 vflstw vf3, rATemp, zero
199
200
201 #####################################
202 # LOADS FOR B (k+1) #
203 #####################################
204 add rBTemp, rBTemp, rLda4 # next row of b
205 vflw vf1, rBTemp
206 vf 0(rVTAddr)
207
208 #####################################
209 # LOADS FOR A (k+2) #
210 #####################################
211 add rATemp, rK4, rATmp2
212 addi rATemp, rATemp, 8 # &a[j*n + k + 2]
213 vflstw vf0, rATemp, zero
214
215 add rATemp, rATemp, rLda4 # same column, row j+1
216 vflstw vf3, rATemp, zero
217
218
219 #####################################
220 # LOADS FOR B (k+2) #
221 #####################################
222 add rBTemp, rBTemp, rLda4 # next row of b
223 vflw vf1, rBTemp
224 vf 0(rVTAddr)
225
226
227 #####################################
228 # LOADS FOR A (k+3) #
229 #####################################
230 add rATemp, rK4, rATmp2
231 addi rATemp, rATemp, 12 # &a[j*n + k + 3]
232 vflstw vf0, rATemp, zero
233
234 add rATemp, rATemp, rLda4 # same column, row j+1
235 vflstw vf3, rATemp, zero
236
237
238 #####################################
239 # LOADS FOR B (k+3) #
240 #####################################
241 add rBTemp, rBTemp, rLda4 # next row of b
242 vflw vf1, rBTemp
243 vf 0(rVTAddr)
244
245 vec_endk:
246 addi rK, rK, 4 # four k values were handled above
247 blt rK, rLda, vec_loopk
248
249 vec_endi:
250 #####################################
251 # STORES FOR C #
252 #####################################
253 vfsw vf4, rCTemp # rCTemp still points at row j+1 here
254 sub rCTemp, rCTemp, rLda4 # back to row j
255 vfsw vf2, rCTemp
256
257 add rI, rI, rVlen # advance by the vector length we were given
258 blt rI, rLda, vec_loopi
259 vec_endj:
260 addi rJ, rJ, 2 # two rows of c were handled per trip
261 # fence.v.l
262 blt rJ, rLda, vec_loopj
263
264
265 cpdone:
266 fence.v.l # presumably waits for outstanding vector memory traffic - confirm
267 ld s0, 0(sp) # reload the three registers stored on entry
268 ld s1, 8(sp)
269 ld s2, 16(sp)
270 addi sp, sp, 24
271
272
273 ret
274
#--------------------------------------------------------------------------
# vtcode: per-element work fetched by the vf instructions in vt_matmul_asm
#--------------------------------------------------------------------------
#
# Register roles, as set up by vt_matmul_asm before each vf:
#   f1      value from b
#   f0, f3  values from a for two consecutive rows
#   f2, f4  running sums for the matching two rows of c
#
# Both updates are fused multiply-adds and do not depend on each other.

vtcode:
    fmadd.s f2, f0, f1, f2          # f2 = f0 * f1 + f2
    fmadd.s f4, f3, f1, f4          # f4 = f3 * f1 + f4
    stop                            # end of the vector-fetched block
289
#--------------------------------------------------------------------------
# transpose:  a[j*n + i] = b[i*n + j]   for all i, j in [0, n)
#--------------------------------------------------------------------------
#
# In:     a0 = n, a1 = a (written), a2 = b (read)   (rLda, rA, rB)
# Uses:   a13, rVlen, rNum, rTemp0, rATemp, rBTemp, rI, rLda4, vf0 as scratch.
#         rJ is an alias of s1, so s1 is spilled on entry and reloaded
#         before returning.
#
# For each j, one strided vector load walks down column j of b (element
# pitch rLda4 = n*4 bytes) and one vector store writes those values to
# consecutive elements of row j of a.  rI advances by the vector length
# returned in rVlen.
#
# This routine has its own frame and its own exit label; it must not share
# an epilogue with a routine that uses a different frame layout.

transpose:
    mfpcr   a13, cr0                # turn on vector unit
    ori     a13, a13, 4
    mtpcr   x0, a13, cr0

    blez    rLda, tdone             # n <= 0: nothing to do, no frame built yet

    addi    sp, sp, -16             # 16-byte frame, only the first slot is used
    sd      s1, 0(sp)               # s1 backs rJ

    slli    rLda4, rLda, 2          # load stride in bytes: n * 4
    move    rNum, rLda              # give the configuration a defined length request
    vvcfgivl rVlen, rNum, 1, 1

    move    rI, zero
tloopi:
    sub     rNum, rLda, rI          # elements still to do along i
    vsetvl  rVlen, rNum

    move    rJ, zero
tloopj:
    mul     rTemp0, rJ, rLda
    add     rATemp, rI, rTemp0
    slli    rATemp, rATemp, 2
    add     rATemp, rA, rATemp      # &a[j*n + i]

    mul     rTemp0, rI, rLda
    add     rBTemp, rJ, rTemp0
    slli    rBTemp, rBTemp, 2
    add     rBTemp, rB, rBTemp      # &b[i*n + j]

    vflstw  vf0, rBTemp, rLda4      # b[(i+e)*n + j] for each element e
    vfsw    vf0, rATemp             # -> a[j*n + i + e]

tendj:
    addi    rJ, rJ, 1
    blt     rJ, rLda, tloopj
tendi:
    add     rI, rI, rVlen
    blt     rI, rLda, tloopi

    fence.v.l                       # same fence vt_matmul_asm issues before it returns
    ld      s1, 0(sp)
    addi    sp, sp, 16
tdone:
    ret                             # jump back to the address the caller linked
337
338
339
340 #####################################
341 # NOPS TO AVOID OVERPREFETCH #
342 #####################################
343 # srli rTemp0, rLda, 4
344 #nop_lp: addi rTemp0, rTemp0, -1
345 # bgez rTemp0, nop_lp