b0ce08ae22a77b6151a328facc330ab02879b911
1 //**************************************************************************
2 // Multi-threaded Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
4 // TA : Christopher Celio
8 // This benchmark multiplies two 2-D arrays together and writes the results to
9 // a third vector. The input data (and reference data) should be generated
10 // using the matmul_gendata.pl perl script and dumped to a file named
14 // print out arrays, etc.
17 //--------------------------------------------------------------------------
25 //--------------------------------------------------------------------------
26 // Input/Reference Data
32 //--------------------------------------------------------------------------
33 // Basic Utilities and Multi-thread Support
// Core identifier with thread-local storage: `__thread` gives each thread its
// own copy of this variable.
// NOTE(review): where coreid is assigned is not visible in this chunk.
35 __thread
unsigned long coreid
;
// stringify(s) macro-expands its argument and then turns the result into a
// string literal; stringify_1 is the helper that applies the # operator.
40 #define stringify_1(s) #s
41 #define stringify(s) stringify_1(s)
// stats(code): seeds _c and _i with the negated rdcycle()/rdinstret() values,
// later adds the fresh counter values so both hold elapsed counts, and prints
// the stringified `code`, the cycle count, cycles per DIM_SIZE^3 iteration
// (integer part, then one decimal digit obtained from 10*_c/... modulo 10),
// and cycles per instruction in the same integer.decimal form.
// NOTE(review): the embedded numbering jumps 43 to 45 to 47 and the closing
// part of the do-block is not visible, so the line that actually runs `code`
// is presumably among the missing ones; confirm against the full file.
// NOTE(review): _c and _i are unsigned long but are printed with %ld.
42 #define stats(code) do { \
43 unsigned long _c = -rdcycle(), _i = -rdinstret(); \
45 _c += rdcycle(), _i += rdinstret(); \
47 printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
48 stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
52 //--------------------------------------------------------------------------
// printArrayMT: prints `name` right-aligned in a 10-character field followed
// by " :", then arr[0] .. arr[n-1], each cast to long and formatted " %3ld ".
//   name - label printed before the values
//   n    - number of elements of arr to print
//   arr  - values to print
// NOTE(review): the embedded numbering jumps 55 to 61 and neither the braces
// nor the declaration of `i` are visible, so this is a partial view of the
// function; confirm against the full file before changing it.
55 void printArrayMT( char name
[], int n
, data_t arr
[] )
61 printf( " %10s :", name
);
62 for ( i
= 0; i
< n
; i
++ )
63 printf( " %3ld ", (long) arr
[i
] );
// verifyMT: walks indices 0 .. n-1 and compares test[i] with correct[i]; the
// "FAILED" message prints the index and both values (cast to long) and is
// presumably guarded by the inequality check directly above it.
// Declared noinline.
//   n       - number of elements to compare
//   test    - computed results
//   correct - reference results
// NOTE(review): whatever follows the message (continue or abort) is not
// visible in this chunk; the embedded numbering has gaps throughout.
// NOTE(review): the index is printed with %d. The declaration of `i` is not
// visible; if it is size_t like n, the conversion does not match the
// argument type (use %zu or cast). Confirm against the full file.
67 void __attribute__((noinline
)) verifyMT(size_t n
, const data_t
* test
, const data_t
* correct
)
73 for (i
= 0; i
< n
; i
++)
75 if (test
[i
] != correct
[i
])
77 printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
78 i
, (long)test
[i
], i
, (long)correct
[i
]);
86 //--------------------------------------------------------------------------
89 // single-thread, naive version
// matmul_naive: three nested loops (i outermost, then j, then k, each running
// 0 .. lda-1) that accumulate A[j*lda + k] * B[k*lda + i] into C[i + j*lda].
// All three arrays are indexed as flat buffers with lda as the row stride, so
// j selects the row and i the column of C. Declared noinline.
//   lda - matrix dimension and row stride
//   A,B - input matrices (read only)
//   C   - output matrix, updated in place
// C is updated with += and no zeroing of C is visible here, so the products
// are added to whatever C already holds; presumably the caller supplies a
// zeroed C (confirm).
// With k innermost, B[k*lda + i] advances by lda elements per iteration.
// NOTE(review): the embedded numbering jumps 90 to 97; the loop-variable
// declarations, the braces and possibly other statements are not visible.
90 void __attribute__((noinline
)) matmul_naive(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
97 for ( i
= 0; i
< lda
; i
++ )
98 for ( j
= 0; j
< lda
; j
++ )
100 for ( k
= 0; k
< lda
; k
++ )
102 C
[i
+ j
*lda
] += A
[j
*lda
+ k
] * B
[k
*lda
+ i
];
110 void __attribute__((noinline
)) matmul(const int lda
, const data_t A
[], const data_t B
[], data_t C
[] )
113 // ***************************** //
114 // **** ADD YOUR CODE HERE ***** //
115 // ***************************** //
117 // feel free to make a separate function for MI and MSI versions.
119 //-------------------------------------------------------------first working version best 500k
121 static __thread int i, j, k;
124 for ( j = 0; j < lda; j+=2 )
126 for ( k = 0; k < lda; k++ )
128 for ( i = 0; i < lda; i++)
130 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
138 for ( j = 1; j < lda; j+=2 )
140 for ( k = 0;k < lda; k++)
142 for ( i = 0; i < lda; i++)
144 C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
151 //-------------------------------------------------------------version1.1, take read out of inner loop,300k
153 static __thread int i, j, k;
154 static __thread data_t TempA;
158 for ( j = 0; j < lda; j+=2 )
160 for ( k = 0; k < lda; k++ )
162 TempA = A[j*lda + k];
163 for ( i = 0; i < lda; i++)
165 C[i + j*lda] += TempA* B[k*lda + i];
173 for ( j = 1; j < lda; j+=2 )
175 for ( k = 0;k < lda; k++)
177 TempA = A[j*lda + k];
178 for ( i = 0; i < lda; i++)
180 C[i + j*lda] += TempA* B[k*lda + i];
186 //-------------------------------------------------------------version2.0, read 8 elements in B at one time. 140k mi, MSI117.0k
188 static __thread int i, j, k, m, n;
189 static __thread data_t TempA;
190 static __thread data_t TempB[8];
194 for ( j = 0; j < lda; j+=2 )
196 for ( k = 0; k < lda; k++ )
198 TempA = A[j*lda + k];
199 for( n = 0; n < 4; n++)
202 TempB[0] = B[k*lda+0+8*n];
203 TempB[1] = B[k*lda+1+8*n];
204 TempB[2] = B[k*lda+2+8*n];
205 TempB[3] = B[k*lda+3+8*n];
206 TempB[4] = B[k*lda+4+8*n];
207 TempB[5] = B[k*lda+5+8*n];
208 TempB[6] = B[k*lda+6+8*n];
209 TempB[7] = B[k*lda+7+8*n];
211 C[0+8*n+j*lda] += TempA * TempB[0];
212 C[1+8*n+j*lda] += TempA * TempB[1];
213 C[2+8*n+j*lda] += TempA * TempB[2];
214 C[3+8*n+j*lda] += TempA * TempB[3];
215 C[4+8*n+j*lda] += TempA * TempB[4];
216 C[5+8*n+j*lda] += TempA * TempB[5];
217 C[6+8*n+j*lda] += TempA * TempB[6];
218 C[7+8*n+j*lda] += TempA * TempB[7];
228 for ( j = 1; j < lda; j+=2 )
230 for ( k = 0; k < lda; k++ )
232 TempA = A[j*lda + k];
233 for( n = 0; n < 4; n++)
236 TempB[0] = B[k*lda+0+8*n];
237 TempB[1] = B[k*lda+1+8*n];
238 TempB[2] = B[k*lda+2+8*n];
239 TempB[3] = B[k*lda+3+8*n];
240 TempB[4] = B[k*lda+4+8*n];
241 TempB[5] = B[k*lda+5+8*n];
242 TempB[6] = B[k*lda+6+8*n];
243 TempB[7] = B[k*lda+7+8*n];
245 C[0+8*n+j*lda] += TempA * TempB[0];
246 C[1+8*n+j*lda] += TempA * TempB[1];
247 C[2+8*n+j*lda] += TempA * TempB[2];
248 C[3+8*n+j*lda] += TempA * TempB[3];
249 C[4+8*n+j*lda] += TempA * TempB[4];
250 C[5+8*n+j*lda] += TempA * TempB[5];
251 C[6+8*n+j*lda] += TempA * TempB[6];
252 C[7+8*n+j*lda] += TempA * TempB[7];
261 //-------------------------------------------------------------version2.1, optimize k. 700k. bad move to v2.2.
262 //-------------------------------------------------------------version2.9 take off all inner loops for both cores, MSI,109K. MI 182k
263 //-------------------------------------------------------------version2.10 use i= j*lda inside the n loop increase speed. but not out m and n. tried replace first 3, get 104.9k
265 static __thread int j, m, i,n;
266 static __thread data_t TempA[8];
267 static __thread data_t TempB[8];
271 for ( j = 1; j < lda; j+=2 )
274 for ( m = 0; m < 4; m++ )
277 TempA[0] = A[j*lda+0+8*m];
278 TempA[1] = A[j*lda+1+8*m];
279 TempA[2] = A[j*lda+2+8*m];
280 TempA[3] = A[j*lda+3+8*m];
281 TempA[4] = A[j*lda+4+8*m];
282 TempA[5] = A[j*lda+5+8*m];
283 TempA[6] = A[j*lda+6+8*m];
284 TempA[7] = A[j*lda+7+8*m];
286 for( n = 0; n < 4; n++)
290 TempB[0] = B[(0+8*m)*lda+0+8*n];
291 TempB[1] = B[(0+8*m)*lda+1+8*n];
292 TempB[2] = B[(0+8*m)*lda+2+8*n];
293 TempB[3] = B[(0+8*m)*lda+3+8*n];
294 TempB[4] = B[(0+8*m)*lda+4+8*n];
295 TempB[5] = B[(0+8*m)*lda+5+8*n];
296 TempB[6] = B[(0+8*m)*lda+6+8*n];
297 TempB[7] = B[(0+8*m)*lda+7+8*n];
299 C[0+8*n+i] += TempA[0] * TempB[0];
300 C[1+8*n+i] += TempA[0] * TempB[1];
301 C[2+8*n+i] += TempA[0] * TempB[2];
302 C[3+8*n+i] += TempA[0] * TempB[3];
303 C[4+8*n+i] += TempA[0] * TempB[4];
304 C[5+8*n+i] += TempA[0] * TempB[5];
305 C[6+8*n+i] += TempA[0] * TempB[6];
306 C[7+8*n+i] += TempA[0] * TempB[7];
310 TempB[0] = B[(1+8*m)*lda+0+8*n];
311 TempB[1] = B[(1+8*m)*lda+1+8*n];
312 TempB[2] = B[(1+8*m)*lda+2+8*n];
313 TempB[3] = B[(1+8*m)*lda+3+8*n];
314 TempB[4] = B[(1+8*m)*lda+4+8*n];
315 TempB[5] = B[(1+8*m)*lda+5+8*n];
316 TempB[6] = B[(1+8*m)*lda+6+8*n];
317 TempB[7] = B[(1+8*m)*lda+7+8*n];
319 C[0+8*n+i] += TempA[1] * TempB[0];
320 C[1+8*n+i] += TempA[1] * TempB[1];
321 C[2+8*n+i] += TempA[1] * TempB[2];
322 C[3+8*n+i] += TempA[1] * TempB[3];
323 C[4+8*n+i] += TempA[1] * TempB[4];
324 C[5+8*n+i] += TempA[1] * TempB[5];
325 C[6+8*n+i] += TempA[1] * TempB[6];
326 C[7+8*n+i] += TempA[1] * TempB[7];
330 TempB[0] = B[(2+8*m)*lda+0+8*n];
331 TempB[1] = B[(2+8*m)*lda+1+8*n];
332 TempB[2] = B[(2+8*m)*lda+2+8*n];
333 TempB[3] = B[(2+8*m)*lda+3+8*n];
334 TempB[4] = B[(2+8*m)*lda+4+8*n];
335 TempB[5] = B[(2+8*m)*lda+5+8*n];
336 TempB[6] = B[(2+8*m)*lda+6+8*n];
337 TempB[7] = B[(2+8*m)*lda+7+8*n];
339 C[0+8*n+i] += TempA[2] * TempB[0];
340 C[1+8*n+i] += TempA[2] * TempB[1];
341 C[2+8*n+i] += TempA[2] * TempB[2];
342 C[3+8*n+i] += TempA[2] * TempB[3];
343 C[4+8*n+i] += TempA[2] * TempB[4];
344 C[5+8*n+i] += TempA[2] * TempB[5];
345 C[6+8*n+i] += TempA[2] * TempB[6];
346 C[7+8*n+i] += TempA[2] * TempB[7];
350 TempB[0] = B[(3+8*m)*lda+0+8*n];
351 TempB[1] = B[(3+8*m)*lda+1+8*n];
352 TempB[2] = B[(3+8*m)*lda+2+8*n];
353 TempB[3] = B[(3+8*m)*lda+3+8*n];
354 TempB[4] = B[(3+8*m)*lda+4+8*n];
355 TempB[5] = B[(3+8*m)*lda+5+8*n];
356 TempB[6] = B[(3+8*m)*lda+6+8*n];
357 TempB[7] = B[(3+8*m)*lda+7+8*n];
359 C[0+8*n+i] += TempA[3] * TempB[0];
360 C[1+8*n+i] += TempA[3] * TempB[1];
361 C[2+8*n+i] += TempA[3] * TempB[2];
362 C[3+8*n+i] += TempA[3] * TempB[3];
363 C[4+8*n+i] += TempA[3] * TempB[4];
364 C[5+8*n+i] += TempA[3] * TempB[5];
365 C[6+8*n+i] += TempA[3] * TempB[6];
366 C[7+8*n+i] += TempA[3] * TempB[7];
369 TempB[0] = B[(4+8*m)*lda+0+8*n];
370 TempB[1] = B[(4+8*m)*lda+1+8*n];
371 TempB[2] = B[(4+8*m)*lda+2+8*n];
372 TempB[3] = B[(4+8*m)*lda+3+8*n];
373 TempB[4] = B[(4+8*m)*lda+4+8*n];
374 TempB[5] = B[(4+8*m)*lda+5+8*n];
375 TempB[6] = B[(4+8*m)*lda+6+8*n];
376 TempB[7] = B[(4+8*m)*lda+7+8*n];
378 C[0+8*n+i] += TempA[4] * TempB[0];
379 C[1+8*n+i] += TempA[4] * TempB[1];
380 C[2+8*n+i] += TempA[4] * TempB[2];
381 C[3+8*n+i] += TempA[4] * TempB[3];
382 C[4+8*n+i] += TempA[4] * TempB[4];
383 C[5+8*n+i] += TempA[4] * TempB[5];
384 C[6+8*n+i] += TempA[4] * TempB[6];
385 C[7+8*n+i] += TempA[4] * TempB[7];
389 TempB[0] = B[(5+8*m)*lda+0+8*n];
390 TempB[1] = B[(5+8*m)*lda+1+8*n];
391 TempB[2] = B[(5+8*m)*lda+2+8*n];
392 TempB[3] = B[(5+8*m)*lda+3+8*n];
393 TempB[4] = B[(5+8*m)*lda+4+8*n];
394 TempB[5] = B[(5+8*m)*lda+5+8*n];
395 TempB[6] = B[(5+8*m)*lda+6+8*n];
396 TempB[7] = B[(5+8*m)*lda+7+8*n];
398 C[0+8*n+i] += TempA[5] * TempB[0];
399 C[1+8*n+i] += TempA[5] * TempB[1];
400 C[2+8*n+i] += TempA[5] * TempB[2];
401 C[3+8*n+i] += TempA[5] * TempB[3];
402 C[4+8*n+i] += TempA[5] * TempB[4];
403 C[5+8*n+i] += TempA[5] * TempB[5];
404 C[6+8*n+i] += TempA[5] * TempB[6];
405 C[7+8*n+i] += TempA[5] * TempB[7];
409 TempB[0] = B[(6+8*m)*lda+0+8*n];
410 TempB[1] = B[(6+8*m)*lda+1+8*n];
411 TempB[2] = B[(6+8*m)*lda+2+8*n];
412 TempB[3] = B[(6+8*m)*lda+3+8*n];
413 TempB[4] = B[(6+8*m)*lda+4+8*n];
414 TempB[5] = B[(6+8*m)*lda+5+8*n];
415 TempB[6] = B[(6+8*m)*lda+6+8*n];
416 TempB[7] = B[(6+8*m)*lda+7+8*n];
418 C[0+8*n+i] += TempA[6] * TempB[0];
419 C[1+8*n+i] += TempA[6] * TempB[1];
420 C[2+8*n+i] += TempA[6] * TempB[2];
421 C[3+8*n+i] += TempA[6] * TempB[3];
422 C[4+8*n+i] += TempA[6] * TempB[4];
423 C[5+8*n+i] += TempA[6] * TempB[5];
424 C[6+8*n+i] += TempA[6] * TempB[6];
425 C[7+8*n+i] += TempA[6] * TempB[7];
428 TempB[0] = B[(7+8*m)*lda+0+8*n];
429 TempB[1] = B[(7+8*m)*lda+1+8*n];
430 TempB[2] = B[(7+8*m)*lda+2+8*n];
431 TempB[3] = B[(7+8*m)*lda+3+8*n];
432 TempB[4] = B[(7+8*m)*lda+4+8*n];
433 TempB[5] = B[(7+8*m)*lda+5+8*n];
434 TempB[6] = B[(7+8*m)*lda+6+8*n];
435 TempB[7] = B[(7+8*m)*lda+7+8*n];
437 C[0+8*n+i] += TempA[7] * TempB[0];
438 C[1+8*n+i] += TempA[7] * TempB[1];
439 C[2+8*n+i] += TempA[7] * TempB[2];
440 C[3+8*n+i] += TempA[7] * TempB[3];
441 C[4+8*n+i] += TempA[7] * TempB[4];
442 C[5+8*n+i] += TempA[7] * TempB[5];
443 C[6+8*n+i] += TempA[7] * TempB[6];
444 C[7+8*n+i] += TempA[7] * TempB[7];
452 for ( j = 0; j < lda; j+=2 )
455 for ( m = 0; m < 4; m++ )
458 TempA[0] = A[j*lda+0+8*m];
459 TempA[1] = A[j*lda+1+8*m];
460 TempA[2] = A[j*lda+2+8*m];
461 TempA[3] = A[j*lda+3+8*m];
462 TempA[4] = A[j*lda+4+8*m];
463 TempA[5] = A[j*lda+5+8*m];
464 TempA[6] = A[j*lda+6+8*m];
465 TempA[7] = A[j*lda+7+8*m];
467 for( n = 0; n < 4; n++)
471 TempB[0] = B[(0+8*m)*lda+0+8*n];
472 TempB[1] = B[(0+8*m)*lda+1+8*n];
473 TempB[2] = B[(0+8*m)*lda+2+8*n];
474 TempB[3] = B[(0+8*m)*lda+3+8*n];
475 TempB[4] = B[(0+8*m)*lda+4+8*n];
476 TempB[5] = B[(0+8*m)*lda+5+8*n];
477 TempB[6] = B[(0+8*m)*lda+6+8*n];
478 TempB[7] = B[(0+8*m)*lda+7+8*n];
480 C[0+8*n+i] += TempA[0] * TempB[0];
481 C[1+8*n+i] += TempA[0] * TempB[1];
482 C[2+8*n+i] += TempA[0] * TempB[2];
483 C[3+8*n+i] += TempA[0] * TempB[3];
484 C[4+8*n+i] += TempA[0] * TempB[4];
485 C[5+8*n+i] += TempA[0] * TempB[5];
486 C[6+8*n+i] += TempA[0] * TempB[6];
487 C[7+8*n+i] += TempA[0] * TempB[7];
491 TempB[0] = B[(1+8*m)*lda+0+8*n];
492 TempB[1] = B[(1+8*m)*lda+1+8*n];
493 TempB[2] = B[(1+8*m)*lda+2+8*n];
494 TempB[3] = B[(1+8*m)*lda+3+8*n];
495 TempB[4] = B[(1+8*m)*lda+4+8*n];
496 TempB[5] = B[(1+8*m)*lda+5+8*n];
497 TempB[6] = B[(1+8*m)*lda+6+8*n];
498 TempB[7] = B[(1+8*m)*lda+7+8*n];
500 C[0+8*n+i] += TempA[1] * TempB[0];
501 C[1+8*n+i] += TempA[1] * TempB[1];
502 C[2+8*n+i] += TempA[1] * TempB[2];
503 C[3+8*n+i] += TempA[1] * TempB[3];
504 C[4+8*n+i] += TempA[1] * TempB[4];
505 C[5+8*n+i] += TempA[1] * TempB[5];
506 C[6+8*n+i] += TempA[1] * TempB[6];
507 C[7+8*n+i] += TempA[1] * TempB[7];
511 TempB[0] = B[(2+8*m)*lda+0+8*n];
512 TempB[1] = B[(2+8*m)*lda+1+8*n];
513 TempB[2] = B[(2+8*m)*lda+2+8*n];
514 TempB[3] = B[(2+8*m)*lda+3+8*n];
515 TempB[4] = B[(2+8*m)*lda+4+8*n];
516 TempB[5] = B[(2+8*m)*lda+5+8*n];
517 TempB[6] = B[(2+8*m)*lda+6+8*n];
518 TempB[7] = B[(2+8*m)*lda+7+8*n];
520 C[0+8*n+i] += TempA[2] * TempB[0];
521 C[1+8*n+i] += TempA[2] * TempB[1];
522 C[2+8*n+i] += TempA[2] * TempB[2];
523 C[3+8*n+i] += TempA[2] * TempB[3];
524 C[4+8*n+i] += TempA[2] * TempB[4];
525 C[5+8*n+i] += TempA[2] * TempB[5];
526 C[6+8*n+i] += TempA[2] * TempB[6];
527 C[7+8*n+i] += TempA[2] * TempB[7];
531 TempB[0] = B[(3+8*m)*lda+0+8*n];
532 TempB[1] = B[(3+8*m)*lda+1+8*n];
533 TempB[2] = B[(3+8*m)*lda+2+8*n];
534 TempB[3] = B[(3+8*m)*lda+3+8*n];
535 TempB[4] = B[(3+8*m)*lda+4+8*n];
536 TempB[5] = B[(3+8*m)*lda+5+8*n];
537 TempB[6] = B[(3+8*m)*lda+6+8*n];
538 TempB[7] = B[(3+8*m)*lda+7+8*n];
540 C[0+8*n+i] += TempA[3] * TempB[0];
541 C[1+8*n+i] += TempA[3] * TempB[1];
542 C[2+8*n+i] += TempA[3] * TempB[2];
543 C[3+8*n+i] += TempA[3] * TempB[3];
544 C[4+8*n+i] += TempA[3] * TempB[4];
545 C[5+8*n+i] += TempA[3] * TempB[5];
546 C[6+8*n+i] += TempA[3] * TempB[6];
547 C[7+8*n+i] += TempA[3] * TempB[7];
550 TempB[0] = B[(4+8*m)*lda+0+8*n];
551 TempB[1] = B[(4+8*m)*lda+1+8*n];
552 TempB[2] = B[(4+8*m)*lda+2+8*n];
553 TempB[3] = B[(4+8*m)*lda+3+8*n];
554 TempB[4] = B[(4+8*m)*lda+4+8*n];
555 TempB[5] = B[(4+8*m)*lda+5+8*n];
556 TempB[6] = B[(4+8*m)*lda+6+8*n];
557 TempB[7] = B[(4+8*m)*lda+7+8*n];
559 C[0+8*n+i] += TempA[4] * TempB[0];
560 C[1+8*n+i] += TempA[4] * TempB[1];
561 C[2+8*n+i] += TempA[4] * TempB[2];
562 C[3+8*n+i] += TempA[4] * TempB[3];
563 C[4+8*n+i] += TempA[4] * TempB[4];
564 C[5+8*n+i] += TempA[4] * TempB[5];
565 C[6+8*n+i] += TempA[4] * TempB[6];
566 C[7+8*n+i] += TempA[4] * TempB[7];
570 TempB[0] = B[(5+8*m)*lda+0+8*n];
571 TempB[1] = B[(5+8*m)*lda+1+8*n];
572 TempB[2] = B[(5+8*m)*lda+2+8*n];
573 TempB[3] = B[(5+8*m)*lda+3+8*n];
574 TempB[4] = B[(5+8*m)*lda+4+8*n];
575 TempB[5] = B[(5+8*m)*lda+5+8*n];
576 TempB[6] = B[(5+8*m)*lda+6+8*n];
577 TempB[7] = B[(5+8*m)*lda+7+8*n];
579 C[0+8*n+i] += TempA[5] * TempB[0];
580 C[1+8*n+i] += TempA[5] * TempB[1];
581 C[2+8*n+i] += TempA[5] * TempB[2];
582 C[3+8*n+i] += TempA[5] * TempB[3];
583 C[4+8*n+i] += TempA[5] * TempB[4];
584 C[5+8*n+i] += TempA[5] * TempB[5];
585 C[6+8*n+i] += TempA[5] * TempB[6];
586 C[7+8*n+i] += TempA[5] * TempB[7];
590 TempB[0] = B[(6+8*m)*lda+0+8*n];
591 TempB[1] = B[(6+8*m)*lda+1+8*n];
592 TempB[2] = B[(6+8*m)*lda+2+8*n];
593 TempB[3] = B[(6+8*m)*lda+3+8*n];
594 TempB[4] = B[(6+8*m)*lda+4+8*n];
595 TempB[5] = B[(6+8*m)*lda+5+8*n];
596 TempB[6] = B[(6+8*m)*lda+6+8*n];
597 TempB[7] = B[(6+8*m)*lda+7+8*n];
599 C[0+8*n+i] += TempA[6] * TempB[0];
600 C[1+8*n+i] += TempA[6] * TempB[1];
601 C[2+8*n+i] += TempA[6] * TempB[2];
602 C[3+8*n+i] += TempA[6] * TempB[3];
603 C[4+8*n+i] += TempA[6] * TempB[4];
604 C[5+8*n+i] += TempA[6] * TempB[5];
605 C[6+8*n+i] += TempA[6] * TempB[6];
606 C[7+8*n+i] += TempA[6] * TempB[7];
609 TempB[0] = B[(7+8*m)*lda+0+8*n];
610 TempB[1] = B[(7+8*m)*lda+1+8*n];
611 TempB[2] = B[(7+8*m)*lda+2+8*n];
612 TempB[3] = B[(7+8*m)*lda+3+8*n];
613 TempB[4] = B[(7+8*m)*lda+4+8*n];
614 TempB[5] = B[(7+8*m)*lda+5+8*n];
615 TempB[6] = B[(7+8*m)*lda+6+8*n];
616 TempB[7] = B[(7+8*m)*lda+7+8*n];
618 C[0+8*n+i] += TempA[7] * TempB[0];
619 C[1+8*n+i] += TempA[7] * TempB[1];
620 C[2+8*n+i] += TempA[7] * TempB[2];
621 C[3+8*n+i] += TempA[7] * TempB[3];
622 C[4+8*n+i] += TempA[7] * TempB[4];
623 C[5+8*n+i] += TempA[7] * TempB[5];
624 C[6+8*n+i] += TempA[7] * TempB[6];
625 C[7+8*n+i] += TempA[7] * TempB[7];
633 //-------------------------------------------------------------version2.2, optimize k. from 4 instead of 8 like v2.1, random failing on MI, unknown reason, MSI,350K, take off each inner loop for core 0 260k, both cores 134k
634 //-------------------------------------------------------------try false sharing for core 0, 136k.
636 static __thread int j, m, n;
637 static __thread data_t TempA[4];
638 static __thread data_t TempB[4];
642 for ( j = 1; j < lda; j+=2 )
644 for ( m = 0; m < 8; m++ )
646 TempA[0] = A[j*lda+0+4*m];
647 TempA[1] = A[j*lda+1+4*m];
648 TempA[2] = A[j*lda+2+4*m];
649 TempA[3] = A[j*lda+3+4*m];
651 for( n = 0; n < 8; n++)
654 TempB[0] = B[(0+4*m)*lda+0+4*n];
655 TempB[1] = B[(0+4*m)*lda+1+4*n];
656 TempB[2] = B[(0+4*m)*lda+2+4*n];
657 TempB[3] = B[(0+4*m)*lda+3+4*n];
660 C[0+4*n+j*lda] += TempA[0] * TempB[0];
661 C[1+4*n+j*lda] += TempA[0] * TempB[1];
662 C[2+4*n+j*lda] += TempA[0] * TempB[2];
663 C[3+4*n+j*lda] += TempA[0] * TempB[3];
669 TempB[0] = B[(1+4*m)*lda+0+4*n];
670 TempB[1] = B[(1+4*m)*lda+1+4*n];
671 TempB[2] = B[(1+4*m)*lda+2+4*n];
672 TempB[3] = B[(1+4*m)*lda+3+4*n];
675 C[0+4*n+j*lda] += TempA[1] * TempB[0];
676 C[1+4*n+j*lda] += TempA[1] * TempB[1];
677 C[2+4*n+j*lda] += TempA[1] * TempB[2];
678 C[3+4*n+j*lda] += TempA[1] * TempB[3];
682 TempB[0] = B[(2+4*m)*lda+0+4*n];
683 TempB[1] = B[(2+4*m)*lda+1+4*n];
684 TempB[2] = B[(2+4*m)*lda+2+4*n];
685 TempB[3] = B[(2+4*m)*lda+3+4*n];
688 C[0+4*n+j*lda] += TempA[2] * TempB[0];
689 C[1+4*n+j*lda] += TempA[2] * TempB[1];
690 C[2+4*n+j*lda] += TempA[2] * TempB[2];
691 C[3+4*n+j*lda] += TempA[2] * TempB[3];
696 TempB[0] = B[(3+4*m)*lda+0+4*n];
697 TempB[1] = B[(3+4*m)*lda+1+4*n];
698 TempB[2] = B[(3+4*m)*lda+2+4*n];
699 TempB[3] = B[(3+4*m)*lda+3+4*n];
702 C[0+4*n+j*lda] += TempA[3] * TempB[0];
703 C[1+4*n+j*lda] += TempA[3] * TempB[1];
704 C[2+4*n+j*lda] += TempA[3] * TempB[2];
705 C[3+4*n+j*lda] += TempA[3] * TempB[3];
714 for ( j = 0; j < lda; j+=2 )
716 for ( m = 0; m < 8; m++ )
718 TempA[0] = A[j*lda+0+4*m];
719 TempA[1] = A[j*lda+1+4*m];
720 TempA[2] = A[j*lda+2+4*m];
721 TempA[3] = A[j*lda+3+4*m];
723 for( n = 0; n < 8; n++)
732 TempB[0] = B[(1+4*m)*lda+0+4*n];
733 TempB[1] = B[(1+4*m)*lda+1+4*n];
734 TempB[2] = B[(1+4*m)*lda+2+4*n];
735 TempB[3] = B[(1+4*m)*lda+3+4*n];
738 C[0+4*n+j*lda] += TempA[1] * TempB[0];
739 C[1+4*n+j*lda] += TempA[1] * TempB[1];
740 C[2+4*n+j*lda] += TempA[1] * TempB[2];
741 C[3+4*n+j*lda] += TempA[1] * TempB[3];
745 TempB[0] = B[(2+4*m)*lda+0+4*n];
746 TempB[1] = B[(2+4*m)*lda+1+4*n];
747 TempB[2] = B[(2+4*m)*lda+2+4*n];
748 TempB[3] = B[(2+4*m)*lda+3+4*n];
751 C[0+4*n+j*lda] += TempA[2] * TempB[0];
752 C[1+4*n+j*lda] += TempA[2] * TempB[1];
753 C[2+4*n+j*lda] += TempA[2] * TempB[2];
754 C[3+4*n+j*lda] += TempA[2] * TempB[3];
759 TempB[0] = B[(3+4*m)*lda+0+4*n];
760 TempB[1] = B[(3+4*m)*lda+1+4*n];
761 TempB[2] = B[(3+4*m)*lda+2+4*n];
762 TempB[3] = B[(3+4*m)*lda+3+4*n];
765 C[0+4*n+j*lda] += TempA[3] * TempB[0];
766 C[1+4*n+j*lda] += TempA[3] * TempB[1];
767 C[2+4*n+j*lda] += TempA[3] * TempB[2];
768 C[3+4*n+j*lda] += TempA[3] * TempB[3];
770 TempB[0] = B[(0+4*m)*lda+0+4*n];
771 TempB[1] = B[(0+4*m)*lda+1+4*n];
772 TempB[2] = B[(0+4*m)*lda+2+4*n];
773 TempB[3] = B[(0+4*m)*lda+3+4*n];
776 C[0+4*n+j*lda] += TempA[0] * TempB[0];
777 C[1+4*n+j*lda] += TempA[0] * TempB[1];
778 C[2+4*n+j*lda] += TempA[0] * TempB[2];
779 C[3+4*n+j*lda] += TempA[0] * TempB[3];
790 //-------------------------------------------------------------version2.3, read 8 elements in B at one time. make k to 2. 150k mi 128k msi. worse than v2.0
792 static __thread int i, j, k, m, n;
793 static __thread data_t TempA[2];
794 static __thread data_t TempB[8];
798 for ( j = 0; j < lda; j+=2 )
800 for ( m = 0; m < 16; m++ )
802 TempA[0] = A[j*lda + 0 + 2*m];
803 TempA[1] = A[j*lda + 1 + 2*m];
804 for( n = 0; n < 4; n++)
807 TempB[0] = B[2*m*lda+0+8*n];
808 TempB[1] = B[2*m*lda+1+8*n];
809 TempB[2] = B[2*m*lda+2+8*n];
810 TempB[3] = B[2*m*lda+3+8*n];
811 TempB[4] = B[2*m*lda+4+8*n];
812 TempB[5] = B[2*m*lda+5+8*n];
813 TempB[6] = B[2*m*lda+6+8*n];
814 TempB[7] = B[2*m*lda+7+8*n];
816 C[0+8*n+j*lda] += TempA[0] * TempB[0];
817 C[1+8*n+j*lda] += TempA[0] * TempB[1];
818 C[2+8*n+j*lda] += TempA[0] * TempB[2];
819 C[3+8*n+j*lda] += TempA[0] * TempB[3];
820 C[4+8*n+j*lda] += TempA[0] * TempB[4];
821 C[5+8*n+j*lda] += TempA[0] * TempB[5];
822 C[6+8*n+j*lda] += TempA[0] * TempB[6];
823 C[7+8*n+j*lda] += TempA[0] * TempB[7];
825 TempB[0] = B[(1+2*m)*lda+0+8*n];
826 TempB[1] = B[(1+2*m)*lda+1+8*n];
827 TempB[2] = B[(1+2*m)*lda+2+8*n];
828 TempB[3] = B[(1+2*m)*lda+3+8*n];
829 TempB[4] = B[(1+2*m)*lda+4+8*n];
830 TempB[5] = B[(1+2*m)*lda+5+8*n];
831 TempB[6] = B[(1+2*m)*lda+6+8*n];
832 TempB[7] = B[(1+2*m)*lda+7+8*n];
834 C[0+8*n+j*lda] += TempA[1] * TempB[0];
835 C[1+8*n+j*lda] += TempA[1] * TempB[1];
836 C[2+8*n+j*lda] += TempA[1] * TempB[2];
837 C[3+8*n+j*lda] += TempA[1] * TempB[3];
838 C[4+8*n+j*lda] += TempA[1] * TempB[4];
839 C[5+8*n+j*lda] += TempA[1] * TempB[5];
840 C[6+8*n+j*lda] += TempA[1] * TempB[6];
841 C[7+8*n+j*lda] += TempA[1] * TempB[7];
851 for ( j = 1; j < lda; j+=2 )
853 for ( m = 0; m < 16; m++ )
855 TempA[0] = A[j*lda + 0 + 2*m];
856 TempA[1] = A[j*lda + 1 + 2*m];
857 for( n = 0; n < 4; n++)
860 TempB[0] = B[2*m*lda+0+8*n];
861 TempB[1] = B[2*m*lda+1+8*n];
862 TempB[2] = B[2*m*lda+2+8*n];
863 TempB[3] = B[2*m*lda+3+8*n];
864 TempB[4] = B[2*m*lda+4+8*n];
865 TempB[5] = B[2*m*lda+5+8*n];
866 TempB[6] = B[2*m*lda+6+8*n];
867 TempB[7] = B[2*m*lda+7+8*n];
869 C[0+8*n+j*lda] += TempA[0] * TempB[0];
870 C[1+8*n+j*lda] += TempA[0] * TempB[1];
871 C[2+8*n+j*lda] += TempA[0] * TempB[2];
872 C[3+8*n+j*lda] += TempA[0] * TempB[3];
873 C[4+8*n+j*lda] += TempA[0] * TempB[4];
874 C[5+8*n+j*lda] += TempA[0] * TempB[5];
875 C[6+8*n+j*lda] += TempA[0] * TempB[6];
876 C[7+8*n+j*lda] += TempA[0] * TempB[7];
878 TempB[0] = B[(1+2*m)*lda+0+8*n];
879 TempB[1] = B[(1+2*m)*lda+1+8*n];
880 TempB[2] = B[(1+2*m)*lda+2+8*n];
881 TempB[3] = B[(1+2*m)*lda+3+8*n];
882 TempB[4] = B[(1+2*m)*lda+4+8*n];
883 TempB[5] = B[(1+2*m)*lda+5+8*n];
884 TempB[6] = B[(1+2*m)*lda+6+8*n];
885 TempB[7] = B[(1+2*m)*lda+7+8*n];
887 C[0+8*n+j*lda] += TempA[1] * TempB[0];
888 C[1+8*n+j*lda] += TempA[1] * TempB[1];
889 C[2+8*n+j*lda] += TempA[1] * TempB[2];
890 C[3+8*n+j*lda] += TempA[1] * TempB[3];
891 C[4+8*n+j*lda] += TempA[1] * TempB[4];
892 C[5+8*n+j*lda] += TempA[1] * TempB[5];
893 C[6+8*n+j*lda] += TempA[1] * TempB[6];
894 C[7+8*n+j*lda] += TempA[1] * TempB[7];
902 //-------------------------------------------------------------version2.4, read 4 170k and 16 140k, error because not enough space elements in B at one time.
904 static __thread int i, j, k, m, n;
905 static __thread data_t TempA;
906 static __thread data_t TempB[16];
910 for ( j = 0; j < lda; j+=2 )
912 for ( k = 0; k < lda; k++ )
914 TempA = A[j*lda + k];
915 for( n = 0; n < 2; n++)
918 TempB[0] = B[k*lda+0+16*n];
919 TempB[1] = B[k*lda+1+16*n];
920 TempB[2] = B[k*lda+2+16*n];
921 TempB[3] = B[k*lda+3+16*n];
922 TempB[4] = B[k*lda+4+16*n];
923 TempB[5] = B[k*lda+5+16*n];
924 TempB[6] = B[k*lda+6+16*n];
925 TempB[7] = B[k*lda+7+16*n];
926 TempB[8] = B[k*lda+8+16*n];
927 TempB[9] = B[k*lda+9+16*n];
928 TempB[10] = B[k*lda+10+16*n];
929 TempB[11] = B[k*lda+11+16*n];
930 TempB[12] = B[k*lda+12+16*n];
931 TempB[13] = B[k*lda+13+16*n];
932 TempB[14] = B[k*lda+14+16*n];
933 TempB[15] = B[k*lda+15+16*n];
936 C[0+16*n+j*lda] += TempA * TempB[0];
937 C[1+16*n+j*lda] += TempA * TempB[1];
938 C[2+16*n+j*lda] += TempA * TempB[2];
939 C[3+16*n+j*lda] += TempA * TempB[3];
940 C[4+16*n+j*lda] += TempA * TempB[4];
941 C[5+16*n+j*lda] += TempA * TempB[5];
942 C[6+16*n+j*lda] += TempA * TempB[6];
943 C[7+16*n+j*lda] += TempA * TempB[7];
944 C[8+16*n+j*lda] += TempA * TempB[8];
945 C[9+16*n+j*lda] += TempA * TempB[9];
946 C[10+16*n+j*lda] += TempA * TempB[10];
947 C[11+16*n+j*lda] += TempA * TempB[11];
948 C[12+16*n+j*lda] += TempA * TempB[12];
949 C[13+16*n+j*lda] += TempA * TempB[13];
950 C[14+16*n+j*lda] += TempA * TempB[14];
951 C[15+16*n+j*lda] += TempA * TempB[15];
962 for ( j = 1; j < lda; j+=2 )
964 for ( k = 0; k < lda; k++ )
966 TempA = A[j*lda + k];
967 for( n = 0; n < 2; n++)
970 TempB[0] = B[k*lda+0+16*n];
971 TempB[1] = B[k*lda+1+16*n];
972 TempB[2] = B[k*lda+2+16*n];
973 TempB[3] = B[k*lda+3+16*n];
974 TempB[4] = B[k*lda+4+16*n];
975 TempB[5] = B[k*lda+5+16*n];
976 TempB[6] = B[k*lda+6+16*n];
977 TempB[7] = B[k*lda+7+16*n];
978 TempB[8] = B[k*lda+8+16*n];
979 TempB[9] = B[k*lda+9+16*n];
980 TempB[10] = B[k*lda+10+16*n];
981 TempB[11] = B[k*lda+11+16*n];
982 TempB[12] = B[k*lda+12+16*n];
983 TempB[13] = B[k*lda+13+16*n];
984 TempB[14] = B[k*lda+14+16*n];
985 TempB[15] = B[k*lda+15+16*n];
988 C[0+16*n+j*lda] += TempA * TempB[0];
989 C[1+16*n+j*lda] += TempA * TempB[1];
990 C[2+16*n+j*lda] += TempA * TempB[2];
991 C[3+16*n+j*lda] += TempA * TempB[3];
992 C[4+16*n+j*lda] += TempA * TempB[4];
993 C[5+16*n+j*lda] += TempA * TempB[5];
994 C[6+16*n+j*lda] += TempA * TempB[6];
995 C[7+16*n+j*lda] += TempA * TempB[7];
996 C[8+16*n+j*lda] += TempA * TempB[8];
997 C[9+16*n+j*lda] += TempA * TempB[9];
998 C[10+16*n+j*lda] += TempA * TempB[10];
999 C[11+16*n+j*lda] += TempA * TempB[11];
1000 C[12+16*n+j*lda] += TempA * TempB[12];
1001 C[13+16*n+j*lda] += TempA * TempB[13];
1002 C[14+16*n+j*lda] += TempA * TempB[14];
1003 C[15+16*n+j*lda] += TempA * TempB[15];
1014 //-------------------------------------------------------------version2.5, read 10 elements in B at one time. has corner cases. Turns out it hangs.
1016 static __thread int j, k, n;
1017 static __thread data_t TempA;
1018 static __thread data_t TempB[10];
1022 for ( j = 0; j < lda; j+=2 )
1024 for ( k = 0; k < lda; k++ )
1026 TempA = A[j*lda + k];
1027 for( n = 0; n < 3; n++)
1029 TempB[0] = B[k*lda+0+10*n];
1030 TempB[1] = B[k*lda+1+10*n];
1031 TempB[2] = B[k*lda+2+10*n];
1032 TempB[3] = B[k*lda+3+10*n];
1033 TempB[4] = B[k*lda+4+10*n];
1034 TempB[5] = B[k*lda+5+10*n];
1035 TempB[6] = B[k*lda+6+10*n];
1036 TempB[7] = B[k*lda+7+10*n];
1037 TempB[8] = B[k*lda+8+10*n];
1038 TempB[9] = B[k*lda+9+10*n];
1040 C[0+10*n+j*lda] += TempA * TempB[0];
1041 C[1+10*n+j*lda] += TempA * TempB[1];
1042 C[2+10*n+j*lda] += TempA * TempB[2];
1043 C[3+10*n+j*lda] += TempA * TempB[3];
1044 C[4+10*n+j*lda] += TempA * TempB[4];
1045 C[5+10*n+j*lda] += TempA * TempB[5];
1046 C[6+10*n+j*lda] += TempA * TempB[6];
1047 C[7+10*n+j*lda] += TempA * TempB[7];
1048 C[8+10*n+j*lda] += TempA * TempB[8];
1049 C[9+10*n+j*lda] += TempA * TempB[9];
1051 TempB[0] = B[k*lda+30];
1052 TempB[1] = B[k*lda+31];
1053 C[30+j*lda] += TempA * TempB[0];
1054 C[31+j*lda] += TempA * TempB[1];
1060 for ( j = 1; j < lda; j+=2 )
1062 for ( k = 0; k < lda; k++ )
1064 TempA = A[j*lda + k];
1065 for( n = 0; n < 3; n++)
1067 TempB[0] = B[k*lda+0+10*n];
1068 TempB[1] = B[k*lda+1+10*n];
1069 TempB[2] = B[k*lda+2+10*n];
1070 TempB[3] = B[k*lda+3+10*n];
1071 TempB[4] = B[k*lda+4+10*n];
1072 TempB[5] = B[k*lda+5+10*n];
1073 TempB[6] = B[k*lda+6+10*n];
1074 TempB[7] = B[k*lda+7+10*n];
1075 TempB[8] = B[k*lda+8+10*n];
1076 TempB[9] = B[k*lda+9+10*n];
1078 C[0+10*n+j*lda] += TempA * TempB[0];
1079 C[1+10*n+j*lda] += TempA * TempB[1];
1080 C[2+10*n+j*lda] += TempA * TempB[2];
1081 C[3+10*n+j*lda] += TempA * TempB[3];
1082 C[4+10*n+j*lda] += TempA * TempB[4];
1083 C[5+10*n+j*lda] += TempA * TempB[5];
1084 C[6+10*n+j*lda] += TempA * TempB[6];
1085 C[7+10*n+j*lda] += TempA * TempB[7];
1086 C[8+10*n+j*lda] += TempA * TempB[8];
1087 C[9+10*n+j*lda] += TempA * TempB[9];
1089 TempB[0] = B[k*lda+30];
1090 TempB[1] = B[k*lda+31];
1091 C[30+j*lda] += TempA * TempB[0];
1092 C[31+j*lda] += TempA * TempB[1];
1099 //-------------------------------------------------------------version2.6, optimize 2.0. take off n loop and tried different order of reading B
1101 static __thread int j, k, n;
1102 static __thread data_t TempA;
1103 static __thread data_t TempB[8];
1107 for ( j = 0; j < lda; j+=2 )
1109 for ( k = 0; k < lda; k++ )
1111 TempA = A[j*lda + k];
1113 TempB[0] = B[k*lda+0];
1114 TempB[1] = B[k*lda+1];
1115 TempB[2] = B[k*lda+2];
1116 TempB[3] = B[k*lda+3];
1117 TempB[4] = B[k*lda+4];
1118 TempB[5] = B[k*lda+5];
1119 TempB[6] = B[k*lda+6];
1120 TempB[7] = B[k*lda+7];
1122 C[0+j*lda] += TempA * TempB[0];
1123 C[1+j*lda] += TempA * TempB[1];
1124 C[2+j*lda] += TempA * TempB[2];
1125 C[3+j*lda] += TempA * TempB[3];
1126 C[4+j*lda] += TempA * TempB[4];
1127 C[5+j*lda] += TempA * TempB[5];
1128 C[6+j*lda] += TempA * TempB[6];
1129 C[7+j*lda] += TempA * TempB[7];
1131 TempB[0] = B[k*lda+8];
1132 TempB[1] = B[k*lda+9];
1133 TempB[2] = B[k*lda+10];
1134 TempB[3] = B[k*lda+11];
1135 TempB[4] = B[k*lda+12];
1136 TempB[5] = B[k*lda+13];
1137 TempB[6] = B[k*lda+14];
1138 TempB[7] = B[k*lda+15];
1140 C[8+j*lda] += TempA * TempB[0];
1141 C[9+j*lda] += TempA * TempB[1];
1142 C[10+j*lda] += TempA * TempB[2];
1143 C[11+j*lda] += TempA * TempB[3];
1144 C[12+j*lda] += TempA * TempB[4];
1145 C[13+j*lda] += TempA * TempB[5];
1146 C[14+j*lda] += TempA * TempB[6];
1147 C[15+j*lda] += TempA * TempB[7];
1149 TempB[0] = B[k*lda+16];
1150 TempB[1] = B[k*lda+17];
1151 TempB[2] = B[k*lda+18];
1152 TempB[3] = B[k*lda+19];
1153 TempB[4] = B[k*lda+20];
1154 TempB[5] = B[k*lda+21];
1155 TempB[6] = B[k*lda+22];
1156 TempB[7] = B[k*lda+23];
1158 C[16+j*lda] += TempA * TempB[0];
1159 C[17+j*lda] += TempA * TempB[1];
1160 C[18+j*lda] += TempA * TempB[2];
1161 C[19+j*lda] += TempA * TempB[3];
1162 C[20+j*lda] += TempA * TempB[4];
1163 C[21+j*lda] += TempA * TempB[5];
1164 C[22+j*lda] += TempA * TempB[6];
1165 C[23+j*lda] += TempA * TempB[7];
1167 TempB[0] = B[k*lda+24];
1168 TempB[1] = B[k*lda+25];
1169 TempB[2] = B[k*lda+26];
1170 TempB[3] = B[k*lda+27];
1171 TempB[4] = B[k*lda+28];
1172 TempB[5] = B[k*lda+29];
1173 TempB[6] = B[k*lda+30];
1174 TempB[7] = B[k*lda+31];
1176 C[24+j*lda] += TempA * TempB[0];
1177 C[25+j*lda] += TempA * TempB[1];
1178 C[26+j*lda] += TempA * TempB[2];
1179 C[27+j*lda] += TempA * TempB[3];
1180 C[28+j*lda] += TempA * TempB[4];
1181 C[29+j*lda] += TempA * TempB[5];
1182 C[30+j*lda] += TempA * TempB[6];
1183 C[31+j*lda] += TempA * TempB[7];
1193 for ( j = 1; j < lda; j+=2 )
1195 for ( k = 0; k < lda; k++ )
1197 TempA = A[j*lda + k];
1200 TempB[0] = B[k*lda+24];
1201 TempB[1] = B[k*lda+25];
1202 TempB[2] = B[k*lda+26];
1203 TempB[3] = B[k*lda+27];
1204 TempB[4] = B[k*lda+28];
1205 TempB[5] = B[k*lda+29];
1206 TempB[6] = B[k*lda+30];
1207 TempB[7] = B[k*lda+31];
1209 C[24+j*lda] += TempA * TempB[0];
1210 C[25+j*lda] += TempA * TempB[1];
1211 C[26+j*lda] += TempA * TempB[2];
1212 C[27+j*lda] += TempA * TempB[3];
1213 C[28+j*lda] += TempA * TempB[4];
1214 C[29+j*lda] += TempA * TempB[5];
1215 C[30+j*lda] += TempA * TempB[6];
1216 C[31+j*lda] += TempA * TempB[7];
1218 TempB[0] = B[k*lda+0];
1219 TempB[1] = B[k*lda+1];
1220 TempB[2] = B[k*lda+2];
1221 TempB[3] = B[k*lda+3];
1222 TempB[4] = B[k*lda+4];
1223 TempB[5] = B[k*lda+5];
1224 TempB[6] = B[k*lda+6];
1225 TempB[7] = B[k*lda+7];
1227 C[0+j*lda] += TempA * TempB[0];
1228 C[1+j*lda] += TempA * TempB[1];
1229 C[2+j*lda] += TempA * TempB[2];
1230 C[3+j*lda] += TempA * TempB[3];
1231 C[4+j*lda] += TempA * TempB[4];
1232 C[5+j*lda] += TempA * TempB[5];
1233 C[6+j*lda] += TempA * TempB[6];
1234 C[7+j*lda] += TempA * TempB[7];
1236 TempB[0] = B[k*lda+8];
1237 TempB[1] = B[k*lda+9];
1238 TempB[2] = B[k*lda+10];
1239 TempB[3] = B[k*lda+11];
1240 TempB[4] = B[k*lda+12];
1241 TempB[5] = B[k*lda+13];
1242 TempB[6] = B[k*lda+14];
1243 TempB[7] = B[k*lda+15];
1245 C[8+j*lda] += TempA * TempB[0];
1246 C[9+j*lda] += TempA * TempB[1];
1247 C[10+j*lda] += TempA * TempB[2];
1248 C[11+j*lda] += TempA * TempB[3];
1249 C[12+j*lda] += TempA * TempB[4];
1250 C[13+j*lda] += TempA * TempB[5];
1251 C[14+j*lda] += TempA * TempB[6];
1252 C[15+j*lda] += TempA * TempB[7];
1254 TempB[0] = B[k*lda+16];
1255 TempB[1] = B[k*lda+17];
1256 TempB[2] = B[k*lda+18];
1257 TempB[3] = B[k*lda+19];
1258 TempB[4] = B[k*lda+20];
1259 TempB[5] = B[k*lda+21];
1260 TempB[6] = B[k*lda+22];
1261 TempB[7] = B[k*lda+23];
1263 C[16+j*lda] += TempA * TempB[0];
1264 C[17+j*lda] += TempA * TempB[1];
1265 C[18+j*lda] += TempA * TempB[2];
1266 C[19+j*lda] += TempA * TempB[3];
1267 C[20+j*lda] += TempA * TempB[4];
1268 C[21+j*lda] += TempA * TempB[5];
1269 C[22+j*lda] += TempA * TempB[6];
1270 C[23+j*lda] += TempA * TempB[7];
1281 //-------------------------------------------------------------version 2.7: use m=j*lda, i=k*lda, out of stack; only i, MI 150k; only m, MSI 117.9k, slower than v2.0
1283 static __thread int i, j, k, m, n;
1284 static __thread data_t TempA;
1285 static __thread data_t TempB[8];
1289 for ( j = 0; j < lda; j+=2 )
1292 for ( k = 0; k < lda; k++ )
1295 for( n = 0; n < 4; n++)
1298 TempB[0] = B[k *lda+0+8*n];
1299 TempB[1] = B[k *lda+1+8*n];
1300 TempB[2] = B[k *lda+2+8*n];
1301 TempB[3] = B[k *lda+3+8*n];
1302 TempB[4] = B[k *lda+4+8*n];
1303 TempB[5] = B[k *lda+5+8*n];
1304 TempB[6] = B[k *lda+6+8*n];
1305 TempB[7] = B[k *lda+7+8*n];
1307 C[0+8*n+m] += TempA * TempB[0];
1308 C[1+8*n+m] += TempA * TempB[1];
1309 C[2+8*n+m] += TempA * TempB[2];
1310 C[3+8*n+m] += TempA * TempB[3];
1311 C[4+8*n+m] += TempA * TempB[4];
1312 C[5+8*n+m] += TempA * TempB[5];
1313 C[6+8*n+m] += TempA * TempB[6];
1314 C[7+8*n+m] += TempA * TempB[7];
1323 for ( j = 1; j < lda; j+=2 )
1326 for ( k = 0; k < lda; k++ )
1329 for( n = 0; n < 4; n++)
1332 TempB[0] = B[k *lda+0+8*n];
1333 TempB[1] = B[k *lda+1+8*n];
1334 TempB[2] = B[k *lda+2+8*n];
1335 TempB[3] = B[k *lda+3+8*n];
1336 TempB[4] = B[k *lda+4+8*n];
1337 TempB[5] = B[k *lda+5+8*n];
1338 TempB[6] = B[k *lda+6+8*n];
1339 TempB[7] = B[k *lda+7+8*n];
1341 C[0+8*n+m] += TempA * TempB[0];
1342 C[1+8*n+m] += TempA * TempB[1];
1343 C[2+8*n+m] += TempA * TempB[2];
1344 C[3+8*n+m] += TempA * TempB[3];
1345 C[4+8*n+m] += TempA * TempB[4];
1346 C[5+8*n+m] += TempA * TempB[5];
1347 C[6+8*n+m] += TempA * TempB[6];
1348 C[7+8*n+m] += TempA * TempB[7];
1356 //-------------------------------------------------------------version2.8 deal with false sharing, MSI,118K vs v2.0 117.0K. MI 147.629K.
1358 static __thread int i, j, k, m, n;
1359 static __thread data_t TempA;
1360 static __thread data_t TempB[8];
1364 for ( j = 0; j < lda; j+=2 )
1366 for ( k = 0; k < lda; k++ )
1368 TempA = A[j*lda + k];
1369 for( n = 0; n < 2; n++)
1372 TempB[0] = B[k*lda+0+16*n];
1373 TempB[1] = B[k*lda+1+16*n];
1374 TempB[2] = B[k*lda+2+16*n];
1375 TempB[3] = B[k*lda+3+16*n];
1376 TempB[4] = B[k*lda+4+16*n];
1377 TempB[5] = B[k*lda+5+16*n];
1378 TempB[6] = B[k*lda+6+16*n];
1379 TempB[7] = B[k*lda+7+16*n];
1383 C[0+16*n+j*lda] += TempA * TempB[0];
1384 C[1+16*n+j*lda] += TempA * TempB[1];
1385 C[2+16*n+j*lda] += TempA * TempB[2];
1386 C[3+16*n+j*lda] += TempA * TempB[3];
1387 C[4+16*n+j*lda] += TempA * TempB[4];
1388 C[5+16*n+j*lda] += TempA * TempB[5];
1389 C[6+16*n+j*lda] += TempA * TempB[6];
1390 C[7+16*n+j*lda] += TempA * TempB[7];
1392 TempB[0] = B[k*lda+8+16*n];
1393 TempB[1] = B[k*lda+9+16*n];
1394 TempB[2] = B[k*lda+10+16*n];
1395 TempB[3] = B[k*lda+11+16*n];
1396 TempB[4] = B[k*lda+12+16*n];
1397 TempB[5] = B[k*lda+13+16*n];
1398 TempB[6] = B[k*lda+14+16*n];
1399 TempB[7] = B[k*lda+15+16*n];
1401 C[8+16*n+j*lda] += TempA * TempB[0];
1402 C[9+16*n+j*lda] += TempA * TempB[1];
1403 C[10+16*n+j*lda] += TempA * TempB[2];
1404 C[11+16*n+j*lda] += TempA * TempB[3];
1405 C[12+16*n+j*lda] += TempA * TempB[4];
1406 C[13+16*n+j*lda] += TempA * TempB[5];
1407 C[14+16*n+j*lda] += TempA * TempB[6];
1408 C[15+16*n+j*lda] += TempA * TempB[7];
1419 for ( j = 1; j < lda; j+=2 )
1421 for ( k = 0; k < lda; k++ )
1423 TempA = A[j*lda + k];
1424 for( n = 0; n < 2; n++)
1429 TempB[0] = B[k*lda+8+16*n];
1430 TempB[1] = B[k*lda+9+16*n];
1431 TempB[2] = B[k*lda+10+16*n];
1432 TempB[3] = B[k*lda+11+16*n];
1433 TempB[4] = B[k*lda+12+16*n];
1434 TempB[5] = B[k*lda+13+16*n];
1435 TempB[6] = B[k*lda+14+16*n];
1436 TempB[7] = B[k*lda+15+16*n];
1438 C[8+16*n+j*lda] += TempA * TempB[0];
1439 C[9+16*n+j*lda] += TempA * TempB[1];
1440 C[10+16*n+j*lda] += TempA * TempB[2];
1441 C[11+16*n+j*lda] += TempA * TempB[3];
1442 C[12+16*n+j*lda] += TempA * TempB[4];
1443 C[13+16*n+j*lda] += TempA * TempB[5];
1444 C[14+16*n+j*lda] += TempA * TempB[6];
1445 C[15+16*n+j*lda] += TempA * TempB[7];
1447 TempB[0] = B[k*lda+0+16*n];
1448 TempB[1] = B[k*lda+1+16*n];
1449 TempB[2] = B[k*lda+2+16*n];
1450 TempB[3] = B[k*lda+3+16*n];
1451 TempB[4] = B[k*lda+4+16*n];
1452 TempB[5] = B[k*lda+5+16*n];
1453 TempB[6] = B[k*lda+6+16*n];
1454 TempB[7] = B[k*lda+7+16*n];
1458 C[0+16*n+j*lda] += TempA * TempB[0];
1459 C[1+16*n+j*lda] += TempA * TempB[1];
1460 C[2+16*n+j*lda] += TempA * TempB[2];
1461 C[3+16*n+j*lda] += TempA * TempB[3];
1462 C[4+16*n+j*lda] += TempA * TempB[4];
1463 C[5+16*n+j*lda] += TempA * TempB[5];
1464 C[6+16*n+j*lda] += TempA * TempB[6];
1465 C[7+16*n+j*lda] += TempA * TempB[7];
1475 //----------------------------------------------------------------version 2.11: optimize j, use core 1 for j from 0 to 15, i = j*lda; MSI 98k
1476 //----------------------------------------------------------------version 2.12: does not use i = j*lda; MSI 95k
1477 static __thread data_t TempA
[8];
1478 static __thread data_t TempB
[8];
1479 static __thread
int j
,m
,n
,i
,k
;
1483 for ( j
= 16; j
< 32; j
++ )
1486 for ( m
= 0; m
< 4; m
++ )
1489 TempA
[0] = A
[j
*lda
+0+8*m
];
1490 TempA
[1] = A
[j
*lda
+1+8*m
];
1491 TempA
[2] = A
[j
*lda
+2+8*m
];
1492 TempA
[3] = A
[j
*lda
+3+8*m
];
1493 TempA
[4] = A
[j
*lda
+4+8*m
];
1494 TempA
[5] = A
[j
*lda
+5+8*m
];
1495 TempA
[6] = A
[j
*lda
+6+8*m
];
1496 TempA
[7] = A
[j
*lda
+7+8*m
];
1498 for( n
= 0; n
< 4; n
++)
1503 TempB[0] = B[(0+8*m)*lda+0+8*n];
1504 TempB[1] = B[(0+8*m)*lda+1+8*n];
1505 TempB[2] = B[(0+8*m)*lda+2+8*n];
1506 TempB[3] = B[(0+8*m)*lda+3+8*n];
1507 TempB[4] = B[(0+8*m)*lda+4+8*n];
1508 TempB[5] = B[(0+8*m)*lda+5+8*n];
1509 TempB[6] = B[(0+8*m)*lda+6+8*n];
1510 TempB[7] = B[(0+8*m)*lda+7+8*n];
1512 C[0+8*n+i] += TempA[0] * TempB[0];
1513 C[1+8*n+i] += TempA[0] * TempB[1];
1514 C[2+8*n+i] += TempA[0] * TempB[2];
1515 C[3+8*n+i] += TempA[0] * TempB[3];
1516 C[4+8*n+i] += TempA[0] * TempB[4];
1517 C[5+8*n+i] += TempA[0] * TempB[5];
1518 C[6+8*n+i] += TempA[0] * TempB[6];
1519 C[7+8*n+i] += TempA[0] * TempB[7];
1523 TempB[0] = B[(1+8*m)*lda+0+8*n];
1524 TempB[1] = B[(1+8*m)*lda+1+8*n];
1525 TempB[2] = B[(1+8*m)*lda+2+8*n];
1526 TempB[3] = B[(1+8*m)*lda+3+8*n];
1527 TempB[4] = B[(1+8*m)*lda+4+8*n];
1528 TempB[5] = B[(1+8*m)*lda+5+8*n];
1529 TempB[6] = B[(1+8*m)*lda+6+8*n];
1530 TempB[7] = B[(1+8*m)*lda+7+8*n];
1532 C[0+8*n+i] += TempA[1] * TempB[0];
1533 C[1+8*n+i] += TempA[1] * TempB[1];
1534 C[2+8*n+i] += TempA[1] * TempB[2];
1535 C[3+8*n+i] += TempA[1] * TempB[3];
1536 C[4+8*n+i] += TempA[1] * TempB[4];
1537 C[5+8*n+i] += TempA[1] * TempB[5];
1538 C[6+8*n+i] += TempA[1] * TempB[6];
1539 C[7+8*n+i] += TempA[1] * TempB[7];
1543 TempB[0] = B[(2+8*m)*lda+0+8*n];
1544 TempB[1] = B[(2+8*m)*lda+1+8*n];
1545 TempB[2] = B[(2+8*m)*lda+2+8*n];
1546 TempB[3] = B[(2+8*m)*lda+3+8*n];
1547 TempB[4] = B[(2+8*m)*lda+4+8*n];
1548 TempB[5] = B[(2+8*m)*lda+5+8*n];
1549 TempB[6] = B[(2+8*m)*lda+6+8*n];
1550 TempB[7] = B[(2+8*m)*lda+7+8*n];
1552 C[0+8*n+i] += TempA[2] * TempB[0];
1553 C[1+8*n+i] += TempA[2] * TempB[1];
1554 C[2+8*n+i] += TempA[2] * TempB[2];
1555 C[3+8*n+i] += TempA[2] * TempB[3];
1556 C[4+8*n+i] += TempA[2] * TempB[4];
1557 C[5+8*n+i] += TempA[2] * TempB[5];
1558 C[6+8*n+i] += TempA[2] * TempB[6];
1559 C[7+8*n+i] += TempA[2] * TempB[7];
1563 TempB[0] = B[(3+8*m)*lda+0+8*n];
1564 TempB[1] = B[(3+8*m)*lda+1+8*n];
1565 TempB[2] = B[(3+8*m)*lda+2+8*n];
1566 TempB[3] = B[(3+8*m)*lda+3+8*n];
1567 TempB[4] = B[(3+8*m)*lda+4+8*n];
1568 TempB[5] = B[(3+8*m)*lda+5+8*n];
1569 TempB[6] = B[(3+8*m)*lda+6+8*n];
1570 TempB[7] = B[(3+8*m)*lda+7+8*n];
1572 C[0+8*n+i] += TempA[3] * TempB[0];
1573 C[1+8*n+i] += TempA[3] * TempB[1];
1574 C[2+8*n+i] += TempA[3] * TempB[2];
1575 C[3+8*n+i] += TempA[3] * TempB[3];
1576 C[4+8*n+i] += TempA[3] * TempB[4];
1577 C[5+8*n+i] += TempA[3] * TempB[5];
1578 C[6+8*n+i] += TempA[3] * TempB[6];
1579 C[7+8*n+i] += TempA[3] * TempB[7];
1582 TempB[0] = B[(4+8*m)*lda+0+8*n];
1583 TempB[1] = B[(4+8*m)*lda+1+8*n];
1584 TempB[2] = B[(4+8*m)*lda+2+8*n];
1585 TempB[3] = B[(4+8*m)*lda+3+8*n];
1586 TempB[4] = B[(4+8*m)*lda+4+8*n];
1587 TempB[5] = B[(4+8*m)*lda+5+8*n];
1588 TempB[6] = B[(4+8*m)*lda+6+8*n];
1589 TempB[7] = B[(4+8*m)*lda+7+8*n];
1591 C[0+8*n+i] += TempA[4] * TempB[0];
1592 C[1+8*n+i] += TempA[4] * TempB[1];
1593 C[2+8*n+i] += TempA[4] * TempB[2];
1594 C[3+8*n+i] += TempA[4] * TempB[3];
1595 C[4+8*n+i] += TempA[4] * TempB[4];
1596 C[5+8*n+i] += TempA[4] * TempB[5];
1597 C[6+8*n+i] += TempA[4] * TempB[6];
1598 C[7+8*n+i] += TempA[4] * TempB[7];
1602 TempB[0] = B[(5+8*m)*lda+0+8*n];
1603 TempB[1] = B[(5+8*m)*lda+1+8*n];
1604 TempB[2] = B[(5+8*m)*lda+2+8*n];
1605 TempB[3] = B[(5+8*m)*lda+3+8*n];
1606 TempB[4] = B[(5+8*m)*lda+4+8*n];
1607 TempB[5] = B[(5+8*m)*lda+5+8*n];
1608 TempB[6] = B[(5+8*m)*lda+6+8*n];
1609 TempB[7] = B[(5+8*m)*lda+7+8*n];
1611 C[0+8*n+i] += TempA[5] * TempB[0];
1612 C[1+8*n+i] += TempA[5] * TempB[1];
1613 C[2+8*n+i] += TempA[5] * TempB[2];
1614 C[3+8*n+i] += TempA[5] * TempB[3];
1615 C[4+8*n+i] += TempA[5] * TempB[4];
1616 C[5+8*n+i] += TempA[5] * TempB[5];
1617 C[6+8*n+i] += TempA[5] * TempB[6];
1618 C[7+8*n+i] += TempA[5] * TempB[7];
1622 TempB[0] = B[(6+8*m)*lda+0+8*n];
1623 TempB[1] = B[(6+8*m)*lda+1+8*n];
1624 TempB[2] = B[(6+8*m)*lda+2+8*n];
1625 TempB[3] = B[(6+8*m)*lda+3+8*n];
1626 TempB[4] = B[(6+8*m)*lda+4+8*n];
1627 TempB[5] = B[(6+8*m)*lda+5+8*n];
1628 TempB[6] = B[(6+8*m)*lda+6+8*n];
1629 TempB[7] = B[(6+8*m)*lda+7+8*n];
1631 C[0+8*n+i] += TempA[6] * TempB[0];
1632 C[1+8*n+i] += TempA[6] * TempB[1];
1633 C[2+8*n+i] += TempA[6] * TempB[2];
1634 C[3+8*n+i] += TempA[6] * TempB[3];
1635 C[4+8*n+i] += TempA[6] * TempB[4];
1636 C[5+8*n+i] += TempA[6] * TempB[5];
1637 C[6+8*n+i] += TempA[6] * TempB[6];
1638 C[7+8*n+i] += TempA[6] * TempB[7];
1641 TempB[0] = B[(7+8*m)*lda+0+8*n];
1642 TempB[1] = B[(7+8*m)*lda+1+8*n];
1643 TempB[2] = B[(7+8*m)*lda+2+8*n];
1644 TempB[3] = B[(7+8*m)*lda+3+8*n];
1645 TempB[4] = B[(7+8*m)*lda+4+8*n];
1646 TempB[5] = B[(7+8*m)*lda+5+8*n];
1647 TempB[6] = B[(7+8*m)*lda+6+8*n];
1648 TempB[7] = B[(7+8*m)*lda+7+8*n];
1650 C[0+8*n+i] += TempA[7] * TempB[0];
1651 C[1+8*n+i] += TempA[7] * TempB[1];
1652 C[2+8*n+i] += TempA[7] * TempB[2];
1653 C[3+8*n+i] += TempA[7] * TempB[3];
1654 C[4+8*n+i] += TempA[7] * TempB[4];
1655 C[5+8*n+i] += TempA[7] * TempB[5];
1656 C[6+8*n+i] += TempA[7] * TempB[6];
1657 C[7+8*n+i] += TempA[7] * TempB[7];
1660 TempB
[0] = B
[(0+8*m
)*lda
+0+8*n
];
1661 TempB
[1] = B
[(0+8*m
)*lda
+1+8*n
];
1662 TempB
[2] = B
[(0+8*m
)*lda
+2+8*n
];
1663 TempB
[3] = B
[(0+8*m
)*lda
+3+8*n
];
1664 TempB
[4] = B
[(0+8*m
)*lda
+4+8*n
];
1665 TempB
[5] = B
[(0+8*m
)*lda
+5+8*n
];
1666 TempB
[6] = B
[(0+8*m
)*lda
+6+8*n
];
1667 TempB
[7] = B
[(0+8*m
)*lda
+7+8*n
];
1669 C
[0+8*n
+j
*lda
] += TempA
[0] * TempB
[0];
1670 C
[1+8*n
+j
*lda
] += TempA
[0] * TempB
[1];
1671 C
[2+8*n
+j
*lda
] += TempA
[0] * TempB
[2];
1672 C
[3+8*n
+j
*lda
] += TempA
[0] * TempB
[3];
1673 C
[4+8*n
+j
*lda
] += TempA
[0] * TempB
[4];
1674 C
[5+8*n
+j
*lda
] += TempA
[0] * TempB
[5];
1675 C
[6+8*n
+j
*lda
] += TempA
[0] * TempB
[6];
1676 C
[7+8*n
+j
*lda
] += TempA
[0] * TempB
[7];
1680 TempB
[0] = B
[(1+8*m
)*lda
+0+8*n
];
1681 TempB
[1] = B
[(1+8*m
)*lda
+1+8*n
];
1682 TempB
[2] = B
[(1+8*m
)*lda
+2+8*n
];
1683 TempB
[3] = B
[(1+8*m
)*lda
+3+8*n
];
1684 TempB
[4] = B
[(1+8*m
)*lda
+4+8*n
];
1685 TempB
[5] = B
[(1+8*m
)*lda
+5+8*n
];
1686 TempB
[6] = B
[(1+8*m
)*lda
+6+8*n
];
1687 TempB
[7] = B
[(1+8*m
)*lda
+7+8*n
];
1689 C
[0+8*n
+j
*lda
] += TempA
[1] * TempB
[0];
1690 C
[1+8*n
+j
*lda
] += TempA
[1] * TempB
[1];
1691 C
[2+8*n
+j
*lda
] += TempA
[1] * TempB
[2];
1692 C
[3+8*n
+j
*lda
] += TempA
[1] * TempB
[3];
1693 C
[4+8*n
+j
*lda
] += TempA
[1] * TempB
[4];
1694 C
[5+8*n
+j
*lda
] += TempA
[1] * TempB
[5];
1695 C
[6+8*n
+j
*lda
] += TempA
[1] * TempB
[6];
1696 C
[7+8*n
+j
*lda
] += TempA
[1] * TempB
[7];
1700 TempB
[0] = B
[(2+8*m
)*lda
+0+8*n
];
1701 TempB
[1] = B
[(2+8*m
)*lda
+1+8*n
];
1702 TempB
[2] = B
[(2+8*m
)*lda
+2+8*n
];
1703 TempB
[3] = B
[(2+8*m
)*lda
+3+8*n
];
1704 TempB
[4] = B
[(2+8*m
)*lda
+4+8*n
];
1705 TempB
[5] = B
[(2+8*m
)*lda
+5+8*n
];
1706 TempB
[6] = B
[(2+8*m
)*lda
+6+8*n
];
1707 TempB
[7] = B
[(2+8*m
)*lda
+7+8*n
];
1709 C
[0+8*n
+j
*lda
] += TempA
[2] * TempB
[0];
1710 C
[1+8*n
+j
*lda
] += TempA
[2] * TempB
[1];
1711 C
[2+8*n
+j
*lda
] += TempA
[2] * TempB
[2];
1712 C
[3+8*n
+j
*lda
] += TempA
[2] * TempB
[3];
1713 C
[4+8*n
+j
*lda
] += TempA
[2] * TempB
[4];
1714 C
[5+8*n
+j
*lda
] += TempA
[2] * TempB
[5];
1715 C
[6+8*n
+j
*lda
] += TempA
[2] * TempB
[6];
1716 C
[7+8*n
+j
*lda
] += TempA
[2] * TempB
[7];
1720 TempB
[0] = B
[(3+8*m
)*lda
+0+8*n
];
1721 TempB
[1] = B
[(3+8*m
)*lda
+1+8*n
];
1722 TempB
[2] = B
[(3+8*m
)*lda
+2+8*n
];
1723 TempB
[3] = B
[(3+8*m
)*lda
+3+8*n
];
1724 TempB
[4] = B
[(3+8*m
)*lda
+4+8*n
];
1725 TempB
[5] = B
[(3+8*m
)*lda
+5+8*n
];
1726 TempB
[6] = B
[(3+8*m
)*lda
+6+8*n
];
1727 TempB
[7] = B
[(3+8*m
)*lda
+7+8*n
];
1729 C
[0+8*n
+j
*lda
] += TempA
[3] * TempB
[0];
1730 C
[1+8*n
+j
*lda
] += TempA
[3] * TempB
[1];
1731 C
[2+8*n
+j
*lda
] += TempA
[3] * TempB
[2];
1732 C
[3+8*n
+j
*lda
] += TempA
[3] * TempB
[3];
1733 C
[4+8*n
+j
*lda
] += TempA
[3] * TempB
[4];
1734 C
[5+8*n
+j
*lda
] += TempA
[3] * TempB
[5];
1735 C
[6+8*n
+j
*lda
] += TempA
[3] * TempB
[6];
1736 C
[7+8*n
+j
*lda
] += TempA
[3] * TempB
[7];
1739 TempB
[0] = B
[(4+8*m
)*lda
+0+8*n
];
1740 TempB
[1] = B
[(4+8*m
)*lda
+1+8*n
];
1741 TempB
[2] = B
[(4+8*m
)*lda
+2+8*n
];
1742 TempB
[3] = B
[(4+8*m
)*lda
+3+8*n
];
1743 TempB
[4] = B
[(4+8*m
)*lda
+4+8*n
];
1744 TempB
[5] = B
[(4+8*m
)*lda
+5+8*n
];
1745 TempB
[6] = B
[(4+8*m
)*lda
+6+8*n
];
1746 TempB
[7] = B
[(4+8*m
)*lda
+7+8*n
];
1748 C
[0+8*n
+j
*lda
] += TempA
[4] * TempB
[0];
1749 C
[1+8*n
+j
*lda
] += TempA
[4] * TempB
[1];
1750 C
[2+8*n
+j
*lda
] += TempA
[4] * TempB
[2];
1751 C
[3+8*n
+j
*lda
] += TempA
[4] * TempB
[3];
1752 C
[4+8*n
+j
*lda
] += TempA
[4] * TempB
[4];
1753 C
[5+8*n
+j
*lda
] += TempA
[4] * TempB
[5];
1754 C
[6+8*n
+j
*lda
] += TempA
[4] * TempB
[6];
1755 C
[7+8*n
+j
*lda
] += TempA
[4] * TempB
[7];
1759 TempB
[0] = B
[(5+8*m
)*lda
+0+8*n
];
1760 TempB
[1] = B
[(5+8*m
)*lda
+1+8*n
];
1761 TempB
[2] = B
[(5+8*m
)*lda
+2+8*n
];
1762 TempB
[3] = B
[(5+8*m
)*lda
+3+8*n
];
1763 TempB
[4] = B
[(5+8*m
)*lda
+4+8*n
];
1764 TempB
[5] = B
[(5+8*m
)*lda
+5+8*n
];
1765 TempB
[6] = B
[(5+8*m
)*lda
+6+8*n
];
1766 TempB
[7] = B
[(5+8*m
)*lda
+7+8*n
];
1768 C
[0+8*n
+j
*lda
] += TempA
[5] * TempB
[0];
1769 C
[1+8*n
+j
*lda
] += TempA
[5] * TempB
[1];
1770 C
[2+8*n
+j
*lda
] += TempA
[5] * TempB
[2];
1771 C
[3+8*n
+j
*lda
] += TempA
[5] * TempB
[3];
1772 C
[4+8*n
+j
*lda
] += TempA
[5] * TempB
[4];
1773 C
[5+8*n
+j
*lda
] += TempA
[5] * TempB
[5];
1774 C
[6+8*n
+j
*lda
] += TempA
[5] * TempB
[6];
1775 C
[7+8*n
+j
*lda
] += TempA
[5] * TempB
[7];
1779 TempB
[0] = B
[(6+8*m
)*lda
+0+8*n
];
1780 TempB
[1] = B
[(6+8*m
)*lda
+1+8*n
];
1781 TempB
[2] = B
[(6+8*m
)*lda
+2+8*n
];
1782 TempB
[3] = B
[(6+8*m
)*lda
+3+8*n
];
1783 TempB
[4] = B
[(6+8*m
)*lda
+4+8*n
];
1784 TempB
[5] = B
[(6+8*m
)*lda
+5+8*n
];
1785 TempB
[6] = B
[(6+8*m
)*lda
+6+8*n
];
1786 TempB
[7] = B
[(6+8*m
)*lda
+7+8*n
];
1788 C
[0+8*n
+j
*lda
] += TempA
[6] * TempB
[0];
1789 C
[1+8*n
+j
*lda
] += TempA
[6] * TempB
[1];
1790 C
[2+8*n
+j
*lda
] += TempA
[6] * TempB
[2];
1791 C
[3+8*n
+j
*lda
] += TempA
[6] * TempB
[3];
1792 C
[4+8*n
+j
*lda
] += TempA
[6] * TempB
[4];
1793 C
[5+8*n
+j
*lda
] += TempA
[6] * TempB
[5];
1794 C
[6+8*n
+j
*lda
] += TempA
[6] * TempB
[6];
1795 C
[7+8*n
+j
*lda
] += TempA
[6] * TempB
[7];
1798 TempB
[0] = B
[(7+8*m
)*lda
+0+8*n
];
1799 TempB
[1] = B
[(7+8*m
)*lda
+1+8*n
];
1800 TempB
[2] = B
[(7+8*m
)*lda
+2+8*n
];
1801 TempB
[3] = B
[(7+8*m
)*lda
+3+8*n
];
1802 TempB
[4] = B
[(7+8*m
)*lda
+4+8*n
];
1803 TempB
[5] = B
[(7+8*m
)*lda
+5+8*n
];
1804 TempB
[6] = B
[(7+8*m
)*lda
+6+8*n
];
1805 TempB
[7] = B
[(7+8*m
)*lda
+7+8*n
];
1807 C
[0+8*n
+j
*lda
] += TempA
[7] * TempB
[0];
1808 C
[1+8*n
+j
*lda
] += TempA
[7] * TempB
[1];
1809 C
[2+8*n
+j
*lda
] += TempA
[7] * TempB
[2];
1810 C
[3+8*n
+j
*lda
] += TempA
[7] * TempB
[3];
1811 C
[4+8*n
+j
*lda
] += TempA
[7] * TempB
[4];
1812 C
[5+8*n
+j
*lda
] += TempA
[7] * TempB
[5];
1813 C
[6+8*n
+j
*lda
] += TempA
[7] * TempB
[6];
1814 C
[7+8*n
+j
*lda
] += TempA
[7] * TempB
[7];
1822 for ( j
= 0; j
< 16; j
++ )
1825 for ( m
= 0; m
< 4; m
++ )
1828 TempA
[0] = A
[j
*lda
+0+8*m
];
1829 TempA
[1] = A
[j
*lda
+1+8*m
];
1830 TempA
[2] = A
[j
*lda
+2+8*m
];
1831 TempA
[3] = A
[j
*lda
+3+8*m
];
1832 TempA
[4] = A
[j
*lda
+4+8*m
];
1833 TempA
[5] = A
[j
*lda
+5+8*m
];
1834 TempA
[6] = A
[j
*lda
+6+8*m
];
1835 TempA
[7] = A
[j
*lda
+7+8*m
];
1837 for( n
= 0; n
< 4; n
++)
1842 TempB[0] = B[(0+8*m)*lda+0+8*n];
1843 TempB[1] = B[(0+8*m)*lda+1+8*n];
1844 TempB[2] = B[(0+8*m)*lda+2+8*n];
1845 TempB[3] = B[(0+8*m)*lda+3+8*n];
1846 TempB[4] = B[(0+8*m)*lda+4+8*n];
1847 TempB[5] = B[(0+8*m)*lda+5+8*n];
1848 TempB[6] = B[(0+8*m)*lda+6+8*n];
1849 TempB[7] = B[(0+8*m)*lda+7+8*n];
1851 C[0+8*n+i] += TempA[0] * TempB[0];
1852 C[1+8*n+i] += TempA[0] * TempB[1];
1853 C[2+8*n+i] += TempA[0] * TempB[2];
1854 C[3+8*n+i] += TempA[0] * TempB[3];
1855 C[4+8*n+i] += TempA[0] * TempB[4];
1856 C[5+8*n+i] += TempA[0] * TempB[5];
1857 C[6+8*n+i] += TempA[0] * TempB[6];
1858 C[7+8*n+i] += TempA[0] * TempB[7];
1862 TempB[0] = B[(1+8*m)*lda+0+8*n];
1863 TempB[1] = B[(1+8*m)*lda+1+8*n];
1864 TempB[2] = B[(1+8*m)*lda+2+8*n];
1865 TempB[3] = B[(1+8*m)*lda+3+8*n];
1866 TempB[4] = B[(1+8*m)*lda+4+8*n];
1867 TempB[5] = B[(1+8*m)*lda+5+8*n];
1868 TempB[6] = B[(1+8*m)*lda+6+8*n];
1869 TempB[7] = B[(1+8*m)*lda+7+8*n];
1871 C[0+8*n+i] += TempA[1] * TempB[0];
1872 C[1+8*n+i] += TempA[1] * TempB[1];
1873 C[2+8*n+i] += TempA[1] * TempB[2];
1874 C[3+8*n+i] += TempA[1] * TempB[3];
1875 C[4+8*n+i] += TempA[1] * TempB[4];
1876 C[5+8*n+i] += TempA[1] * TempB[5];
1877 C[6+8*n+i] += TempA[1] * TempB[6];
1878 C[7+8*n+i] += TempA[1] * TempB[7];
1882 TempB[0] = B[(2+8*m)*lda+0+8*n];
1883 TempB[1] = B[(2+8*m)*lda+1+8*n];
1884 TempB[2] = B[(2+8*m)*lda+2+8*n];
1885 TempB[3] = B[(2+8*m)*lda+3+8*n];
1886 TempB[4] = B[(2+8*m)*lda+4+8*n];
1887 TempB[5] = B[(2+8*m)*lda+5+8*n];
1888 TempB[6] = B[(2+8*m)*lda+6+8*n];
1889 TempB[7] = B[(2+8*m)*lda+7+8*n];
1891 C[0+8*n+i] += TempA[2] * TempB[0];
1892 C[1+8*n+i] += TempA[2] * TempB[1];
1893 C[2+8*n+i] += TempA[2] * TempB[2];
1894 C[3+8*n+i] += TempA[2] * TempB[3];
1895 C[4+8*n+i] += TempA[2] * TempB[4];
1896 C[5+8*n+i] += TempA[2] * TempB[5];
1897 C[6+8*n+i] += TempA[2] * TempB[6];
1898 C[7+8*n+i] += TempA[2] * TempB[7];
1902 TempB[0] = B[(3+8*m)*lda+0+8*n];
1903 TempB[1] = B[(3+8*m)*lda+1+8*n];
1904 TempB[2] = B[(3+8*m)*lda+2+8*n];
1905 TempB[3] = B[(3+8*m)*lda+3+8*n];
1906 TempB[4] = B[(3+8*m)*lda+4+8*n];
1907 TempB[5] = B[(3+8*m)*lda+5+8*n];
1908 TempB[6] = B[(3+8*m)*lda+6+8*n];
1909 TempB[7] = B[(3+8*m)*lda+7+8*n];
1911 C[0+8*n+i] += TempA[3] * TempB[0];
1912 C[1+8*n+i] += TempA[3] * TempB[1];
1913 C[2+8*n+i] += TempA[3] * TempB[2];
1914 C[3+8*n+i] += TempA[3] * TempB[3];
1915 C[4+8*n+i] += TempA[3] * TempB[4];
1916 C[5+8*n+i] += TempA[3] * TempB[5];
1917 C[6+8*n+i] += TempA[3] * TempB[6];
1918 C[7+8*n+i] += TempA[3] * TempB[7];
1921 TempB[0] = B[(4+8*m)*lda+0+8*n];
1922 TempB[1] = B[(4+8*m)*lda+1+8*n];
1923 TempB[2] = B[(4+8*m)*lda+2+8*n];
1924 TempB[3] = B[(4+8*m)*lda+3+8*n];
1925 TempB[4] = B[(4+8*m)*lda+4+8*n];
1926 TempB[5] = B[(4+8*m)*lda+5+8*n];
1927 TempB[6] = B[(4+8*m)*lda+6+8*n];
1928 TempB[7] = B[(4+8*m)*lda+7+8*n];
1930 C[0+8*n+i] += TempA[4] * TempB[0];
1931 C[1+8*n+i] += TempA[4] * TempB[1];
1932 C[2+8*n+i] += TempA[4] * TempB[2];
1933 C[3+8*n+i] += TempA[4] * TempB[3];
1934 C[4+8*n+i] += TempA[4] * TempB[4];
1935 C[5+8*n+i] += TempA[4] * TempB[5];
1936 C[6+8*n+i] += TempA[4] * TempB[6];
1937 C[7+8*n+i] += TempA[4] * TempB[7];
1941 TempB[0] = B[(5+8*m)*lda+0+8*n];
1942 TempB[1] = B[(5+8*m)*lda+1+8*n];
1943 TempB[2] = B[(5+8*m)*lda+2+8*n];
1944 TempB[3] = B[(5+8*m)*lda+3+8*n];
1945 TempB[4] = B[(5+8*m)*lda+4+8*n];
1946 TempB[5] = B[(5+8*m)*lda+5+8*n];
1947 TempB[6] = B[(5+8*m)*lda+6+8*n];
1948 TempB[7] = B[(5+8*m)*lda+7+8*n];
1950 C[0+8*n+i] += TempA[5] * TempB[0];
1951 C[1+8*n+i] += TempA[5] * TempB[1];
1952 C[2+8*n+i] += TempA[5] * TempB[2];
1953 C[3+8*n+i] += TempA[5] * TempB[3];
1954 C[4+8*n+i] += TempA[5] * TempB[4];
1955 C[5+8*n+i] += TempA[5] * TempB[5];
1956 C[6+8*n+i] += TempA[5] * TempB[6];
1957 C[7+8*n+i] += TempA[5] * TempB[7];
1961 TempB[0] = B[(6+8*m)*lda+0+8*n];
1962 TempB[1] = B[(6+8*m)*lda+1+8*n];
1963 TempB[2] = B[(6+8*m)*lda+2+8*n];
1964 TempB[3] = B[(6+8*m)*lda+3+8*n];
1965 TempB[4] = B[(6+8*m)*lda+4+8*n];
1966 TempB[5] = B[(6+8*m)*lda+5+8*n];
1967 TempB[6] = B[(6+8*m)*lda+6+8*n];
1968 TempB[7] = B[(6+8*m)*lda+7+8*n];
1970 C[0+8*n+i] += TempA[6] * TempB[0];
1971 C[1+8*n+i] += TempA[6] * TempB[1];
1972 C[2+8*n+i] += TempA[6] * TempB[2];
1973 C[3+8*n+i] += TempA[6] * TempB[3];
1974 C[4+8*n+i] += TempA[6] * TempB[4];
1975 C[5+8*n+i] += TempA[6] * TempB[5];
1976 C[6+8*n+i] += TempA[6] * TempB[6];
1977 C[7+8*n+i] += TempA[6] * TempB[7];
1980 TempB[0] = B[(7+8*m)*lda+0+8*n];
1981 TempB[1] = B[(7+8*m)*lda+1+8*n];
1982 TempB[2] = B[(7+8*m)*lda+2+8*n];
1983 TempB[3] = B[(7+8*m)*lda+3+8*n];
1984 TempB[4] = B[(7+8*m)*lda+4+8*n];
1985 TempB[5] = B[(7+8*m)*lda+5+8*n];
1986 TempB[6] = B[(7+8*m)*lda+6+8*n];
1987 TempB[7] = B[(7+8*m)*lda+7+8*n];
1989 C[0+8*n+i] += TempA[7] * TempB[0];
1990 C[1+8*n+i] += TempA[7] * TempB[1];
1991 C[2+8*n+i] += TempA[7] * TempB[2];
1992 C[3+8*n+i] += TempA[7] * TempB[3];
1993 C[4+8*n+i] += TempA[7] * TempB[4];
1994 C[5+8*n+i] += TempA[7] * TempB[5];
1995 C[6+8*n+i] += TempA[7] * TempB[6];
1996 C[7+8*n+i] += TempA[7] * TempB[7];
1999 TempB
[0] = B
[(0+8*m
)*lda
+0+8*n
];
2000 TempB
[1] = B
[(0+8*m
)*lda
+1+8*n
];
2001 TempB
[2] = B
[(0+8*m
)*lda
+2+8*n
];
2002 TempB
[3] = B
[(0+8*m
)*lda
+3+8*n
];
2003 TempB
[4] = B
[(0+8*m
)*lda
+4+8*n
];
2004 TempB
[5] = B
[(0+8*m
)*lda
+5+8*n
];
2005 TempB
[6] = B
[(0+8*m
)*lda
+6+8*n
];
2006 TempB
[7] = B
[(0+8*m
)*lda
+7+8*n
];
2008 C
[0+8*n
+j
*lda
] += TempA
[0] * TempB
[0];
2009 C
[1+8*n
+j
*lda
] += TempA
[0] * TempB
[1];
2010 C
[2+8*n
+j
*lda
] += TempA
[0] * TempB
[2];
2011 C
[3+8*n
+j
*lda
] += TempA
[0] * TempB
[3];
2012 C
[4+8*n
+j
*lda
] += TempA
[0] * TempB
[4];
2013 C
[5+8*n
+j
*lda
] += TempA
[0] * TempB
[5];
2014 C
[6+8*n
+j
*lda
] += TempA
[0] * TempB
[6];
2015 C
[7+8*n
+j
*lda
] += TempA
[0] * TempB
[7];
2019 TempB
[0] = B
[(1+8*m
)*lda
+0+8*n
];
2020 TempB
[1] = B
[(1+8*m
)*lda
+1+8*n
];
2021 TempB
[2] = B
[(1+8*m
)*lda
+2+8*n
];
2022 TempB
[3] = B
[(1+8*m
)*lda
+3+8*n
];
2023 TempB
[4] = B
[(1+8*m
)*lda
+4+8*n
];
2024 TempB
[5] = B
[(1+8*m
)*lda
+5+8*n
];
2025 TempB
[6] = B
[(1+8*m
)*lda
+6+8*n
];
2026 TempB
[7] = B
[(1+8*m
)*lda
+7+8*n
];
2028 C
[0+8*n
+j
*lda
] += TempA
[1] * TempB
[0];
2029 C
[1+8*n
+j
*lda
] += TempA
[1] * TempB
[1];
2030 C
[2+8*n
+j
*lda
] += TempA
[1] * TempB
[2];
2031 C
[3+8*n
+j
*lda
] += TempA
[1] * TempB
[3];
2032 C
[4+8*n
+j
*lda
] += TempA
[1] * TempB
[4];
2033 C
[5+8*n
+j
*lda
] += TempA
[1] * TempB
[5];
2034 C
[6+8*n
+j
*lda
] += TempA
[1] * TempB
[6];
2035 C
[7+8*n
+j
*lda
] += TempA
[1] * TempB
[7];
2039 TempB
[0] = B
[(2+8*m
)*lda
+0+8*n
];
2040 TempB
[1] = B
[(2+8*m
)*lda
+1+8*n
];
2041 TempB
[2] = B
[(2+8*m
)*lda
+2+8*n
];
2042 TempB
[3] = B
[(2+8*m
)*lda
+3+8*n
];
2043 TempB
[4] = B
[(2+8*m
)*lda
+4+8*n
];
2044 TempB
[5] = B
[(2+8*m
)*lda
+5+8*n
];
2045 TempB
[6] = B
[(2+8*m
)*lda
+6+8*n
];
2046 TempB
[7] = B
[(2+8*m
)*lda
+7+8*n
];
2048 C
[0+8*n
+j
*lda
] += TempA
[2] * TempB
[0];
2049 C
[1+8*n
+j
*lda
] += TempA
[2] * TempB
[1];
2050 C
[2+8*n
+j
*lda
] += TempA
[2] * TempB
[2];
2051 C
[3+8*n
+j
*lda
] += TempA
[2] * TempB
[3];
2052 C
[4+8*n
+j
*lda
] += TempA
[2] * TempB
[4];
2053 C
[5+8*n
+j
*lda
] += TempA
[2] * TempB
[5];
2054 C
[6+8*n
+j
*lda
] += TempA
[2] * TempB
[6];
2055 C
[7+8*n
+j
*lda
] += TempA
[2] * TempB
[7];
2059 TempB
[0] = B
[(3+8*m
)*lda
+0+8*n
];
2060 TempB
[1] = B
[(3+8*m
)*lda
+1+8*n
];
2061 TempB
[2] = B
[(3+8*m
)*lda
+2+8*n
];
2062 TempB
[3] = B
[(3+8*m
)*lda
+3+8*n
];
2063 TempB
[4] = B
[(3+8*m
)*lda
+4+8*n
];
2064 TempB
[5] = B
[(3+8*m
)*lda
+5+8*n
];
2065 TempB
[6] = B
[(3+8*m
)*lda
+6+8*n
];
2066 TempB
[7] = B
[(3+8*m
)*lda
+7+8*n
];
2068 C
[0+8*n
+j
*lda
] += TempA
[3] * TempB
[0];
2069 C
[1+8*n
+j
*lda
] += TempA
[3] * TempB
[1];
2070 C
[2+8*n
+j
*lda
] += TempA
[3] * TempB
[2];
2071 C
[3+8*n
+j
*lda
] += TempA
[3] * TempB
[3];
2072 C
[4+8*n
+j
*lda
] += TempA
[3] * TempB
[4];
2073 C
[5+8*n
+j
*lda
] += TempA
[3] * TempB
[5];
2074 C
[6+8*n
+j
*lda
] += TempA
[3] * TempB
[6];
2075 C
[7+8*n
+j
*lda
] += TempA
[3] * TempB
[7];
2078 TempB
[0] = B
[(4+8*m
)*lda
+0+8*n
];
2079 TempB
[1] = B
[(4+8*m
)*lda
+1+8*n
];
2080 TempB
[2] = B
[(4+8*m
)*lda
+2+8*n
];
2081 TempB
[3] = B
[(4+8*m
)*lda
+3+8*n
];
2082 TempB
[4] = B
[(4+8*m
)*lda
+4+8*n
];
2083 TempB
[5] = B
[(4+8*m
)*lda
+5+8*n
];
2084 TempB
[6] = B
[(4+8*m
)*lda
+6+8*n
];
2085 TempB
[7] = B
[(4+8*m
)*lda
+7+8*n
];
2087 C
[0+8*n
+j
*lda
] += TempA
[4] * TempB
[0];
2088 C
[1+8*n
+j
*lda
] += TempA
[4] * TempB
[1];
2089 C
[2+8*n
+j
*lda
] += TempA
[4] * TempB
[2];
2090 C
[3+8*n
+j
*lda
] += TempA
[4] * TempB
[3];
2091 C
[4+8*n
+j
*lda
] += TempA
[4] * TempB
[4];
2092 C
[5+8*n
+j
*lda
] += TempA
[4] * TempB
[5];
2093 C
[6+8*n
+j
*lda
] += TempA
[4] * TempB
[6];
2094 C
[7+8*n
+j
*lda
] += TempA
[4] * TempB
[7];
2098 TempB
[0] = B
[(5+8*m
)*lda
+0+8*n
];
2099 TempB
[1] = B
[(5+8*m
)*lda
+1+8*n
];
2100 TempB
[2] = B
[(5+8*m
)*lda
+2+8*n
];
2101 TempB
[3] = B
[(5+8*m
)*lda
+3+8*n
];
2102 TempB
[4] = B
[(5+8*m
)*lda
+4+8*n
];
2103 TempB
[5] = B
[(5+8*m
)*lda
+5+8*n
];
2104 TempB
[6] = B
[(5+8*m
)*lda
+6+8*n
];
2105 TempB
[7] = B
[(5+8*m
)*lda
+7+8*n
];
2107 C
[0+8*n
+j
*lda
] += TempA
[5] * TempB
[0];
2108 C
[1+8*n
+j
*lda
] += TempA
[5] * TempB
[1];
2109 C
[2+8*n
+j
*lda
] += TempA
[5] * TempB
[2];
2110 C
[3+8*n
+j
*lda
] += TempA
[5] * TempB
[3];
2111 C
[4+8*n
+j
*lda
] += TempA
[5] * TempB
[4];
2112 C
[5+8*n
+j
*lda
] += TempA
[5] * TempB
[5];
2113 C
[6+8*n
+j
*lda
] += TempA
[5] * TempB
[6];
2114 C
[7+8*n
+j
*lda
] += TempA
[5] * TempB
[7];
2118 TempB
[0] = B
[(6+8*m
)*lda
+0+8*n
];
2119 TempB
[1] = B
[(6+8*m
)*lda
+1+8*n
];
2120 TempB
[2] = B
[(6+8*m
)*lda
+2+8*n
];
2121 TempB
[3] = B
[(6+8*m
)*lda
+3+8*n
];
2122 TempB
[4] = B
[(6+8*m
)*lda
+4+8*n
];
2123 TempB
[5] = B
[(6+8*m
)*lda
+5+8*n
];
2124 TempB
[6] = B
[(6+8*m
)*lda
+6+8*n
];
2125 TempB
[7] = B
[(6+8*m
)*lda
+7+8*n
];
2127 C
[0+8*n
+j
*lda
] += TempA
[6] * TempB
[0];
2128 C
[1+8*n
+j
*lda
] += TempA
[6] * TempB
[1];
2129 C
[2+8*n
+j
*lda
] += TempA
[6] * TempB
[2];
2130 C
[3+8*n
+j
*lda
] += TempA
[6] * TempB
[3];
2131 C
[4+8*n
+j
*lda
] += TempA
[6] * TempB
[4];
2132 C
[5+8*n
+j
*lda
] += TempA
[6] * TempB
[5];
2133 C
[6+8*n
+j
*lda
] += TempA
[6] * TempB
[6];
2134 C
[7+8*n
+j
*lda
] += TempA
[6] * TempB
[7];
2137 TempB
[0] = B
[(7+8*m
)*lda
+0+8*n
];
2138 TempB
[1] = B
[(7+8*m
)*lda
+1+8*n
];
2139 TempB
[2] = B
[(7+8*m
)*lda
+2+8*n
];
2140 TempB
[3] = B
[(7+8*m
)*lda
+3+8*n
];
2141 TempB
[4] = B
[(7+8*m
)*lda
+4+8*n
];
2142 TempB
[5] = B
[(7+8*m
)*lda
+5+8*n
];
2143 TempB
[6] = B
[(7+8*m
)*lda
+6+8*n
];
2144 TempB
[7] = B
[(7+8*m
)*lda
+7+8*n
];
2146 C
[0+8*n
+j
*lda
] += TempA
[7] * TempB
[0];
2147 C
[1+8*n
+j
*lda
] += TempA
[7] * TempB
[1];
2148 C
[2+8*n
+j
*lda
] += TempA
[7] * TempB
[2];
2149 C
[3+8*n
+j
*lda
] += TempA
[7] * TempB
[3];
2150 C
[4+8*n
+j
*lda
] += TempA
[7] * TempB
[4];
2151 C
[5+8*n
+j
*lda
] += TempA
[7] * TempB
[5];
2152 C
[6+8*n
+j
*lda
] += TempA
[7] * TempB
[6];
2153 C
[7+8*n
+j
*lda
] += TempA
[7] * TempB
[7];
2163 //--------------------------------------------------------------------------
2166 // All threads start executing thread_entry(). Use their "coreid" to
2167 // differentiate between threads (each thread runs on a separate core).
2169 void thread_entry(int cid
, int nc
)
2174 // static allocates data in the binary, which is visible to both threads
2175 static data_t results_data
[ARRAY_SIZE
];
2178 // Execute the provided, naive matmul
2180 stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier(nc));
2184 verifyMT(ARRAY_SIZE, results_data, verify_data);
2186 // clear results from the first trial
2189 for (i=0; i < ARRAY_SIZE; i++)
2190 results_data[i] = 0;
2194 // Execute your faster matmul
2196 stats(matmul(DIM_SIZE
, input1_data
, input2_data
, results_data
); barrier(nc
));
2199 printArrayMT("results:", ARRAY_SIZE
, results_data
);
2200 printArrayMT("verify :", ARRAY_SIZE
, verify_data
);
2204 verifyMT(ARRAY_SIZE
, results_data
, verify_data
);