593b54f134f2b5818c74711708c37da93e56a092
1 // See LICENSE for license details.
/*
 * Unroll factors of the hand-written kernel in kloop(): its fma sequence
 * touches RBM rows of a (a_0..a_3), RBN elements per b row ([0]..[4]) and
 * RBK consecutive k steps (b_0..b_5) per main-loop iteration.
 * NOTE(review): presumably "register block" M/N/K sizes -- confirm naming.
 */
static const int RBM = 4;
static const int RBN = 5;
static const int RBK = 6; /* also the stride of kloop()'s main loop over a */
/*
 * Outer blocking sizes in the M, N and K dimensions.
 * NOTE(review): presumably cache-block dimensions consumed by the blocked
 * driver elsewhere in this file -- confirm against the caller. Each value is
 * an exact multiple of the matching unroll factor (36 = 9*4, 35 = 7*5,
 * 36 = 6*6).
 */
static const int CBM = 36;
static const int CBN = 35;
static const int CBK = 36;
5 static inline void kloop(size_t p
, t
* a0
, size_t lda
, t
* b0
, size_t ldb
, t
* c
, size_t ldc
)
31 for (t
*a
= a0
, *b
= b0
; a
< a0
+ p
/RBK
*RBK
; a
+= RBK
, b
+= RBK
*ldb
)
43 c_0_0
= fma(a_0
[0], b_0
[0], c_0_0
);
44 c_0_1
= fma(a_0
[0], b_0
[1], c_0_1
);
45 c_0_2
= fma(a_0
[0], b_0
[2], c_0_2
);
46 c_0_3
= fma(a_0
[0], b_0
[3], c_0_3
);
47 c_0_4
= fma(a_0
[0], b_0
[4], c_0_4
);
48 c_1_0
= fma(a_1
[0], b_0
[0], c_1_0
);
49 c_1_1
= fma(a_1
[0], b_0
[1], c_1_1
);
50 c_1_2
= fma(a_1
[0], b_0
[2], c_1_2
);
51 c_1_3
= fma(a_1
[0], b_0
[3], c_1_3
);
52 c_1_4
= fma(a_1
[0], b_0
[4], c_1_4
);
53 c_2_0
= fma(a_2
[0], b_0
[0], c_2_0
);
54 c_2_1
= fma(a_2
[0], b_0
[1], c_2_1
);
55 c_2_2
= fma(a_2
[0], b_0
[2], c_2_2
);
56 c_2_3
= fma(a_2
[0], b_0
[3], c_2_3
);
57 c_2_4
= fma(a_2
[0], b_0
[4], c_2_4
);
58 c_3_0
= fma(a_3
[0], b_0
[0], c_3_0
);
59 c_3_1
= fma(a_3
[0], b_0
[1], c_3_1
);
60 c_3_2
= fma(a_3
[0], b_0
[2], c_3_2
);
61 c_3_3
= fma(a_3
[0], b_0
[3], c_3_3
);
62 c_3_4
= fma(a_3
[0], b_0
[4], c_3_4
);
63 c_0_0
= fma(a_0
[1], b_1
[0], c_0_0
);
64 c_0_1
= fma(a_0
[1], b_1
[1], c_0_1
);
65 c_0_2
= fma(a_0
[1], b_1
[2], c_0_2
);
66 c_0_3
= fma(a_0
[1], b_1
[3], c_0_3
);
67 c_0_4
= fma(a_0
[1], b_1
[4], c_0_4
);
68 c_1_0
= fma(a_1
[1], b_1
[0], c_1_0
);
69 c_1_1
= fma(a_1
[1], b_1
[1], c_1_1
);
70 c_1_2
= fma(a_1
[1], b_1
[2], c_1_2
);
71 c_1_3
= fma(a_1
[1], b_1
[3], c_1_3
);
72 c_1_4
= fma(a_1
[1], b_1
[4], c_1_4
);
73 c_2_0
= fma(a_2
[1], b_1
[0], c_2_0
);
74 c_2_1
= fma(a_2
[1], b_1
[1], c_2_1
);
75 c_2_2
= fma(a_2
[1], b_1
[2], c_2_2
);
76 c_2_3
= fma(a_2
[1], b_1
[3], c_2_3
);
77 c_2_4
= fma(a_2
[1], b_1
[4], c_2_4
);
78 c_3_0
= fma(a_3
[1], b_1
[0], c_3_0
);
79 c_3_1
= fma(a_3
[1], b_1
[1], c_3_1
);
80 c_3_2
= fma(a_3
[1], b_1
[2], c_3_2
);
81 c_3_3
= fma(a_3
[1], b_1
[3], c_3_3
);
82 c_3_4
= fma(a_3
[1], b_1
[4], c_3_4
);
83 c_0_0
= fma(a_0
[2], b_2
[0], c_0_0
);
84 c_0_1
= fma(a_0
[2], b_2
[1], c_0_1
);
85 c_0_2
= fma(a_0
[2], b_2
[2], c_0_2
);
86 c_0_3
= fma(a_0
[2], b_2
[3], c_0_3
);
87 c_0_4
= fma(a_0
[2], b_2
[4], c_0_4
);
88 c_1_0
= fma(a_1
[2], b_2
[0], c_1_0
);
89 c_1_1
= fma(a_1
[2], b_2
[1], c_1_1
);
90 c_1_2
= fma(a_1
[2], b_2
[2], c_1_2
);
91 c_1_3
= fma(a_1
[2], b_2
[3], c_1_3
);
92 c_1_4
= fma(a_1
[2], b_2
[4], c_1_4
);
93 c_2_0
= fma(a_2
[2], b_2
[0], c_2_0
);
94 c_2_1
= fma(a_2
[2], b_2
[1], c_2_1
);
95 c_2_2
= fma(a_2
[2], b_2
[2], c_2_2
);
96 c_2_3
= fma(a_2
[2], b_2
[3], c_2_3
);
97 c_2_4
= fma(a_2
[2], b_2
[4], c_2_4
);
98 c_3_0
= fma(a_3
[2], b_2
[0], c_3_0
);
99 c_3_1
= fma(a_3
[2], b_2
[1], c_3_1
);
100 c_3_2
= fma(a_3
[2], b_2
[2], c_3_2
);
101 c_3_3
= fma(a_3
[2], b_2
[3], c_3_3
);
102 c_3_4
= fma(a_3
[2], b_2
[4], c_3_4
);
103 c_0_0
= fma(a_0
[3], b_3
[0], c_0_0
);
104 c_0_1
= fma(a_0
[3], b_3
[1], c_0_1
);
105 c_0_2
= fma(a_0
[3], b_3
[2], c_0_2
);
106 c_0_3
= fma(a_0
[3], b_3
[3], c_0_3
);
107 c_0_4
= fma(a_0
[3], b_3
[4], c_0_4
);
108 c_1_0
= fma(a_1
[3], b_3
[0], c_1_0
);
109 c_1_1
= fma(a_1
[3], b_3
[1], c_1_1
);
110 c_1_2
= fma(a_1
[3], b_3
[2], c_1_2
);
111 c_1_3
= fma(a_1
[3], b_3
[3], c_1_3
);
112 c_1_4
= fma(a_1
[3], b_3
[4], c_1_4
);
113 c_2_0
= fma(a_2
[3], b_3
[0], c_2_0
);
114 c_2_1
= fma(a_2
[3], b_3
[1], c_2_1
);
115 c_2_2
= fma(a_2
[3], b_3
[2], c_2_2
);
116 c_2_3
= fma(a_2
[3], b_3
[3], c_2_3
);
117 c_2_4
= fma(a_2
[3], b_3
[4], c_2_4
);
118 c_3_0
= fma(a_3
[3], b_3
[0], c_3_0
);
119 c_3_1
= fma(a_3
[3], b_3
[1], c_3_1
);
120 c_3_2
= fma(a_3
[3], b_3
[2], c_3_2
);
121 c_3_3
= fma(a_3
[3], b_3
[3], c_3_3
);
122 c_3_4
= fma(a_3
[3], b_3
[4], c_3_4
);
123 c_0_0
= fma(a_0
[4], b_4
[0], c_0_0
);
124 c_0_1
= fma(a_0
[4], b_4
[1], c_0_1
);
125 c_0_2
= fma(a_0
[4], b_4
[2], c_0_2
);
126 c_0_3
= fma(a_0
[4], b_4
[3], c_0_3
);
127 c_0_4
= fma(a_0
[4], b_4
[4], c_0_4
);
128 c_1_0
= fma(a_1
[4], b_4
[0], c_1_0
);
129 c_1_1
= fma(a_1
[4], b_4
[1], c_1_1
);
130 c_1_2
= fma(a_1
[4], b_4
[2], c_1_2
);
131 c_1_3
= fma(a_1
[4], b_4
[3], c_1_3
);
132 c_1_4
= fma(a_1
[4], b_4
[4], c_1_4
);
133 c_2_0
= fma(a_2
[4], b_4
[0], c_2_0
);
134 c_2_1
= fma(a_2
[4], b_4
[1], c_2_1
);
135 c_2_2
= fma(a_2
[4], b_4
[2], c_2_2
);
136 c_2_3
= fma(a_2
[4], b_4
[3], c_2_3
);
137 c_2_4
= fma(a_2
[4], b_4
[4], c_2_4
);
138 c_3_0
= fma(a_3
[4], b_4
[0], c_3_0
);
139 c_3_1
= fma(a_3
[4], b_4
[1], c_3_1
);
140 c_3_2
= fma(a_3
[4], b_4
[2], c_3_2
);
141 c_3_3
= fma(a_3
[4], b_4
[3], c_3_3
);
142 c_3_4
= fma(a_3
[4], b_4
[4], c_3_4
);
143 c_0_0
= fma(a_0
[5], b_5
[0], c_0_0
);
144 c_0_1
= fma(a_0
[5], b_5
[1], c_0_1
);
145 c_0_2
= fma(a_0
[5], b_5
[2], c_0_2
);
146 c_0_3
= fma(a_0
[5], b_5
[3], c_0_3
);
147 c_0_4
= fma(a_0
[5], b_5
[4], c_0_4
);
148 c_1_0
= fma(a_1
[5], b_5
[0], c_1_0
);
149 c_1_1
= fma(a_1
[5], b_5
[1], c_1_1
);
150 c_1_2
= fma(a_1
[5], b_5
[2], c_1_2
);
151 c_1_3
= fma(a_1
[5], b_5
[3], c_1_3
);
152 c_1_4
= fma(a_1
[5], b_5
[4], c_1_4
);
153 c_2_0
= fma(a_2
[5], b_5
[0], c_2_0
);
154 c_2_1
= fma(a_2
[5], b_5
[1], c_2_1
);
155 c_2_2
= fma(a_2
[5], b_5
[2], c_2_2
);
156 c_2_3
= fma(a_2
[5], b_5
[3], c_2_3
);
157 c_2_4
= fma(a_2
[5], b_5
[4], c_2_4
);
158 c_3_0
= fma(a_3
[5], b_5
[0], c_3_0
);
159 c_3_1
= fma(a_3
[5], b_5
[1], c_3_1
);
160 c_3_2
= fma(a_3
[5], b_5
[2], c_3_2
);
161 c_3_3
= fma(a_3
[5], b_5
[3], c_3_3
);
162 c_3_4
= fma(a_3
[5], b_5
[4], c_3_4
);
164 for (t
*a
= a0
+ p
/RBK
*RBK
, *b
= b0
+ p
/RBK
*RBK
*ldb
; a
< a0
+ p
; a
++, b
+= ldb
)
171 c_0_0
= fma(a_0
[0], b_0
[0], c_0_0
);
172 c_0_1
= fma(a_0
[0], b_0
[1], c_0_1
);
173 c_0_2
= fma(a_0
[0], b_0
[2], c_0_2
);
174 c_0_3
= fma(a_0
[0], b_0
[3], c_0_3
);
175 c_0_4
= fma(a_0
[0], b_0
[4], c_0_4
);
176 c_1_0
= fma(a_1
[0], b_0
[0], c_1_0
);
177 c_1_1
= fma(a_1
[0], b_0
[1], c_1_1
);
178 c_1_2
= fma(a_1
[0], b_0
[2], c_1_2
);
179 c_1_3
= fma(a_1
[0], b_0
[3], c_1_3
);
180 c_1_4
= fma(a_1
[0], b_0
[4], c_1_4
);
181 c_2_0
= fma(a_2
[0], b_0
[0], c_2_0
);
182 c_2_1
= fma(a_2
[0], b_0
[1], c_2_1
);
183 c_2_2
= fma(a_2
[0], b_0
[2], c_2_2
);
184 c_2_3
= fma(a_2
[0], b_0
[3], c_2_3
);
185 c_2_4
= fma(a_2
[0], b_0
[4], c_2_4
);
186 c_3_0
= fma(a_3
[0], b_0
[0], c_3_0
);
187 c_3_1
= fma(a_3
[0], b_0
[1], c_3_1
);
188 c_3_2
= fma(a_3
[0], b_0
[2], c_3_2
);
189 c_3_3
= fma(a_3
[0], b_0
[3], c_3_3
);
190 c_3_4
= fma(a_3
[0], b_0
[4], c_3_4
);