4ff6dba0f96b7b30b8cc9fdece70ba0c82a4e2b8
[libreriscv.git] / simple_v_extension / specification / mv.x.rst
1 [[!tag standards]]
2
3 MV.X and MV.swizzle
4 ===================
5
6 swizzle needs a MV (there are 2 of them: swizzle and swizzle2).
7 see below for a potential way to use the funct7 to do a swizzle in rs2.
8
9 +---------------+-------------+-------+----------+----------+--------+----------+--------+--------+
10 | Encoding | 31:27 | 26:25 | 24:20 | 19:15 | 14:12 | 11:7 | 6:2 | 1:0 |
11 +---------------+-------------+-------+----------+----------+--------+----------+--------+--------+
12 | RV32-I-type + imm[11:0] + rs1[4:0] + funct3 | rd[4:0] + opcode + 0b11 |
13 +---------------+-------------+-------+----------+----------+--------+----------+--------+--------+
14 | RV32-I-type + fn4[3:0] + swizzle[7:0] + rs1[4:0] + 0b000 | rd[4:0] + OP-V + 0b11 |
15 +---------------+-------------+-------+----------+----------+--------+----------+--------+--------+
16
17 * funct3 = MV: 0b000 for FP, 0b001 for INT
18 * OP-V = 0b1010111
19 * fn4 = 4 bit function.
20 * fn4 = 0b0000 - MV-SWIZZLE
21 * fn4 = 0bNN01 - MV-X, NN=elwidth (default/8/16/32)
22 * fn4 = 0bNN11 - MV-X.SUBVL NN=elwidth (default/8/16/32)
23
24 swizzle (only active on SV or P48/P64 when SUBVL!=0):
25
26 +-----+-----+-----+-----+
27 | 7:6 | 5:4 | 3:2 | 1:0 |
28 +-----+-----+-----+-----+
29 | w | z | y | x |
30 +-----+-----+-----+-----+
31
32 MV.X has two modes: SUBVL mode applies the element offsets only within a SUBVL inner loop. This can be used for transposition.
33
34 ::
35
36 for i in range(VL):
37 for j in range(SUBVL):
38 regs[rd] = regs[rd+regs[rs+j]]
39
40 Normal mode will apply the element offsets incrementally:
41
42 ::
43
44 for i in range(VL):
45 for j in range(SUBVL):
46 regs[rd] = regs[rd+regs[rs+k]]
47 k++
48
49
50 Pseudocode for element width part of MV.X:
51
52 ::
53
54 def mv_x(rd, rs1, funct4):
55 elwidth = (funct4>>2) & 0x3
56 bitwidth = {0:XLEN, 1:8, 2:16, 3:32}[elwidth] # get bits per el
57 bytewidth = bitwidth / 8 # get bytes per el
58 for i in range(VL):
59 addr = (unsigned char *)&regs[rs1]
60 offset = addr + bytewidth # get offset within regfile as SRAM
61 # TODO, actually, needs to respect rd and rs1 element width,
62 # here, as well. this pseudocode just illustrates that the
63 # MV.X operation contains a way to compact the indices into
64 # less space.
65 regs[rd] = (unsigned char*)(regs)[offset]
66
67 The idea here is to allow 8-bit indices to be stored inside XLEN-sized
68 registers, such that rather than doing this:
69
70 .. parsed-literal::
71 ldimm x8, 1
72 ldimm x9, 3
73 ldimm x10, 2
74 ldimm x11, 0
75 {SVP.VL=4} MV.X x3, x8, elwidth=default
76
77 The alternative is this:
78
79 .. parsed-literal::
80 ldimm x8, 0x00020301
81 {SVP.VL=4} MV.X x3, x8, elwidth=8
82
83 Thus compacting four indices into the one register. x3 and x8's element
84 width are *independent* of the MV.X elwidth, thus allowing both source
85 and destination element widths of the *elements* to be moved to be over-ridden,
86 whilst *at the same time* allowing the *indices* to be compacted, as well.
87
88 ----
89
90 potential MV.X? register-version of MV-swizzle?
91
92 +-------------+-------+-------+----------+----------+--------+----------+--------+--------+
93 | Encoding | 31:27 | 26:25 | 24:20 | 19:15 | 14:12 | 11:7 | 6:2 | 1:0 |
94 +-------------+-------+-------+----------+----------+--------+----------+--------+--------+
95 | RV32-R-type + funct7 + rs2[4:0] + rs1[4:0] + funct3 | rd[4:0] + opcode + 0b11 |
96 +-------------+-------+-------+----------+----------+--------+----------+--------+--------+
97 | RV32-R-type + 0b0000000 + rs2[4:0] + rs1[4:0] + 0b001 | rd[4:0] + OP-V + 0b11 |
98 +-------------+-------+-------+----------+----------+--------+----------+--------+--------+
99
100 * funct3 = MV.X
101 * OP-V = 0b1010111
102 * funct7 = 0b000NN00 - INT MV.X, elwidth=NN (default/8/16/32)
103 * funct7 = 0b000NN10 - FP MV.X, elwidth=NN (default/8/16/32)
104 * funct7 = 0b0000001 - INT MV.swizzle to say that rs2 is a swizzle argument?
105 * funct7 = 0b0000011 - FP MV.swizzle to say that rs2 is a swizzle argument?
106
107 question: do we need a swizzle MV.X as well?
108
109 MV.X with 3 operands
110 ====================
111
112 regs[rd] = regs[rs1 + regs[rs2]]
113
114 Similar to LD/ST with the same twin predication rules
115
116 macro-op fusion
117 ===============
118
119 there is the potential for macro-op fusion of mv-swizzle with the following instruction and/or preceding instruction.
120 <http://lists.libre-riscv.org/pipermail/libre-riscv-dev/2019-August/002486.html>
121
122 VBLOCK context?
123 ===============
124
125 additional idea: a VBLOCK context that says that if a given register is used, it indicates that the
126 register is to be "swizzled", and the VBLOCK swizzle context contains the swizzling to be carried out.
127
128 mm_shuffle_ps?
129 ==============
130
131 __m128 _mm_shuffle_ps(__m128 lo,__m128 hi,
132 _MM_SHUFFLE(hi3,hi2,lo1,lo0))
133 Interleave inputs into low 2 floats and high 2 floats of output. Basically
134 out[0]=lo[lo0];
135 out[1]=lo[lo1];
136 out[2]=hi[hi2];
137 out[3]=hi[hi3];
138
139 For example, _mm_shuffle_ps(a,a,_MM_SHUFFLE(i,i,i,i)) copies the float
140 a[i] into all 4 output floats.
141
142 Transpose
143 =========
144
145 assuming a vector of 4x4 matrices is stored as 4 separate vectors with subvl=4 in struct-of-array-of-struct form (the form I've been planning on using):
146 using standard (4+4) -> 4 swizzle instructions with 2 input vectors with subvl=4 and 1 output vector with subvl=4, a vectorized matrix transpose operation can be done in 2 steps with 4 instructions per step to give 8 instructions in total:
147
148 input:
149 | m00 m10 m20 m30 |
150 | m01 m11 m21 m31 |
151 | m02 m12 m22 m32 |
152 | m03 m13 m23 m33 |
153
154 transpose 4 corner 2x2 matrices
155
156 intermediate:
157 | m00 m01 m20 m21 |
158 | m10 m11 m30 m31 |
159 | m02 m03 m22 m23 |
160 | m12 m13 m32 m33 |
161
162 finish transpose
163
164 output:
165 | m00 m01 m02 m03 |
166 | m10 m11 m12 m13 |
167 | m20 m21 m22 m23 |
168 | m30 m31 m32 m33 |
169
170 <http://web.archive.org/web/20100111104515/http://www.randombit.net:80/bitbashing/programming/integer_matrix_transpose_in_sse2.html>
171
172
173 ::
174
175 __m128i T0 = _mm_unpacklo_epi32(I0, I1);
176 __m128i T1 = _mm_unpacklo_epi32(I2, I3);
177 __m128i T2 = _mm_unpackhi_epi32(I0, I1);
178 __m128i T3 = _mm_unpackhi_epi32(I2, I3);
179
180 /* Assigning transposed values back into I[0-3] */
181 I0 = _mm_unpacklo_epi64(T0, T1);
182 I1 = _mm_unpackhi_epi64(T0, T1);
183 I2 = _mm_unpacklo_epi64(T2, T3);
184 I3 = _mm_unpackhi_epi64(T2, T3);
185
186 Transforms for DCT
187 ==================
188
189 <https://opencores.org/websvn/filedetails?repname=mpeg2fpga&path=%2Fmpeg2fpga%2Ftrunk%2Frtl%2Fmpeg2%2Fidct.v>
190
191 Table to evaluate
192 =================
193
194 swizzle2 takes 2 arguments, interleaving the two vectors depending on a 3rd (the swizzle selector)
195
196 +-----------+-------+-------+-------+-------+-------+------+
197 | | 31:27 | 26:25 | 24:20 | 19:15 | 14:12 | 11:7 |
198 +===========+=======+=======+=======+=======+=======+======+
199 | swizzle2 | rs3 | 00 | rs2 | rs1 | 000 | rd |
200 +-----------+-------+-------+-------+-------+-------+------+
201 | fswizzle2 | rs3 | 01 | rs2 | rs1 | 000 | rd |
202 +-----------+-------+-------+-------+-------+-------+------+
203 | swizzle | 0 | 10 | rs2 | rs1 | 000 | rd |
204 +-----------+-------+-------+-------+-------+-------+------+
205 | fswizzle | 0 | 11 | rs2 | rs1 | 000 | rd |
206 +-----------+-------+-------+-------+-------+-------+------+
207 | swizzlei | imm | rs1 | 001 | rd |
208 +-----------+ +-------+-------+------+
209 | fswizzlei | | rs1 | 010 | rd |
210 +-----------+-------+-------+-------+-------+-------+------+
211
212 More:
213
214 swizzlei would still need the 12-bit format due to not having enough immediate bits. we can get away with only 3 i-type funct3s used for [f]swizzlei by having one funct3 for destsubvl 1 through 3 for int and fp versions and a separate one for destsubvl = 4 that's shared between int/fp:
215
216 +--------+-----------+----+-----------+----------+-------+-------+------+
217 | int/fp | DESTSUBVL | 31 | 30:29 | 28:20 | 19:15 | 14:12 | 11:7 |
218 +========+===========+====+===========+==========+=======+=======+======+
219 | int | 1 to 3 | 0 | DESTSUBVL | selector | rs | 000 | rd |
220 +--------+-----------+----+-----------+----------+-------+-------+------+
221 | fp | 1 to 3 | 1 | DESTSUBVL | selector | rs | 000 | rd |
222 +--------+-----------+----+-----------+----------+-------+-------+------+
223 | int | 4 | selector[11:0] | rs | 001 | rd |
224 +--------+-----------+---------------------------+-------+-------+------+
225 | fp | 4 | selector[11:0] | rs | 010 | rd |
226 +--------+-----------+---------------------------+-------+-------+------+
227
228 the rest could be encoded as follows:
229
230 +-----------+-------+-----------+-------+-------+-------+------+
231 | | 31:27 | 26:25 | 24:20 | 19:15 | 14:12 | 11:7 |
232 +===========+=======+===========+=======+=======+=======+======+
233 | swizzle2 | rs3 | DESTSUBVL | rs2 | rs1 | 100 | rd |
234 +-----------+-------+-----------+-------+-------+-------+------+
235 | swizzle | rs1 | DESTSUBVL | rs2 | rs1 | 100 | rd |
236 +-----------+-------+-----------+-------+-------+-------+------+
237 | fswizzle2 | rs3 | DESTSUBVL | rs2 | rs1 | 101 | rd |
238 +-----------+-------+-----------+-------+-------+-------+------+
239 | fswizzle | rs1 | DESTSUBVL | rs2 | rs1 | 101 | rd |
240 +-----------+-------+-----------+-------+-------+-------+------+
241
242 note how for [f]swizzle, rs3 == rs1
243
244 so it uses 5 funct3 values overall, which is appropriate, since swizzle is probably right after muladd in usage in graphics shaders.
245
246 Alternative immed encoding
247
248 +--------+-----------+----------+-------+-------+------+
249 | int/fp | 31:28 | 27:20 | 19:15 | 14:12 | 11:7 |
250 +========+===========+==========+=======+=======+======+
251 | int | DESTMASK | selector | rs | 000 | rd |
252 +--------+-----------+----------+-------+-------+------+
253 | fp | DESTMASK | selector | rs | 001 | rd |
254 +--------+-----------+----------+-------+-------+------+
255 | int | DESTMASK | constsel | rs | 010 | rd |
256 +--------+-----------+----------+-------+-------+------+
257 | fp | DESTMASK | constsel | rs | 011 | rd |
258 +--------+-----------+----------+-------+-------+------+
259
260 Allows setting of arbitrary dest (xz, yw) without needing register-versions. Saves on instruction count.
261 Needs 4 funct3 to express.
262
263 Matrix 4x4 Vector mul
264 =====================
265
266 ::
267
268 pfscale,3 F2, F1, F10
269 pfscaleadd,2 F2, F1, F11, F2
270 pfscaleadd,1 F2, F1, F12, F2
271 pfscaleadd,0 F2, F1, F13, F2
272
273 pfscale is a 4 vec mv.shuffle followed by a fmul. pfscaleadd is a 4 vec mv.shuffle followed by a fmac.
274
275 In effect what this is doing is:
276
277 ::
278
279 fmul f2, f1.xxxx, f10
280 fmac f2, f1.yyyy, f11, f2
281 fmac f2, f1.zzzz, f12, f2
282 fmac f2, f1.wwww, f13, f2
283
284 Where all of f2, f1, and f10-13 are vec4, and f1.x-w are copied (fixed index) where the other vec4 indices progress.
285
286 Pseudocode
287 ==========
288
289 Swizzle:
290
291 ::
292
293 pub trait SwizzleConstants: Copy + 'static {
294 const CONSTANTS: &'static [Self; 4];
295 }
296
297 impl SwizzleConstants for u8 {
298 const CONSTANTS: &'static [Self; 4] = &[0, 1, 0xFF, 0x7F];
299 }
300
301 impl SwizzleConstants for u16 {
302 const CONSTANTS: &'static [Self; 4] = &[0, 1, 0xFFFF, 0x7FFF];
303 }
304
305 impl SwizzleConstants for f32 {
306 const CONSTANTS: &'static [Self; 4] = &[0.0, 1.0, -1.0, 0.5];
307 }
308
309 // impl for other types too...
310
311 pub fn swizzle<Elm, Selector>(
312 rd: &mut [Elm],
313 rs1: &[Elm],
314 rs2: &[Selector],
315 vl: usize,
316 destsubvl: usize,
317 srcsubvl: usize)
318 where
319 Elm: SwizzleConstants,
320 // Selector is a copyable type that can be converted into u64
321 Selector: Copy + Into<u64>,
322 {
323 const FIELD_SIZE: usize = 3;
324 const FIELD_MASK: u64 = 0b111;
325 for vindex in 0..vl {
326 let selector = rs2[vindex].into();
327 // selector's type is u64
328 if selector >> (FIELD_SIZE * destsubvl) != 0 {
329 // handle illegal instruction trap
330 }
331 for i in 0..destsubvl {
332 let mut sel_field = selector >> (FIELD_SIZE * i);
333 sel_field &= FIELD_MASK;
334 let src = if (sel_field & 0b100) == 0 {
335 &rs1[(vindex * srcsubvl)..]
336 } else {
337 SwizzleConstants::CONSTANTS
338 };
339 sel_field &= 0b11;
340 if sel_field as usize >= srcsubvl {
341 // handle illegal instruction trap
342 }
343 let value = src[sel_field as usize];
344 rd[vindex * destsubvl + i] = value;
345 }
346 }
347 }
348
349 Swizzle2:
350
351 ::
352
353 fn swizzle2<Elm, Selector>(
354 rd: &mut [Elm],
355 rs1: &[Elm],
356 rs2: &[Selector],
357 rs3: &[Elm],
358 vl: usize,
359 destsubvl: usize,
360 srcsubvl: usize)
361 where
362 // Elm is a copyable type
363 Elm: Copy,
364 // Selector is a copyable type that can be converted into u64
365 Selector: Copy + Into<u64>,
366 {
367 const FIELD_SIZE: usize = 3;
368 const FIELD_MASK: u64 = 0b111;
369 for vindex in 0..vl {
370 let selector = rs2[vindex].into();
371 // selector's type is u64
372 if selector >> (FIELD_SIZE * destsubvl) != 0 {
373 // handle illegal instruction trap
374 }
375 for i in 0..destsubvl {
376 let mut sel_field = selector >> (FIELD_SIZE * i);
377 sel_field &= FIELD_MASK;
378 let src = if (sel_field & 0b100) != 0 {
379 rs1
380 } else {
381 rs3
382 };
383 sel_field &= 0b11;
384 if sel_field as usize >= srcsubvl {
385 // handle illegal instruction trap
386 }
387 let value = src[vindex * srcsubvl + (sel_field as usize)];
388 rd[vindex * destsubvl + i] = value;
389 }
390 }
391 }
392