d2f98d74b9c22d7e92bb72404eaf597734320e15
2 # SPDX-License-Identifier: LGPL-3-or-later
3 # See Notices.txt for copyright information
6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
15 from nmigen
import Signal
, Module
, Elaboratable
, Mux
, Cat
, Shape
, Repl
16 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
17 from nmigen
.cli
import rtlil
19 from collections
.abc
import Mapping
20 from functools
import reduce
22 from collections
import defaultdict
23 from pprint
import pprint
25 from ieee754
.part_mul_add
.partpoints
import PartitionPoints
28 # main fn, which started out here in the bugtracker:
29 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
30 # note that signed is **NOT** part of the layout, and will NOT
31 # be added (because it is not relevant or appropriate).
32 # sign belongs in ast.Shape and is the only appropriate location.
33 # there is absolutely nothing within this function that in any
34 # way requires a sign. it is *purely* performing numerical width
35 # computations that have absolutely nothing to do with whether the
36 # actual data is signed or unsigned.
37 def layout(elwid
, vec_el_counts
, lane_shapes
=None, fixed_width
=None):
38 """calculate a SIMD layout.
41 * element: a single scalar value that is an element of a SIMD vector.
42 it has a width in bits. Every element is made of 1 or
44 * ElWid: the element-width (really the element type) of an instruction.
45 Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
46 In Python, `ElWid` is either an enum type or is `int`.
47 Example `ElWid` definition for integers:
50 I64 = ... # SVP64 value 0b00
51 I32 = ... # SVP64 value 0b01
52 I16 = ... # SVP64 value 0b10
53 I8 = ... # SVP64 value 0b11
55 Example `ElWid` definition for floats:
58 F64 = ... # SVP64 value 0b00
59 F32 = ... # SVP64 value 0b01
60 F16 = ... # SVP64 value 0b10
61 BF16 = ... # SVP64 value 0b11
63 # XXX this is redundant and out-of-date with respect to the
64 # clarification that the input is in counts of *elements*
65 # *NOT* "fixed width parts".
66 # fixed-width parts results in 14 such parts being created
67 # when 5 will do, for a simple example 5-6-6-6
68 * part: A piece of a SIMD vector, every SIMD vector is made of a
69 non-negative integer of parts. Elements are made of a power-of-two
70 number of parts. A part is a fixed number of bits wide for each
71 different SIMD layout, it doesn't vary when `elwid` changes. A part
72 can have a bit width of any non-negative integer, it is not restricted
73 to power-of-two. SIMD vectors should have as few parts as necessary,
74 since some circuits have size proportional to the number of parts.
76 * elwid: ElWid or nmigen Value with ElWid as the shape
77 the current element-width
79 * vec_el_counts: dict[ElWid, int]
80 a map from `ElWid` values `k` to the number of vector elements
81 required within a partition when `elwid == k`.
84 vec_el_counts = {ElWid.I8(==0b11): 8, # 8 vector elements
85 ElWid.I16(==0b10): 4, # 4 vector elements
86 ElWid.I32(==0b01): 2, # 2 vector elements
87 ElWid.I64(==0b00): 1} # 1 vector (aka scalar) element
90 vec_el_counts = {ElWid.BF16(==0b11): 4, # 4 vector elements
91 ElWid.F16(==0b10): 4, # 4 vector elements
92 ElWid.F32(==0b01): 2, # 2 vector elements
93 ElWid.F64(==0b00): 1} # 1 (aka scalar) vector element
95 * lane_shapes: int or Mapping[ElWid, int] (optional)
96 the bit-width of all elements in a SIMD layout.
97 if not provided, the lane_shapes are computed from fixed_width
98 and vec_el_counts at each elwidth.
100 * fixed_width: int (optional)
101 the total width of a SIMD vector. One or both of lane_shapes or
102 fixed_width may be provided. Both may not be left out.
104 # when there are no lane_shapes specified, this indicates a
105 # desire to use the maximum available space based on the fixed width
106 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
107 if lane_shapes
is None:
108 assert fixed_width
is not None, \
109 "both fixed_width and lane_shapes cannot be None"
110 lane_shapes
= {i
: fixed_width
// vec_el_counts
[i
]
111 for i
in vec_el_counts
}
112 print("lane_shapes", fixed_width
, lane_shapes
)
114 # identify if the lane_shapes is a mapping (dict, etc.)
115 # if not, then assume that it is an integer (width) that
116 # needs to be requested across all partitions
117 if not isinstance(lane_shapes
, Mapping
):
118 lane_shapes
= {i
: lane_shapes
for i
in vec_el_counts
}
120 # compute a set of partition widths
121 print("lane_shapes", lane_shapes
, "vec_el_counts", vec_el_counts
)
124 for i
, lwid
in lane_shapes
.items():
125 required_width
= lwid
* vec_el_counts
[i
]
126 print(" required width", cpart_wid
, i
, lwid
, required_width
)
127 if required_width
> width
:
129 width
= required_width
131 # calculate the minumum width required if fixed_width specified
132 part_count
= max(vec_el_counts
.values())
133 print("width", width
, cpart_wid
, part_count
)
134 if fixed_width
is not None: # override the width and part_wid
135 assert width
<= fixed_width
, "not enough space to fit partitions"
136 part_wid
= fixed_width
// part_count
137 assert part_wid
* part_count
== fixed_width
, \
138 "calculated width not aligned multiples"
140 print("part_wid", part_wid
, "count", part_count
, "width", width
)
142 # create the breakpoints dictionary.
143 # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
144 # https://stackoverflow.com/questions/26367812/
145 dpoints
= defaultdict(list) # if empty key, create a (empty) list
146 for i
, c
in vec_el_counts
.items():
147 print ("dpoints", i
, "count", c
)
148 # calculate part_wid based on overall width divided by number
150 part_wid
= width
// c
151 def add_p(msg
, start
, p
):
152 print (" adding dpoint", msg
, start
, part_wid
, i
, c
, p
)
153 dpoints
[p
].append(i
) # auto-creates list if key non-existent
154 # for each elwidth, create the required number of vector elements
155 for start
in range(c
):
156 add_p("start", start
, start
* part_wid
) # start of lane
157 add_p("end ", start
, start
* part_wid
+ lane_shapes
[i
]) # end lane
159 # do not need the breakpoints at the very start or the very end
161 if fixed_width
is not None:
162 dpoints
.pop(fixed_width
, None)
164 dpoints
.pop(width
, None)
165 plist
= list(dpoints
.keys())
168 pprint(dict(dpoints
))
170 # second stage, add (map to) the elwidth==i expressions.
171 # TODO: use nmutil.treereduce?
174 points
[p
] = map(lambda i
: elwid
== i
, dpoints
[p
])
175 points
[p
] = reduce(operator
.or_
, points
[p
])
177 # third stage, create the binary values which *if* elwidth is set to i
178 # *would* result in the mask at that elwidth being set to this value
179 # these can easily be double-checked through Assertion
181 for i
in vec_el_counts
.keys():
183 for p
, elwidths
in dpoints
.items():
185 bitpos
= plist
.index(p
)
186 bitp
[i
] |
= 1 << bitpos
188 # fourth stage: determine which partitions are 100% unused.
189 # these can then be "blanked out"
190 bmask
= (1 << len(plist
))-1
191 for p
in bitp
.values():
193 return (PartitionPoints(points
), bitp
, bmask
, width
, lane_shapes
,
197 if __name__
== '__main__':
199 # for each element-width (elwidth 0-3) the number of Vector Elements is:
200 # elwidth=0b00 QTY 1 partitions: | ? |
201 # elwidth=0b01 QTY 1 partitions: | ? |
202 # elwidth=0b10 QTY 2 partitions: | ? | ? |
203 # elwidth=0b11 QTY 4 partitions: | ? | ? | ? | ? |
204 # actual widths of Signals *within* those partitions is given separately
212 # width=3 indicates "same width Vector Elements (3) at all elwidths"
213 # elwidth=0b00 1x 5-bit | unused xx ..3 |
214 # elwidth=0b01 1x 6-bit | unused xx ..3 |
215 # elwidth=0b10 2x 12-bit | xxx ..3 | xxx ..3 |
216 # elwidth=0b11 3x 24-bit | ..3| ..3 | ..3 |..3 |
217 # expected partitions (^) | | | (^)
218 # to be at these points: (|) | | | |
219 width_in_all_parts
= 3
222 pprint((i
, layout(i
, vec_el_counts
, width_in_all_parts
)))
224 # specify that the Vector Element lengths are to be *different* at
225 # each of the elwidths.
226 # combined with vec_el_counts we have:
227 # elwidth=0b00 1x 5-bit |<----unused---------->....5|
228 # elwidth=0b01 1x 6-bit |<----unused--------->.....6|
229 # elwidth=0b10 2x 6-bit |unused>.....6|unused>.....6|
230 # elwidth=0b11 4x 6-bit |.....6|.....6|.....6|.....6|
231 # expected partitions (^) ^ ^ ^^ (^)
232 # to be at these points: (|) | | || (|)
234 widths_at_elwidth
= {
241 print ("5,6,6,6 elements", widths_at_elwidth
)
243 pp
, bitp
, bm
, b
, c
, d
= \
244 layout(i
, vec_el_counts
, widths_at_elwidth
)
245 pprint((i
, (pp
, bitp
, bm
, b
, c
, d
)))
246 # now check that the expected partition points occur
247 print("5,6,6,6 ppt keys", pp
.keys())
248 assert list(pp
.keys()) == [5,6,12,18]
250 # this example was probably what the 5,6,6,6 one was supposed to be.
251 # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
252 # elwidth=0b00 1x 24-bit |.........................24|
253 # elwidth=0b01 1x 12-bit |<--unused--->|...........12|
254 # elwidth=0b10 2x 5 -bit |unused>|....5|unused>|....5|
255 # elwidth=0b11 4x 6 -bit |.....6|.....6|.....6|.....6|
256 # expected partitions (^) ^^ ^ ^^ (^)
257 # to be at these points: (|) || | || (|)
258 # (24) 1817 12 65 (0)
259 widths_at_elwidth
= {
266 print ("24,12,5,6 elements", widths_at_elwidth
)
268 pp
, bitp
, bm
, b
, c
, d
= \
269 layout(i
, vec_el_counts
, widths_at_elwidth
)
270 pprint((i
, (pp
, bitp
, bm
, b
, c
, d
)))
271 # now check that the expected partition points occur
272 print("24,12,5,6 ppt keys", pp
.keys())
273 assert list(pp
.keys()) == [5,6,12,17,18]
276 # this tests elwidth as an actual Signal. layout is allowed to
277 # determine arbitrarily the overall length
278 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
281 pp
, bitp
, bm
, b
, c
, d
= layout(
282 elwid
, vec_el_counts
, widths_at_elwidth
)
283 pprint((pp
, b
, c
, d
))
284 for k
, v
in bitp
.items():
285 print("bitp elwidth=%d" % k
, bin(v
))
286 print("bmask", bin(bm
))
295 for pval
in list(pp
.values()):
296 val
= yield pval
# get nmigen to evaluate pp
298 pprint((i
, (ppt
, b
, c
, d
)))
299 # check the results against bitp static-expected partition points
300 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
301 # https://stackoverflow.com/a/27165694
302 ival
= int(''.join(map(str, ppt
[::-1])), 2)
303 assert ival
== bitp
[i
]
306 sim
.add_process(process
)
309 # this tests elwidth as an actual Signal. layout is *not* allowed to
310 # determine arbitrarily the overall length, it is fixed to 64
311 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
314 pp
, bitp
, bm
, b
, c
, d
= layout(elwid
, vec_el_counts
,
317 pprint((pp
, b
, c
, d
))
318 for k
, v
in bitp
.items():
319 print("bitp elwidth=%d" % k
, bin(v
))
320 print("bmask", bin(bm
))
329 for pval
in list(pp
.values()):
330 val
= yield pval
# get nmigen to evaluate pp
332 print("test elwidth=%d" % i
)
333 pprint((i
, (ppt
, b
, c
, d
)))
334 # check the results against bitp static-expected partition points
335 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
336 # https://stackoverflow.com/a/27165694
337 ival
= int(''.join(map(str, ppt
[::-1])), 2)
338 assert ival
== bitp
[i
], "ival %s actual %s" % (bin(ival
),
342 sim
.add_process(process
)
345 # fixed_width=32 and no lane_widths says "allocate maximum"
346 # i.e. Vector Element Widths are auto-allocated
347 # elwidth=0b00 1x 32-bit | .................32 |
348 # elwidth=0b01 1x 32-bit | .................32 |
349 # elwidth=0b10 2x 12-bit | ......16 | ......16 |
350 # elwidth=0b11 3x 24-bit | ..8| ..8 | ..8 |..8 |
351 # expected partitions (^) | | | (^)
352 # to be at these points: (|) | | | |
354 # TODO, fix this so that it is correct. put it at the end so it
355 # shows that things break and doesn't stop the other tests.
356 print ("maximum allocation from fixed_width=32")
358 pprint((i
, layout(i
, vec_el_counts
, fixed_width
=32)))
361 # https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
362 # 1xFP64: 11 bits, one exponent
363 # 2xFP32: 8 bits, two exponents
364 # 4xFP16: 5 bits, four exponents
365 # 4xBF16: 8 bits, four exponents
372 widths_at_elwidth
= {
373 0: 11, # FP64 ew=0b00
381 # |31| | |24| 16|15 | | 8|7 0 |
382 # |31|28|26|24| |20|16| 12| |10|8|5|4 0 |
383 # 32bit | x| x| x| | x| x| x|10 .... 0 |
384 # 16bit | x| x|26 ... 16 | x| x|10 .... 0 |
385 # 8bit | x|28 .. 24| 20.16| x|11 .. 8|x|4.. 0 |
388 print ("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth
)
390 pp
, bitp
, bm
, b
, c
, d
= \
391 layout(i
, vec_el_counts
, widths_at_elwidth
,
393 pprint((i
, (pp
, bitp
, bin(bm
), b
, c
, d
)))
394 # now check that the expected partition points occur
395 print("11,8,5,8 pp keys", pp
.keys())
396 #assert list(pp.keys()) == [5,6,12,18]
399 ###### 2nd test, different from the above, elwid=0b10 ==> 11 bit ######
409 widths_at_elwidth
= {
410 0: 11, # FP64 ew=0b00
411 1: 11, # FP32 ew=0b01
418 # |31| | |24| 16|15 | | 8|7 0 |
419 # |31|28|26|24| |20|16| 12| |10|8|5|4 0 |
420 # 32bit | x| x| x| | x| x| x|10 .... 0 |
421 # 16bit | x| x|26 ... 16 | x| x|10 .... 0 |
422 # 8bit | x|28 .. 24| 20.16| x|11 .. 8|x|4.. 0 |
425 print ("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth
)
427 pp
, bitp
, bm
, b
, c
, d
= \
428 layout(i
, vec_el_counts
, widths_at_elwidth
,
430 pprint((i
, (pp
, bitp
, bin(bm
), b
, c
, d
)))
431 # now check that the expected partition points occur
432 print("11,8,5,8 pp keys", pp
.keys())
433 #assert list(pp.keys()) == [5,6,12,18]