2 # SPDX-License-Identifier: LGPL-3-or-later
3 # See Notices.txt for copyright information
6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
15 from nmigen
import Signal
, Module
, Elaboratable
, Mux
, Cat
, Shape
, Repl
16 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
17 from nmigen
.cli
import rtlil
19 from collections
.abc
import Mapping
20 from functools
import reduce
22 from collections
import defaultdict
23 from pprint
import pprint
25 from ieee754
.part_mul_add
.partpoints
import PartitionPoints
28 # main fn, which started out here in the bugtracker:
29 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
30 # note that signed is **NOT** part of the layout, and will NOT
31 # be added (because it is not relevant or appropriate).
32 # sign belongs in ast.Shape and is the only appropriate location.
33 # there is absolutely nothing within this function that in any
34 # way requires a sign. it is *purely* performing numerical width
35 # computations that have absolutely nothing to do with whether the
36 # actual data is signed or unsigned.
def layout(elwid, vec_el_counts, lane_shapes=None, fixed_width=None):
    """Calculate a SIMD layout.

    Terminology:

    * element: a single scalar value that is an element of a SIMD vector.
        It has a width in bits. Every element is made of one or more
        partitions.
    * ElWid: the element-width (really the element type) of an instruction.
        Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
        In Python, `ElWid` is either an enum type or is `int`.
        Example `ElWid` definition for integers:

            class ElWid(Enum):
                I64 = ...       # SVP64 value 0b00
                I32 = ...       # SVP64 value 0b01
                I16 = ...       # SVP64 value 0b10
                I8 = ...        # SVP64 value 0b11

        Example `ElWid` definition for floats:

            class ElWid(Enum):
                F64 = ...       # SVP64 value 0b00
                F32 = ...       # SVP64 value 0b01
                F16 = ...       # SVP64 value 0b10
                BF16 = ...      # SVP64 value 0b11

    Parameters:

    * elwid: ElWid or nmigen Value with ElWid as the shape
        the current element-width. It is only ever compared (`==`) against
        the keys of `vec_el_counts`.
    * vec_el_counts: dict[ElWid, int]
        a map from `ElWid` values `k` to the number of vector elements
        required within a partition when `elwid == k`.

        Example for integers:

            vec_el_counts = {ElWid.I8(==0b11): 8,   # 8 vector elements
                             ElWid.I16(==0b10): 4,  # 4 vector elements
                             ElWid.I32(==0b01): 2,  # 2 vector elements
                             ElWid.I64(==0b00): 1}  # 1 (aka scalar) element

        Example for floats:

            vec_el_counts = {ElWid.BF16(==0b11): 4,  # 4 vector elements
                             ElWid.F16(==0b10): 4,   # 4 vector elements
                             ElWid.F32(==0b01): 2,   # 2 vector elements
                             ElWid.F64(==0b00): 1}   # 1 (aka scalar) element

    * lane_shapes: int or Mapping[ElWid, int] (optional)
        the bit-width of all elements in a SIMD layout. An int requests
        that same width at every elwidth. If not provided, the lane_shapes
        are computed from fixed_width and vec_el_counts at each elwidth.
    * fixed_width: int (optional)
        the total width of a SIMD vector. One or both of lane_shapes or
        fixed_width may be provided. Both may not be left out.

    Returns a 6-tuple:

    * PartitionPoints built from a dict mapping each breakpoint (a bit
      position) to the OR of `elwid == i` over every elwidth `i` that has
      a lane start or lane end at that position
    * bitp: dict mapping each elwidth to an int in which bit `n` is set
      when the n-th breakpoint (in ascending order) applies at that elwidth
    * bmask: int with a bit set for each breakpoint used by no elwidth
    * width: the overall width in bits
    * lane_shapes: the per-elwidth element widths, normalised to a dict
    * part_wid: `width // count` for the last entry of vec_el_counts

    Raises AssertionError if neither lane_shapes nor fixed_width is given,
    if the lanes do not fit in fixed_width, or if fixed_width is not a
    multiple of the largest element count.
    """
    # imported locally so that this function does not depend on a
    # module-level "import operator" being present
    import operator

    # when there are no lane_shapes specified, this indicates a
    # desire to use the maximum available space based on the fixed width
    # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
    if lane_shapes is None:
        assert fixed_width is not None, \
            "both fixed_width and lane_shapes cannot be None"
        lane_shapes = {i: fixed_width // vec_el_counts[i]
                       for i in vec_el_counts}
        print("lane_shapes", fixed_width, lane_shapes)

    # identify if the lane_shapes is a mapping (dict, etc.)
    # if not, then assume that it is an integer (width) that
    # needs to be requested across all partitions
    if not isinstance(lane_shapes, Mapping):
        lane_shapes = {i: lane_shapes for i in vec_el_counts}

    # compute the overall width: the largest (element width * element
    # count) over all elwidths.  cpart_wid tracks the element width that
    # produced that maximum and is only used in the debug output.
    print("lane_shapes", lane_shapes, "vec_el_counts", vec_el_counts)
    cpart_wid = 0
    width = 0
    for i, lwid in lane_shapes.items():
        required_width = lwid * vec_el_counts[i]
        print(" required width", cpart_wid, i, lwid, required_width)
        if required_width > width:
            cpart_wid = lwid
            width = required_width

    # calculate the minimum width required if fixed_width specified
    part_count = max(vec_el_counts.values())
    print("width", width, cpart_wid, part_count)
    if fixed_width is not None:  # override the width and part_wid
        assert width <= fixed_width, "not enough space to fit partitions"
        part_wid = fixed_width // part_count
        assert part_wid * part_count == fixed_width, \
            "calculated width not aligned multiples"
        width = fixed_width
        print("part_wid", part_wid, "count", part_count, "width", width)

    # create the breakpoints dictionary.
    # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
    # https://stackoverflow.com/questions/26367812/
    dpoints = defaultdict(list)  # if empty key, create a (empty) list
    for i, c in vec_el_counts.items():
        print("dpoints", i, "count", c)
        # calculate part_wid based on overall width divided by number
        # of elements at this elwidth
        part_wid = width // c
        # for each elwidth, create the required number of vector elements
        for start in range(c):
            start_bit = start * part_wid
            end_bit = start_bit + lane_shapes[i]
            # record both the start and the end of the lane
            for msg, p in (("start", start_bit), ("end ", end_bit)):
                print(" adding dpoint", msg, start, part_wid, i, c, p)
                dpoints[p].append(i)  # auto-creates list if key non-existent

    # do not need the breakpoints at the very start or the very end
    for edge in (0, width):
        dpoints.pop(edge, None)

    # sort by bit position and deduplicate each list of elwidths
    # (dict.fromkeys keeps first-seen order)
    dpoints = {p: list(dict.fromkeys(dpoints[p])) for p in sorted(dpoints)}

    # second stage, add (map to) the elwidth==i expressions.
    # TODO: use nmutil.treereduce?
    points = {}
    for p, elwidths in dpoints.items():
        points[p] = reduce(operator.or_, (elwid == i for i in elwidths))

    # third stage, create the binary values which *if* elwidth is set to i
    # *would* result in the mask at that elwidth being set to this value
    # these can easily be double-checked through Assertion
    bitp = {}
    for i in vec_el_counts:
        bitp[i] = 0
        for bit_index, elwidths in enumerate(dpoints.values()):
            if i in elwidths:
                bitp[i] |= 1 << bit_index

    # fourth stage: determine which partitions are 100% unused.
    # these can then be "blanked out"
    bmask = (1 << len(dpoints)) - 1
    for p in bitp.values():
        bmask &= ~p

    # NOTE(review): callers unpack six values; the sixth is taken to be
    # part_wid (as left by the last dpoints iteration) -- confirm.
    return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
            part_wid)
189 if __name__
== '__main__':
191 # for each element-width (elwidth 0-3) the number of Vector Elements is:
192 # elwidth=0b00 QTY 1 partitions: | ? |
193 # elwidth=0b01 QTY 1 partitions: | ? |
194 # elwidth=0b10 QTY 2 partitions: | ? | ? |
195 # elwidth=0b11 QTY 4 partitions: | ? | ? | ? | ? |
196 # actual widths of Signals *within* those partitions is given separately
    # width=3 indicates "same width Vector Elements (3) at all elwidths"
    # elwidth=0b00 1x 3-bit   |unused              ..3|
    # elwidth=0b01 1x 3-bit   |unused              ..3|
    # elwidth=0b10 2x 3-bit   |unused  ..3|unused  ..3|
    # elwidth=0b11 4x 3-bit   |  ..3|  ..3|  ..3|  ..3|
    # expected partitions    (^)    |     |     |    (^)
    # to be at these points: (|)    |     |     |    (|)
    #                        (12)   9     6     3    (0)
211 width_in_all_parts
= 3
214 pprint((i
, layout(i
, vec_el_counts
, width_in_all_parts
)))
216 # specify that the Vector Element lengths are to be *different* at
217 # each of the elwidths.
218 # combined with vec_el_counts we have:
219 # elwidth=0b00 1x 5-bit |<----unused---------->....5|
220 # elwidth=0b01 1x 6-bit |<----unused--------->.....6|
221 # elwidth=0b10 2x 6-bit |unused>.....6|unused>.....6|
222 # elwidth=0b11 4x 6-bit |.....6|.....6|.....6|.....6|
223 # expected partitions (^) ^ ^ ^^ (^)
224 # to be at these points: (|) | | || (|)
226 widths_at_elwidth
= {
233 print("5,6,6,6 elements", widths_at_elwidth
)
235 pp
, bitp
, bm
, b
, c
, d
= \
236 layout(i
, vec_el_counts
, widths_at_elwidth
)
237 pprint((i
, (pp
, bitp
, bm
, b
, c
, d
)))
238 # now check that the expected partition points occur
239 print("5,6,6,6 ppt keys", pp
.keys())
240 assert list(pp
.keys()) == [5, 6, 12, 18]
242 # this example was probably what the 5,6,6,6 one was supposed to be.
243 # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
244 # elwidth=0b00 1x 24-bit |.........................24|
245 # elwidth=0b01 1x 12-bit |<--unused--->|...........12|
246 # elwidth=0b10 2x 5 -bit |unused>|....5|unused>|....5|
247 # elwidth=0b11 4x 6 -bit |.....6|.....6|.....6|.....6|
248 # expected partitions (^) ^^ ^ ^^ (^)
249 # to be at these points: (|) || | || (|)
250 # (24) 1817 12 65 (0)
251 widths_at_elwidth
= {
258 print("24,12,5,6 elements", widths_at_elwidth
)
260 pp
, bitp
, bm
, b
, c
, d
= \
261 layout(i
, vec_el_counts
, widths_at_elwidth
)
262 pprint((i
, (pp
, bitp
, bm
, b
, c
, d
)))
263 # now check that the expected partition points occur
264 print("24,12,5,6 ppt keys", pp
.keys())
265 assert list(pp
.keys()) == [5, 6, 12, 17, 18]
267 # this tests elwidth as an actual Signal. layout is allowed to
268 # determine arbitrarily the overall length
269 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
272 pp
, bitp
, bm
, b
, c
, d
= layout(
273 elwid
, vec_el_counts
, widths_at_elwidth
)
274 pprint((pp
, b
, c
, d
))
275 for k
, v
in bitp
.items():
276 print("bitp elwidth=%d" % k
, bin(v
))
277 print("bmask", bin(bm
))
286 for pval
in list(pp
.values()):
287 val
= yield pval
# get nmigen to evaluate pp
289 pprint((i
, (ppt
, b
, c
, d
)))
290 # check the results against bitp static-expected partition points
291 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
292 # https://stackoverflow.com/a/27165694
293 ival
= int(''.join(map(str, ppt
[::-1])), 2)
294 assert ival
== bitp
[i
]
297 sim
.add_process(process
)
300 # this tests elwidth as an actual Signal. layout is *not* allowed to
301 # determine arbitrarily the overall length, it is fixed to 64
302 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
305 pp
, bitp
, bm
, b
, c
, d
= layout(elwid
, vec_el_counts
,
308 pprint((pp
, b
, c
, d
))
309 for k
, v
in bitp
.items():
310 print("bitp elwidth=%d" % k
, bin(v
))
311 print("bmask", bin(bm
))
320 for pval
in list(pp
.values()):
321 val
= yield pval
# get nmigen to evaluate pp
323 print("test elwidth=%d" % i
)
324 pprint((i
, (ppt
, b
, c
, d
)))
325 # check the results against bitp static-expected partition points
326 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
327 # https://stackoverflow.com/a/27165694
328 ival
= int(''.join(map(str, ppt
[::-1])), 2)
329 assert ival
== bitp
[i
], "ival %s actual %s" % (bin(ival
),
333 sim
.add_process(process
)
    # fixed_width=32 and no lane_shapes says "allocate maximum"
    # i.e. Vector Element Widths are auto-allocated
    # elwidth=0b00 1x 32-bit  |.....................32|
    # elwidth=0b01 1x 32-bit  |.....................32|
    # elwidth=0b10 2x 16-bit  |.........16|.........16|
    # elwidth=0b11 4x 8-bit   |....8|....8|....8|....8|
    # expected partitions    (^)    |     |     |    (^)
    # to be at these points: (|)    |     |     |    (|)
    #                        (32)   24    16    8    (0)
345 # TODO, fix this so that it is correct. put it at the end so it
346 # shows that things break and doesn't stop the other tests.
347 print("maximum allocation from fixed_width=32")
349 pprint((i
, layout(i
, vec_el_counts
, fixed_width
=32)))
352 # https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
353 # 1xFP64: 11 bits, one exponent
354 # 2xFP32: 8 bits, two exponents
355 # 4xFP16: 5 bits, four exponents
356 # 4xBF16: 8 bits, four exponents
363 widths_at_elwidth
= {
364 0: 11, # FP64 ew=0b00
372 # |31| | |24| 16|15 | | 8|7 0 |
373 # |31|28|26|24| |20|16| 12| |10|8|5|4 0 |
374 # 32bit | x| x| x| | x| x| x|10 .... 0 |
375 # 16bit | x| x|26 ... 16 | x| x|10 .... 0 |
376 # 8bit | x|28 .. 24| 20.16| x|11 .. 8|x|4.. 0 |
379 print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth
)
381 pp
, bitp
, bm
, b
, c
, d
= \
382 layout(i
, vec_el_counts
, widths_at_elwidth
,
384 pprint((i
, (pp
, bitp
, bin(bm
), b
, c
, d
)))
385 # now check that the expected partition points occur
386 print("11,8,5,8 pp keys", pp
.keys())
387 #assert list(pp.keys()) == [5,6,12,18]
390 ###### 2nd test, different from the above, elwid=0b10 ==> 11 bit ######
400 widths_at_elwidth
= {
401 0: 11, # FP64 ew=0b00
402 1: 11, # FP32 ew=0b01
409 # |31| | |24| 16|15 | | 8|7 0 |
410 # |31|28|26|24| |20|16| 12| |10|8|5|4 0 |
411 # 32bit | x| x| x| | x| x| x|10 .... 0 |
412 # 16bit | x| x|26 ... 16 | x| x|10 .... 0 |
413 # 8bit | x|28 .. 24| 20.16| x|11 .. 8|x|4.. 0 |
416 print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth
)
418 pp
, bitp
, bm
, b
, c
, d
= \
419 layout(i
, vec_el_counts
, widths_at_elwidth
,
421 pprint((i
, (pp
, bitp
, bin(bm
), b
, c
, d
)))
422 # now check that the expected partition points occur
423 print("11,8,5,8 pp keys", pp
.keys())
424 #assert list(pp.keys()) == [5,6,12,18]