src/ieee754/part/layout_experiment.py

   1 #!/usr/bin/env python3
   2 # SPDX-License-Identifier: LGPL-3-or-later
   3 # See Notices.txt for copyright information
   4 """
   5 Links:
   6 * https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
   7 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
   8 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
   9 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
  10 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
  11 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
  12 * https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
  13 """
  14
  15 from nmigen import Signal, Module, Elaboratable, Mux, Cat, Shape, Repl
  16 from nmigen.back.pysim import Simulator, Delay, Settle
  17 from nmigen.cli import rtlil
  18
  19 from collections.abc import Mapping
  20 from functools import reduce
  21 import operator
  22 from collections import defaultdict
  23 from pprint import pprint
  24
  25 from ieee754.part_mul_add.partpoints import PartitionPoints
  26
  27
  28 # main fn, which started out here in the bugtracker:
  29 # https://bugs.libre-soc.org/show_bug.cgi?id=713#c20
  30 # note that signed is **NOT** part of the layout, and will NOT
  31 # be added (because it is not relevant or appropriate).
  32 # sign belongs in ast.Shape and is the only appropriate location.
  33 # there is absolutely nothing within this function that in any
  34 # way requires a sign.  it is *purely* performing numerical width
  35 # computations that have absolutely nothing to do with whether the
  36 # actual data is signed or unsigned.
  37 def layout(elwid, vec_el_counts, lane_shapes=None, fixed_width=None):
  38     """calculate a SIMD layout.
  39
  40     Glossary:
  41     * element: a single scalar value that is an element of a SIMD vector.
  42         it has a width in bits. Every element is made of 1 or
  43         more parts.
  44     * ElWid: the element-width (really the element type) of an instruction.
  45         Either an integer or a FP type. Integer `ElWid`s are sign-agnostic.
  46         In Python, `ElWid` is either an enum type or is `int`.
  47         Example `ElWid` definition for integers:
  48
  49         class ElWid(Enum):
  50             I64 = ...       # SVP64 value 0b00
  51             I32 = ...       # SVP64 value 0b01
  52             I16 = ...       # SVP64 value 0b10
  53             I8 = ...        # SVP64 value 0b11
  54
  55         Example `ElWid` definition for floats:
  56
  57         class ElWid(Enum):
  58             F64 = ...    # SVP64 value 0b00
  59             F32 = ...    # SVP64 value 0b01
  60             F16 = ...    # SVP64 value 0b10
  61             BF16 = ...   # SVP64 value 0b11
  62
  63     * elwid: ElWid or nmigen Value with ElWid as the shape
  64         the current element-width
  65
  66     * vec_el_counts: dict[ElWid, int]
  67         a map from `ElWid` values `k` to the number of vector elements
  68         required within a partition when `elwid == k`.
  69
  70         Example:
  71         vec_el_counts = {ElWid.I8(==0b11): 8, # 8 vector elements
  72                        ElWid.I16(==0b10): 4,  # 4 vector elements
  73                        ElWid.I32(==0b01): 2,  # 2 vector elements
  74                        ElWid.I64(==0b00): 1}  # 1 vector (aka scalar) element
  75
  76         Another Example:
  77         vec_el_counts = {ElWid.BF16(==0b11): 4, # 4 vector elements
  78                          ElWid.F16(==0b10): 4,  # 4 vector elements
  79                          ElWid.F32(==0b01): 2,  # 2 vector elements
  80                          ElWid.F64(==0b00): 1}  # 1 (aka scalar) vector element
  81
  82     * lane_shapes: int or Mapping[ElWid, int] (optional)
  83         the bit-width of all elements in a SIMD layout.
  84         if not provided, the lane_shapes are computed from fixed_width
  85         and vec_el_counts at each elwidth.
  86
  87     * fixed_width: int (optional)
  88         the total width of a SIMD vector. One or both of lane_shapes or
  89         fixed_width may be provided.  Both may not be left out.
  90     """
  91     # when there are no lane_shapes specified, this indicates a
  92     # desire to use the maximum available space based on the fixed width
  93     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c67
  94     if lane_shapes is None:
  95         assert fixed_width is not None, \
  96             "both fixed_width and lane_shapes cannot be None"
  97         lane_shapes = {i: fixed_width // vec_el_counts[i]
  98                        for i in vec_el_counts}
  99         print("lane_shapes", fixed_width, lane_shapes)
 100
 101     # identify if the lane_shapes is a mapping (dict, etc.)
 102     # if not, then assume that it is an integer (width) that
 103     # needs to be requested across all partitions
 104     if not isinstance(lane_shapes, Mapping):
 105         lane_shapes = {i: lane_shapes for i in vec_el_counts}
 106
 107     # compute a set of partition widths
 108     print("lane_shapes", lane_shapes, "vec_el_counts", vec_el_counts)
 109     cpart_wid = 0
 110     width = 0
 111     for i, lwid in lane_shapes.items():
 112         required_width = lwid * vec_el_counts[i]
 113         print("     required width", cpart_wid, i, lwid, required_width)
 114         if required_width > width:
 115             cpart_wid = lwid
 116             width = required_width
 117
 118     # calculate the minumum width required if fixed_width specified
 119     part_count = max(vec_el_counts.values())
 120     print("width", width, cpart_wid, part_count)
 121     if fixed_width is not None:  # override the width and part_wid
 122         assert width <= fixed_width, "not enough space to fit partitions"
 123         part_wid = fixed_width // part_count
 124         assert part_wid * part_count == fixed_width, \
 125             "calculated width not aligned multiples"
 126         width = fixed_width
 127         print("part_wid", part_wid, "count", part_count, "width", width)
 128
 129     # create the breakpoints dictionary.
 130     # do multi-stage version https://bugs.libre-soc.org/show_bug.cgi?id=713#c34
 131     # https://stackoverflow.com/questions/26367812/
 132     dpoints = defaultdict(list)  # if empty key, create a (empty) list
 133     for i, c in vec_el_counts.items():
 134         print("dpoints", i, "count", c)
 135         # calculate part_wid based on overall width divided by number
 136         # of elements.
 137         part_wid = width // c
 138
 139         def add_p(msg, start, p):
 140             print("    adding dpoint", msg, start, part_wid, i, c, p)
 141             dpoints[p].append(i)  # auto-creates list if key non-existent
 142         # for each elwidth, create the required number of vector elements
 143         for start in range(c):
 144             start_bit = start * part_wid
 145             end_bit = start_bit + lane_shapes[i]
 146             add_p("start", start, start_bit)  # start of lane
 147             add_p("end  ", start, end_bit)  # end lane
 148
 149     # deduplicate dpoints lists
 150     for k in dpoints.keys():
 151         dpoints[k] = list({i: None for i in dpoints[k]}.keys())
 152
 153     # do not need the breakpoints at the very start or the very end
 154     dpoints.pop(0, None)
 155     dpoints.pop(width, None)
 156
 157     # sort dpoints keys
 158     dpoints = dict(sorted(dpoints.items(), key=lambda i: i[0]))
 159
 160     print("dpoints")
 161     pprint(dpoints)
 162
 163     # second stage, add (map to) the elwidth==i expressions.
 164     # TODO: use nmutil.treereduce?
 165     points = {}
 166     for p in dpoints.keys():
 167         points[p] = map(lambda i: elwid == i, dpoints[p])
 168         points[p] = reduce(operator.or_, points[p])
 169
 170     # third stage, create the binary values which *if* elwidth is set to i
 171     # *would* result in the mask at that elwidth being set to this value
 172     # these can easily be double-checked through Assertion
 173     bitp = {}
 174     for i in vec_el_counts.keys():
 175         bitp[i] = 0
 176         for bit_index, (p, elwidths) in enumerate(dpoints.items()):
 177             if i in elwidths:
 178                 bitp[i] |= 1 << bit_index
 179
 180     # fourth stage: determine which partitions are 100% unused.
 181     # these can then be "blanked out"
 182     bmask = (1 << len(dpoints)) - 1
 183     for p in bitp.values():
 184         bmask &= ~p
 185     return (PartitionPoints(points), bitp, bmask, width, lane_shapes,
 186             part_wid)
 187
 188
 189 if __name__ == '__main__':
 190
 191     # for each element-width (elwidth 0-3) the number of Vector Elements is:
 192     # elwidth=0b00 QTY 1 partitions:   |          ?          |
 193     # elwidth=0b01 QTY 1 partitions:   |          ?          |
 194     # elwidth=0b10 QTY 2 partitions:   |    ?     |     ?    |
 195     # elwidth=0b11 QTY 4 partitions:   | ?  |  ?  |  ?  | ?  |
 196     # actual widths of Signals *within* those partitions is given separately
 197     vec_el_counts = {
 198         0: 1,
 199         1: 1,
 200         2: 2,
 201         3: 4,
 202     }
 203
 204     # width=3 indicates "same width Vector Elements (3) at all elwidths"
 205     # elwidth=0b00 1x 5-bit     |  unused xx      ..3 |
 206     # elwidth=0b01 1x 6-bit     |  unused xx      ..3 |
 207     # elwidth=0b10 2x 12-bit    | xxx  ..3 | xxx  ..3 |
 208     # elwidth=0b11 3x 24-bit    | ..3| ..3 | ..3 |..3 |
 209     # expected partitions      (^)   |     |     |   (^)
 210     # to be at these points:   (|)   |     |     |    |
 211     width_in_all_parts = 3
 212
 213     for i in range(4):
 214         pprint((i, layout(i, vec_el_counts, width_in_all_parts)))
 215
 216     # specify that the Vector Element lengths are to be *different* at
 217     # each of the elwidths.
 218     # combined with vec_el_counts we have:
 219     # elwidth=0b00 1x 5-bit    |<----unused---------->....5|
 220     # elwidth=0b01 1x 6-bit    |<----unused--------->.....6|
 221     # elwidth=0b10 2x 6-bit    |unused>.....6|unused>.....6|
 222     # elwidth=0b11 4x 6-bit    |.....6|.....6|.....6|.....6|
 223     # expected partitions     (^)     ^      ^      ^^    (^)
 224     # to be at these points:  (|)     |      |      ||    (|)
 225     #                         (24)   18     12      65    (0)
 226     widths_at_elwidth = {
 227         0: 5,
 228         1: 6,
 229         2: 6,
 230         3: 6
 231     }
 232
 233     print("5,6,6,6 elements", widths_at_elwidth)
 234     for i in range(4):
 235         pp, bitp, bm, b, c, d = \
 236             layout(i, vec_el_counts, widths_at_elwidth)
 237         pprint((i, (pp, bitp, bm, b, c, d)))
 238     # now check that the expected partition points occur
 239     print("5,6,6,6 ppt keys", pp.keys())
 240     assert list(pp.keys()) == [5, 6, 12, 18]
 241
 242     # this example was probably what the 5,6,6,6 one was supposed to be.
 243     # combined with vec_el_counts {0:1, 1:1, 2:2, 3:4} we have:
 244     # elwidth=0b00 1x 24-bit    |.........................24|
 245     # elwidth=0b01 1x 12-bit    |<--unused--->|...........12|
 246     # elwidth=0b10 2x 5 -bit    |unused>|....5|unused>|....5|
 247     # elwidth=0b11 4x 6 -bit    |.....6|.....6|.....6|.....6|
 248     # expected partitions      (^)     ^^     ^       ^^    (^)
 249     # to be at these points:   (|)     ||     |       ||    (|)
 250     #                          (24)   1817   12       65    (0)
 251     widths_at_elwidth = {
 252         0: 24,  # QTY 1x 24
 253         1: 12,  # QTY 1x 12
 254         2: 5,   # QTY 2x 5
 255         3: 6    # QTY 4x 6
 256     }
 257
 258     print("24,12,5,6 elements", widths_at_elwidth)
 259     for i in range(4):
 260         pp, bitp, bm, b, c, d = \
 261             layout(i, vec_el_counts, widths_at_elwidth)
 262         pprint((i, (pp, bitp, bm, b, c, d)))
 263     # now check that the expected partition points occur
 264     print("24,12,5,6 ppt keys", pp.keys())
 265     assert list(pp.keys()) == [5, 6, 12, 17, 18]
 266
 267     # this tests elwidth as an actual Signal. layout is allowed to
 268     # determine arbitrarily the overall length
 269     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c30
 270
 271     elwid = Signal(2)
 272     pp, bitp, bm, b, c, d = layout(
 273         elwid, vec_el_counts, widths_at_elwidth)
 274     pprint((pp, b, c, d))
 275     for k, v in bitp.items():
 276         print("bitp elwidth=%d" % k, bin(v))
 277     print("bmask", bin(bm))
 278
 279     m = Module()
 280
 281     def process():
 282         for i in range(4):
 283             yield elwid.eq(i)
 284             yield Settle()
 285             ppt = []
 286             for pval in list(pp.values()):
 287                 val = yield pval  # get nmigen to evaluate pp
 288                 ppt.append(val)
 289             pprint((i, (ppt, b, c, d)))
 290             # check the results against bitp static-expected partition points
 291             # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
 292             # https://stackoverflow.com/a/27165694
 293             ival = int(''.join(map(str, ppt[::-1])), 2)
 294             assert ival == bitp[i]
 295
 296     sim = Simulator(m)
 297     sim.add_process(process)
 298     sim.run()
 299
 300     # this tests elwidth as an actual Signal. layout is *not* allowed to
 301     # determine arbitrarily the overall length, it is fixed to 64
 302     # https://bugs.libre-soc.org/show_bug.cgi?id=713#c22
 303
 304     elwid = Signal(2)
 305     pp, bitp, bm, b, c, d = layout(elwid, vec_el_counts,
 306                                    widths_at_elwidth,
 307                                    fixed_width=64)
 308     pprint((pp, b, c, d))
 309     for k, v in bitp.items():
 310         print("bitp elwidth=%d" % k, bin(v))
 311     print("bmask", bin(bm))
 312
 313     m = Module()
 314
 315     def process():
 316         for i in range(4):
 317             yield elwid.eq(i)
 318             yield Settle()
 319             ppt = []
 320             for pval in list(pp.values()):
 321                 val = yield pval  # get nmigen to evaluate pp
 322                 ppt.append(val)
 323             print("test elwidth=%d" % i)
 324             pprint((i, (ppt, b, c, d)))
 325             # check the results against bitp static-expected partition points
 326             # https://bugs.libre-soc.org/show_bug.cgi?id=713#c47
 327             # https://stackoverflow.com/a/27165694
 328             ival = int(''.join(map(str, ppt[::-1])), 2)
 329             assert ival == bitp[i], "ival %s actual %s" % (bin(ival),
 330                                                            bin(bitp[i]))
 331
 332     sim = Simulator(m)
 333     sim.add_process(process)
 334     sim.run()
 335
 336     # fixed_width=32 and no lane_widths says "allocate maximum"
 337     # i.e. Vector Element Widths are auto-allocated
 338     # elwidth=0b00 1x 32-bit    | .................32 |
 339     # elwidth=0b01 1x 32-bit    | .................32 |
 340     # elwidth=0b10 2x 12-bit    | ......16 | ......16 |
 341     # elwidth=0b11 3x 24-bit    | ..8| ..8 | ..8 |..8 |
 342     # expected partitions      (^)   |     |     |   (^)
 343     # to be at these points:   (|)   |     |     |    |
 344
 345     # TODO, fix this so that it is correct.  put it at the end so it
 346     # shows that things break and doesn't stop the other tests.
 347     print("maximum allocation from fixed_width=32")
 348     for i in range(4):
 349         pprint((i, layout(i, vec_el_counts, fixed_width=32)))
 350
 351     # example "exponent"
 352     #  https://libre-soc.org/3d_gpu/architecture/dynamic_simd/shape/
 353     # 1xFP64: 11 bits, one exponent
 354     # 2xFP32: 8 bits, two exponents
 355     # 4xFP16: 5 bits, four exponents
 356     # 4xBF16: 8 bits, four exponents
 357     vec_el_counts = {
 358         0: 1,  # QTY 1x FP64
 359         1: 2,  # QTY 2x FP32
 360         2: 4,  # QTY 4x FP16
 361         3: 4,  # QTY 4x BF16
 362     }
 363     widths_at_elwidth = {
 364         0: 11,  # FP64 ew=0b00
 365         1: 8,  # FP32 ew=0b01
 366         2: 5,  # FP16 ew=0b10
 367         3: 8   # BF16 ew=0b11
 368     }
 369
 370     # expected results:
 371     #
 372     #        |31|  |  |24|     16|15  |  |   8|7     0 |
 373     #        |31|28|26|24| |20|16|  12|  |10|8|5|4   0 |
 374     #  32bit | x| x| x|  |      x|   x| x|10 ....    0 |
 375     #  16bit | x| x|26    ... 16 |   x| x|10 ....    0 |
 376     #  8bit  | x|28 .. 24|  20.16|   x|11 .. 8|x|4.. 0 |
 377     #  unused  x                     x
 378
 379     print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth)
 380     for i in range(4):
 381         pp, bitp, bm, b, c, d = \
 382             layout(i, vec_el_counts, widths_at_elwidth,
 383                    fixed_width=32)
 384         pprint((i, (pp, bitp, bin(bm), b, c, d)))
 385     # now check that the expected partition points occur
 386     print("11,8,5,8 pp keys", pp.keys())
 387     #assert list(pp.keys()) == [5,6,12,18]
 388
 389     ######                                                           ######
 390     ###### 2nd test, different from the above, elwid=0b10 ==> 11 bit ######
 391     ######                                                           ######
 392
 393     # example "exponent"
 394     vec_el_counts = {
 395         0: 1,  # QTY 1x FP64
 396         1: 2,  # QTY 2x FP32
 397         2: 4,  # QTY 4x FP16
 398         3: 4,  # QTY 4x BF16
 399     }
 400     widths_at_elwidth = {
 401         0: 11,  # FP64 ew=0b00
 402         1: 11,  # FP32 ew=0b01
 403         2: 5,  # FP16 ew=0b10
 404         3: 8   # BF16 ew=0b11
 405     }
 406
 407     # expected results:
 408     #
 409     #        |31|  |  |24|     16|15  |  |   8|7     0 |
 410     #        |31|28|26|24| |20|16|  12|  |10|8|5|4   0 |
 411     #  32bit | x| x| x|  |      x|   x| x|10 ....    0 |
 412     #  16bit | x| x|26    ... 16 |   x| x|10 ....    0 |
 413     #  8bit  | x|28 .. 24|  20.16|   x|11 .. 8|x|4.. 0 |
 414     #  unused  x                     x
 415
 416     print("11,8,5,8 elements (FP64/32/16/BF exponents)", widths_at_elwidth)
 417     for i in range(4):
 418         pp, bitp, bm, b, c, d = \
 419             layout(i, vec_el_counts, widths_at_elwidth,
 420                    fixed_width=32)
 421         pprint((i, (pp, bitp, bin(bm), b, c, d)))
 422     # now check that the expected partition points occur
 423     print("11,8,5,8 pp keys", pp.keys())
 424     #assert list(pp.keys()) == [5,6,12,18]