src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         """
  82         bits = []
  83         for i in range(width):
  84             if i in self:
  85                 bits.append(~self[i])
  86             else:
  87                 bits.append(True)
  88         return Cat(*bits)
  89
  90     def get_max_partition_count(self, width):
  91         """Get the maximum number of partitions.
  92
  93         Gets the number of partitions when all partition points are enabled.
  94         """
  95         retval = 1
  96         for point in self.keys():
  97             if point < width:
  98                 retval += 1
  99         return retval
 100
 101     def fits_in_width(self, width):
 102         """Check if all partition points are smaller than `width`."""
 103         for point in self.keys():
 104             if point >= width:
 105                 return False
 106         return True
 107
 108     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 109         if index == -1 or index == 7:
 110             return C(True, 1)
 111         assert index >= 0 and index < 8
 112         return self[(index * 8 + 8)*mfactor]
 113
 114
 115 class FullAdder(Elaboratable):
 116     """Full Adder.
 117
 118     :attribute in0: the first input
 119     :attribute in1: the second input
 120     :attribute in2: the third input
 121     :attribute sum: the sum output
 122     :attribute carry: the carry output
 123
 124     Rather than do individual full adders (and have an array of them,
 125     which would be very slow to simulate), this module can specify the
 126     bit width of the inputs and outputs: in effect it performs multiple
 127     Full 3-2 Add operations "in parallel".
 128     """
 129
 130     def __init__(self, width):
 131         """Create a ``FullAdder``.
 132
 133         :param width: the bit width of the input and output
 134         """
 135         self.in0 = Signal(width)
 136         self.in1 = Signal(width)
 137         self.in2 = Signal(width)
 138         self.sum = Signal(width)
 139         self.carry = Signal(width)
 140
 141     def elaborate(self, platform):
 142         """Elaborate this module."""
 143         m = Module()
 144         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 145         m.d.comb += self.carry.eq((self.in0 & self.in1)
 146                                   | (self.in1 & self.in2)
 147                                   | (self.in2 & self.in0))
 148         return m
 149
 150
 151 class MaskedFullAdder(Elaboratable):
 152     """Masked Full Adder.
 153
 154     :attribute mask: the carry partition mask
 155     :attribute in0: the first input
 156     :attribute in1: the second input
 157     :attribute in2: the third input
 158     :attribute sum: the sum output
 159     :attribute mcarry: the masked carry output
 160
 161     FullAdders are always used with a "mask" on the output.  To keep
 162     the graphviz "clean", this class performs the masking here rather
 163     than inside a large for-loop.
 164
 165     See the following discussion as to why this is no longer derived
 166     from FullAdder.  Each carry is shifted here *before* being ANDed
 167     with the mask, so that an AOI cell may be used (which is more
 168     gate-efficient)
 169     https://en.wikipedia.org/wiki/AND-OR-Invert
 170     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 171     """
 172
 173     def __init__(self, width):
 174         """Create a ``MaskedFullAdder``.
 175
 176         :param width: the bit width of the input and output
 177         """
 178         self.width = width
 179         self.mask = Signal(width, reset_less=True)
 180         self.mcarry = Signal(width, reset_less=True)
 181         self.in0 = Signal(width, reset_less=True)
 182         self.in1 = Signal(width, reset_less=True)
 183         self.in2 = Signal(width, reset_less=True)
 184         self.sum = Signal(width, reset_less=True)
 185
 186     def elaborate(self, platform):
 187         """Elaborate this module."""
 188         m = Module()
 189         s1 = Signal(self.width, reset_less=True)
 190         s2 = Signal(self.width, reset_less=True)
 191         s3 = Signal(self.width, reset_less=True)
 192         c1 = Signal(self.width, reset_less=True)
 193         c2 = Signal(self.width, reset_less=True)
 194         c3 = Signal(self.width, reset_less=True)
 195         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 196         m.d.comb += s1.eq(Cat(0, self.in0))
 197         m.d.comb += s2.eq(Cat(0, self.in1))
 198         m.d.comb += s3.eq(Cat(0, self.in2))
 199         m.d.comb += c1.eq(s1 & s2 & self.mask)
 200         m.d.comb += c2.eq(s2 & s3 & self.mask)
 201         m.d.comb += c3.eq(s3 & s1 & self.mask)
 202         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 203         return m
 204
 205
 206 class PartitionedAdder(Elaboratable):
 207     """Partitioned Adder.
 208
 209     Performs the final add.  The partition points are included in the
 210     actual add (in one of the operands only), which causes a carry over
 211     to the next bit.  Then the final output *removes* the extra bits from
 212     the result.
 213
 214     partition: .... P... P... P... P... (32 bits)
 215     a        : .... .... .... .... .... (32 bits)
 216     b        : .... .... .... .... .... (32 bits)
 217     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 218     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 219     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 220     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 221
 222     :attribute width: the bit width of the input and output. Read-only.
 223     :attribute a: the first input to the adder
 224     :attribute b: the second input to the adder
 225     :attribute output: the sum output
 226     :attribute partition_points: the input partition points. Modification not
 227         supported, except for by ``Signal.eq``.
 228     """
 229
 230     def __init__(self, width, partition_points):
 231         """Create a ``PartitionedAdder``.
 232
 233         :param width: the bit width of the input and output
 234         :param partition_points: the input partition points
 235         """
 236         self.width = width
 237         self.a = Signal(width)
 238         self.b = Signal(width)
 239         self.output = Signal(width)
 240         self.partition_points = PartitionPoints(partition_points)
 241         if not self.partition_points.fits_in_width(width):
 242             raise ValueError("partition_points doesn't fit in width")
 243         expanded_width = 0
 244         for i in range(self.width):
 245             if i in self.partition_points:
 246                 expanded_width += 1
 247             expanded_width += 1
 248         self._expanded_width = expanded_width
 249         # XXX these have to remain here due to some horrible nmigen
 250         # simulation bugs involving sync.  it is *not* necessary to
 251         # have them here, they should (under normal circumstances)
 252         # be moved into elaborate, as they are entirely local
 253         self._expanded_a = Signal(expanded_width) # includes extra part-points
 254         self._expanded_b = Signal(expanded_width) # likewise.
 255         self._expanded_o = Signal(expanded_width) # likewise.
 256
 257     def elaborate(self, platform):
 258         """Elaborate this module."""
 259         m = Module()
 260         expanded_index = 0
 261         # store bits in a list, use Cat later.  graphviz is much cleaner
 262         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 263
 264         # partition points are "breaks" (extra zeros or 1s) in what would
 265         # otherwise be a massive long add.  when the "break" points are 0,
 266         # whatever is in it (in the output) is discarded.  however when
 267         # there is a "1", it causes a roll-over carry to the *next* bit.
 268         # we still ignore the "break" bit in the [intermediate] output,
 269         # however by that time we've got the effect that we wanted: the
 270         # carry has been carried *over* the break point.
 271
 272         for i in range(self.width):
 273             if i in self.partition_points:
 274                 # add extra bit set to 0 + 0 for enabled partition points
 275                 # and 1 + 0 for disabled partition points
 276                 ea.append(self._expanded_a[expanded_index])
 277                 al.append(~self.partition_points[i]) # add extra bit in a
 278                 eb.append(self._expanded_b[expanded_index])
 279                 bl.append(C(0)) # yes, add a zero
 280                 expanded_index += 1 # skip the extra point.  NOT in the output
 281             ea.append(self._expanded_a[expanded_index])
 282             eb.append(self._expanded_b[expanded_index])
 283             eo.append(self._expanded_o[expanded_index])
 284             al.append(self.a[i])
 285             bl.append(self.b[i])
 286             ol.append(self.output[i])
 287             expanded_index += 1
 288
 289         # combine above using Cat
 290         m.d.comb += Cat(*ea).eq(Cat(*al))
 291         m.d.comb += Cat(*eb).eq(Cat(*bl))
 292         m.d.comb += Cat(*ol).eq(Cat(*eo))
 293
 294         # use only one addition to take advantage of look-ahead carry and
 295         # special hardware on FPGAs
 296         m.d.comb += self._expanded_o.eq(
 297             self._expanded_a + self._expanded_b)
 298         return m
 299
 300
# Number of inputs consumed by each full adder in a reduction level: used
# as the group size when splitting a list of terms into full-adder groups.
FULL_ADDER_INPUT_COUNT = 3
 302
 303 class AddReduceData:
 304
 305     def __init__(self, ppoints, n_inputs, output_width, n_parts):
 306         self.part_ops = [Signal(2, name=f"part_ops_{i}")
 307                           for i in range(n_parts)]
 308         self.inputs = [Signal(output_width, name=f"inputs[{i}]")
 309             for i in range(n_inputs)]
 310         self.reg_partition_points = ppoints.like()
 311
 312     def eq_from(self, reg_partition_points, inputs, part_ops):
 313         return [self.reg_partition_points.eq(reg_partition_points)] + \
 314                [self.inputs[i].eq(inputs[i])
 315                                      for i in range(len(self.inputs))] + \
 316                [self.part_ops[i].eq(part_ops[i])
 317                                      for i in range(len(self.part_ops))]
 318
 319     def eq(self, rhs):
 320         return self.eq_from(rhs.reg_partition_points, rhs.inputs, rhs.part_ops)
 321
 322
 323 class FinalReduceData:
 324
 325     def __init__(self, ppoints, output_width, n_parts):
 326         self.part_ops = [Signal(2, name=f"part_ops_{i}")
 327                           for i in range(n_parts)]
 328         self.output = Signal(output_width)
 329         self.reg_partition_points = ppoints.like()
 330
 331     def eq_from(self, reg_partition_points, output, part_ops):
 332         return [self.reg_partition_points.eq(reg_partition_points)] + \
 333                [self.output.eq(output)] + \
 334                [self.part_ops[i].eq(part_ops[i])
 335                                      for i in range(len(self.part_ops))]
 336
 337     def eq(self, rhs):
 338         return self.eq_from(rhs.reg_partition_points, rhs.output, rhs.part_ops)
 339
 340
 341 class FinalAdd(Elaboratable):
 342     """ Final stage of add reduce
 343     """
 344
 345     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 346                        partition_points):
 347         self.i = AddReduceData(partition_points, n_inputs,
 348                                output_width, n_parts)
 349         self.o = FinalReduceData(partition_points, output_width, n_parts)
 350         self.output_width = output_width
 351         self.n_inputs = n_inputs
 352         self.n_parts = n_parts
 353         self.register_levels = list(register_levels)
 354         self.partition_points = PartitionPoints(partition_points)
 355         if not self.partition_points.fits_in_width(output_width):
 356             raise ValueError("partition_points doesn't fit in output_width")
 357
 358     def elaborate(self, platform):
 359         """Elaborate this module."""
 360         m = Module()
 361
 362         output_width = self.output_width
 363         output = Signal(output_width)
 364         if self.n_inputs == 0:
 365             # use 0 as the default output value
 366             m.d.comb += output.eq(0)
 367         elif self.n_inputs == 1:
 368             # handle single input
 369             m.d.comb += output.eq(self.i.inputs[0])
 370         else:
 371             # base case for adding 2 inputs
 372             assert self.n_inputs == 2
 373             adder = PartitionedAdder(output_width, self.i.reg_partition_points)
 374             m.submodules.final_adder = adder
 375             m.d.comb += adder.a.eq(self.i.inputs[0])
 376             m.d.comb += adder.b.eq(self.i.inputs[1])
 377             m.d.comb += output.eq(adder.output)
 378
 379         # create output
 380         m.d.comb += self.o.eq_from(self.i.reg_partition_points, output,
 381                                    self.i.part_ops)
 382
 383         return m
 384
 385
 386 class AddReduceSingle(Elaboratable):
 387     """Add list of numbers together.
 388
 389     :attribute inputs: input ``Signal``s to be summed. Modification not
 390         supported, except for by ``Signal.eq``.
 391     :attribute register_levels: List of nesting levels that should have
 392         pipeline registers.
 393     :attribute output: output sum.
 394     :attribute partition_points: the input partition points. Modification not
 395         supported, except for by ``Signal.eq``.
 396     """
 397
 398     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 399                        partition_points):
 400         """Create an ``AddReduce``.
 401
 402         :param inputs: input ``Signal``s to be summed.
 403         :param output_width: bit-width of ``output``.
 404         :param register_levels: List of nesting levels that should have
 405             pipeline registers.
 406         :param partition_points: the input partition points.
 407         """
 408         self.n_inputs = n_inputs
 409         self.n_parts = n_parts
 410         self.output_width = output_width
 411         self.i = AddReduceData(partition_points, n_inputs,
 412                                output_width, n_parts)
 413         self.register_levels = list(register_levels)
 414         self.partition_points = PartitionPoints(partition_points)
 415         if not self.partition_points.fits_in_width(output_width):
 416             raise ValueError("partition_points doesn't fit in output_width")
 417
 418         max_level = AddReduceSingle.get_max_level(n_inputs)
 419         for level in self.register_levels:
 420             if level > max_level:
 421                 raise ValueError(
 422                     "not enough adder levels for specified register levels")
 423
 424         # this is annoying.  we have to create the modules (and terms)
 425         # because we need to know what they are (in order to set up the
 426         # interconnects back in AddReduce), but cannot do the m.d.comb +=
 427         # etc because this is not in elaboratable.
 428         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 429         self._intermediate_terms = []
 430         self.adders = []
 431         if len(self.groups) != 0:
 432             self.create_next_terms()
 433
 434         self.o = AddReduceData(partition_points, len(self._intermediate_terms),
 435                                output_width, n_parts)
 436
 437     @staticmethod
 438     def get_max_level(input_count):
 439         """Get the maximum level.
 440
 441         All ``register_levels`` must be less than or equal to the maximum
 442         level.
 443         """
 444         retval = 0
 445         while True:
 446             groups = AddReduceSingle.full_adder_groups(input_count)
 447             if len(groups) == 0:
 448                 return retval
 449             input_count %= FULL_ADDER_INPUT_COUNT
 450             input_count += 2 * len(groups)
 451             retval += 1
 452
 453     @staticmethod
 454     def full_adder_groups(input_count):
 455         """Get ``inputs`` indices for which a full adder should be built."""
 456         return range(0,
 457                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 458                      FULL_ADDER_INPUT_COUNT)
 459
 460     def elaborate(self, platform):
 461         """Elaborate this module."""
 462         m = Module()
 463
 464         # copy the intermediate terms to the output
 465         for i, value in enumerate(self._intermediate_terms):
 466             m.d.comb += self.o.inputs[i].eq(value)
 467
 468         # copy reg part points and part ops to output
 469         m.d.comb += self.o.reg_partition_points.eq(self.i.reg_partition_points)
 470         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 471                                      for i in range(len(self.i.part_ops))]
 472
 473         # set up the partition mask (for the adders)
 474         part_mask = Signal(self.output_width, reset_less=True)
 475
 476         mask = self.i.reg_partition_points.as_mask(self.output_width)
 477         m.d.comb += part_mask.eq(mask)
 478
 479         # add and link the intermediate term modules
 480         for i, (iidx, adder_i) in enumerate(self.adders):
 481             setattr(m.submodules, f"adder_{i}", adder_i)
 482
 483             m.d.comb += adder_i.in0.eq(self.i.inputs[iidx])
 484             m.d.comb += adder_i.in1.eq(self.i.inputs[iidx + 1])
 485             m.d.comb += adder_i.in2.eq(self.i.inputs[iidx + 2])
 486             m.d.comb += adder_i.mask.eq(part_mask)
 487
 488         return m
 489
 490     def create_next_terms(self):
 491
 492         _intermediate_terms = []
 493
 494         def add_intermediate_term(value):
 495             _intermediate_terms.append(value)
 496
 497         # create full adders for this recursive level.
 498         # this shrinks N terms to 2 * (N // 3) plus the remainder
 499         for i in self.groups:
 500             adder_i = MaskedFullAdder(self.output_width)
 501             self.adders.append((i, adder_i))
 502             # add both the sum and the masked-carry to the next level.
 503             # 3 inputs have now been reduced to 2...
 504             add_intermediate_term(adder_i.sum)
 505             add_intermediate_term(adder_i.mcarry)
 506         # handle the remaining inputs.
 507         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 508             add_intermediate_term(self.i.inputs[-1])
 509         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 510             # Just pass the terms to the next layer, since we wouldn't gain
 511             # anything by using a half adder since there would still be 2 terms
 512             # and just passing the terms to the next layer saves gates.
 513             add_intermediate_term(self.i.inputs[-2])
 514             add_intermediate_term(self.i.inputs[-1])
 515         else:
 516             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 517
 518         self._intermediate_terms = _intermediate_terms
 519
 520
 521 class AddReduce(Elaboratable):
 522     """Recursively Add list of numbers together.
 523
 524     :attribute inputs: input ``Signal``s to be summed. Modification not
 525         supported, except for by ``Signal.eq``.
 526     :attribute register_levels: List of nesting levels that should have
 527         pipeline registers.
 528     :attribute output: output sum.
 529     :attribute partition_points: the input partition points. Modification not
 530         supported, except for by ``Signal.eq``.
 531     """
 532
 533     def __init__(self, inputs, output_width, register_levels, partition_points,
 534                        part_ops):
 535         """Create an ``AddReduce``.
 536
 537         :param inputs: input ``Signal``s to be summed.
 538         :param output_width: bit-width of ``output``.
 539         :param register_levels: List of nesting levels that should have
 540             pipeline registers.
 541         :param partition_points: the input partition points.
 542         """
 543         self.inputs = inputs
 544         self.part_ops = part_ops
 545         n_parts = len(part_ops)
 546         self.o = FinalReduceData(partition_points, output_width, n_parts)
 547         self.output_width = output_width
 548         self.register_levels = register_levels
 549         self.partition_points = partition_points
 550
 551         self.create_levels()
 552
 553     @staticmethod
 554     def get_max_level(input_count):
 555         return AddReduceSingle.get_max_level(input_count)
 556
 557     @staticmethod
 558     def next_register_levels(register_levels):
 559         """``Iterable`` of ``register_levels`` for next recursive level."""
 560         for level in register_levels:
 561             if level > 0:
 562                 yield level - 1
 563
 564     def create_levels(self):
 565         """creates reduction levels"""
 566
 567         mods = []
 568         next_levels = self.register_levels
 569         partition_points = self.partition_points
 570         part_ops = self.part_ops
 571         n_parts = len(part_ops)
 572         inputs = self.inputs
 573         ilen = len(inputs)
 574         while True:
 575             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 576                                          next_levels, partition_points)
 577             mods.append(next_level)
 578             next_levels = list(AddReduce.next_register_levels(next_levels))
 579             partition_points = next_level.i.reg_partition_points
 580             inputs = next_level.o.inputs
 581             ilen = len(inputs)
 582             part_ops = next_level.i.part_ops
 583             groups = AddReduceSingle.full_adder_groups(len(inputs))
 584             if len(groups) == 0:
 585                 break
 586
 587         next_level = FinalAdd(ilen, self.output_width, n_parts,
 588                               next_levels, partition_points)
 589         mods.append(next_level)
 590
 591         self.levels = mods
 592
 593     def elaborate(self, platform):
 594         """Elaborate this module."""
 595         m = Module()
 596
 597         for i, next_level in enumerate(self.levels):
 598             setattr(m.submodules, "next_level%d" % i, next_level)
 599
 600         partition_points = self.partition_points
 601         inputs = self.inputs
 602         part_ops = self.part_ops
 603         n_parts = len(part_ops)
 604         n_inputs = len(inputs)
 605         output_width = self.output_width
 606         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 607         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 608         for idx in range(len(self.levels)):
 609             mcur = self.levels[idx]
 610             if 0 in mcur.register_levels:
 611                 m.d.sync += mcur.i.eq(i)
 612             else:
 613                 m.d.comb += mcur.i.eq(i)
 614             i = mcur.o # for next loop
 615
 616         # output comes from last module
 617         m.d.comb += self.o.eq(i)
 618
 619         return m
 620
 621
# Multiply operation codes.  All four values fit in two bits.
# NOTE(review): presumably these are the values carried by the 2-bit
# ``part_ops`` signals -- the code that uses them is not in this part of
# the file, confirm.
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 626
 627
 628 def get_term(value, shift=0, enabled=None):
 629     if enabled is not None:
 630         value = Mux(enabled, value, 0)
 631     if shift > 0:
 632         value = Cat(Repl(C(0, 1), shift), value)
 633     else:
 634         assert shift == 0
 635     return value
 636
 637
 638 class ProductTerm(Elaboratable):
 639     """ this class creates a single product term (a[..]*b[..]).
 640         it has a design flaw in that is the *output* that is selected,
 641         where the multiplication(s) are combinatorially generated
 642         all the time.
 643     """
 644
 645     def __init__(self, width, twidth, pbwid, a_index, b_index):
 646         self.a_index = a_index
 647         self.b_index = b_index
 648         shift = 8 * (self.a_index + self.b_index)
 649         self.pwidth = width
 650         self.twidth = twidth
 651         self.width = width*2
 652         self.shift = shift
 653
 654         self.ti = Signal(self.width, reset_less=True)
 655         self.term = Signal(twidth, reset_less=True)
 656         self.a = Signal(twidth//2, reset_less=True)
 657         self.b = Signal(twidth//2, reset_less=True)
 658         self.pb_en = Signal(pbwid, reset_less=True)
 659
 660         self.tl = tl = []
 661         min_index = min(self.a_index, self.b_index)
 662         max_index = max(self.a_index, self.b_index)
 663         for i in range(min_index, max_index):
 664             tl.append(self.pb_en[i])
 665         name = "te_%d_%d" % (self.a_index, self.b_index)
 666         if len(tl) > 0:
 667             term_enabled = Signal(name=name, reset_less=True)
 668         else:
 669             term_enabled = None
 670         self.enabled = term_enabled
 671         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 672
 673     def elaborate(self, platform):
 674
 675         m = Module()
 676         if self.enabled is not None:
 677             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 678
 679         bsa = Signal(self.width, reset_less=True)
 680         bsb = Signal(self.width, reset_less=True)
 681         a_index, b_index = self.a_index, self.b_index
 682         pwidth = self.pwidth
 683         m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
 684         m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
 685         m.d.comb += self.ti.eq(bsa * bsb)
 686         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 687         """
 688         #TODO: sort out width issues, get inputs a/b switched on/off.
 689         #data going into Muxes is 1/2 the required width
 690
 691         pwidth = self.pwidth
 692         width = self.width
 693         bsa = Signal(self.twidth//2, reset_less=True)
 694         bsb = Signal(self.twidth//2, reset_less=True)
 695         asel = Signal(width, reset_less=True)
 696         bsel = Signal(width, reset_less=True)
 697         a_index, b_index = self.a_index, self.b_index
 698         m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
 699         m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
 700         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 701         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 702         m.d.comb += self.ti.eq(bsa * bsb)
 703         m.d.comb += self.term.eq(self.ti)
 704         """
 705
 706         return m
 707
 708
 709 class ProductTerms(Elaboratable):
 710     """ creates a bank of product terms.  also performs the actual bit-selection
 711         this class is to be wrapped with a for-loop on the "a" operand.
 712         it creates a second-level for-loop on the "b" operand.
 713     """
 714     def __init__(self, width, twidth, pbwid, a_index, blen):
 715         self.a_index = a_index
 716         self.blen = blen
 717         self.pwidth = width
 718         self.twidth = twidth
 719         self.pbwid = pbwid
 720         self.a = Signal(twidth//2, reset_less=True)
 721         self.b = Signal(twidth//2, reset_less=True)
 722         self.pb_en = Signal(pbwid, reset_less=True)
 723         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 724                             for i in range(blen)]
 725
 726     def elaborate(self, platform):
 727
 728         m = Module()
 729
 730         for b_index in range(self.blen):
 731             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 732                             self.a_index, b_index)
 733             setattr(m.submodules, "term_%d" % b_index, t)
 734
 735             m.d.comb += t.a.eq(self.a)
 736             m.d.comb += t.b.eq(self.b)
 737             m.d.comb += t.pb_en.eq(self.pb_en)
 738
 739             m.d.comb += self.terms[b_index].eq(t.term)
 740
 741         return m
 742
 743
 744 class LSBNegTerm(Elaboratable):
 745
 746     def __init__(self, bit_width):
 747         self.bit_width = bit_width
 748         self.part = Signal(reset_less=True)
 749         self.signed = Signal(reset_less=True)
 750         self.op = Signal(bit_width, reset_less=True)
 751         self.msb = Signal(reset_less=True)
 752         self.nt = Signal(bit_width*2, reset_less=True)
 753         self.nl = Signal(bit_width*2, reset_less=True)
 754
 755     def elaborate(self, platform):
 756         m = Module()
 757         comb = m.d.comb
 758         bit_wid = self.bit_width
 759         ext = Repl(0, bit_wid) # extend output to HI part
 760
 761         # determine sign of each incoming number *in this partition*
 762         enabled = Signal(reset_less=True)
 763         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 764
 765         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 766         # negation operation is split into a bitwise not and a +1.
 767         # likewise for 16, 32, and 64-bit values.
 768
 769         # width-extended 1s complement if a is signed, otherwise zero
 770         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 771
 772         # add 1 if signed, otherwise add zero
 773         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 774
 775         return m
 776
 777
 778 class Parts(Elaboratable):
 779
 780     def __init__(self, pbwid, epps, n_parts):
 781         self.pbwid = pbwid
 782         # inputs
 783         self.epps = PartitionPoints.like(epps, name="epps") # expanded points
 784         # outputs
 785         self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
 786
 787     def elaborate(self, platform):
 788         m = Module()
 789
 790         epps, parts = self.epps, self.parts
 791         # collect part-bytes (double factor because the input is extended)
 792         pbs = Signal(self.pbwid, reset_less=True)
 793         tl = []
 794         for i in range(self.pbwid):
 795             pb = Signal(name="pb%d" % i, reset_less=True)
 796             m.d.comb += pb.eq(epps.part_byte(i, mfactor=2)) # double
 797             tl.append(pb)
 798         m.d.comb += pbs.eq(Cat(*tl))
 799
 800         # negated-temporary copy of partition bits
 801         npbs = Signal.like(pbs, reset_less=True)
 802         m.d.comb += npbs.eq(~pbs)
 803         byte_count = 8 // len(parts)
 804         for i in range(len(parts)):
 805             pbl = []
 806             pbl.append(npbs[i * byte_count - 1])
 807             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 808                 pbl.append(pbs[j])
 809             pbl.append(npbs[(i + 1) * byte_count - 1])
 810             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 811             m.d.comb += value.eq(Cat(*pbl))
 812             m.d.comb += parts[i].eq(~(value).bool())
 813
 814         return m
 815
 816
 817 class Part(Elaboratable):
 818     """ a key class which, depending on the partitioning, will determine
 819         what action to take when parts of the output are signed or unsigned.
 820
 821         this requires 2 pieces of data *per operand, per partition*:
 822         whether the MSB is HI/LO (per partition!), and whether a signed
 823         or unsigned operation has been *requested*.
 824
 825         once that is determined, signed is basically carried out
 826         by splitting 2's complement into 1's complement plus one.
 827         1's complement is just a bit-inversion.
 828
 829         the extra terms - as separate terms - are then thrown at the
 830         AddReduce alongside the multiplication part-results.
 831     """
 832     def __init__(self, epps, width, n_parts, n_levels, pbwid):
 833
 834         self.pbwid = pbwid
 835         self.epps = epps
 836
 837         # inputs
 838         self.a = Signal(64)
 839         self.b = Signal(64)
 840         self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
 841         self.b_signed = [Signal(name=f"_b_signed_{i}") for i in range(8)]
 842         self.pbs = Signal(pbwid, reset_less=True)
 843
 844         # outputs
 845         self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
 846
 847         self.not_a_term = Signal(width)
 848         self.neg_lsb_a_term = Signal(width)
 849         self.not_b_term = Signal(width)
 850         self.neg_lsb_b_term = Signal(width)
 851
 852     def elaborate(self, platform):
 853         m = Module()
 854
 855         pbs, parts = self.pbs, self.parts
 856         epps = self.epps
 857         m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
 858         m.d.comb += p.epps.eq(epps)
 859         parts = p.parts
 860
 861         byte_count = 8 // len(parts)
 862
 863         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 864                 self.not_a_term, self.neg_lsb_a_term,
 865                 self.not_b_term, self.neg_lsb_b_term)
 866
 867         byte_width = 8 // len(parts) # byte width
 868         bit_wid = 8 * byte_width     # bit width
 869         nat, nbt, nla, nlb = [], [], [], []
 870         for i in range(len(parts)):
 871             # work out bit-inverted and +1 term for a.
 872             pa = LSBNegTerm(bit_wid)
 873             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 874             m.d.comb += pa.part.eq(parts[i])
 875             m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
 876             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 877             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 878             nat.append(pa.nt)
 879             nla.append(pa.nl)
 880
 881             # work out bit-inverted and +1 term for b
 882             pb = LSBNegTerm(bit_wid)
 883             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 884             m.d.comb += pb.part.eq(parts[i])
 885             m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
 886             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 887             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 888             nbt.append(pb.nt)
 889             nlb.append(pb.nl)
 890
 891         # concatenate together and return all 4 results.
 892         m.d.comb += [not_a_term.eq(Cat(*nat)),
 893                      not_b_term.eq(Cat(*nbt)),
 894                      neg_lsb_a_term.eq(Cat(*nla)),
 895                      neg_lsb_b_term.eq(Cat(*nlb)),
 896                     ]
 897
 898         return m
 899
 900
 901 class IntermediateOut(Elaboratable):
 902     """ selects the HI/LO part of the multiplication, for a given bit-width
 903         the output is also reconstructed in its SIMD (partition) lanes.
 904     """
 905     def __init__(self, width, out_wid, n_parts):
 906         self.width = width
 907         self.n_parts = n_parts
 908         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 909                                      for i in range(8)]
 910         self.intermed = Signal(out_wid, reset_less=True)
 911         self.output = Signal(out_wid//2, reset_less=True)
 912
 913     def elaborate(self, platform):
 914         m = Module()
 915
 916         ol = []
 917         w = self.width
 918         sel = w // 8
 919         for i in range(self.n_parts):
 920             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 921             m.d.comb += op.eq(
 922                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 923                     self.intermed.part(i * w*2, w),
 924                     self.intermed.part(i * w*2 + w, w)))
 925             ol.append(op)
 926         m.d.comb += self.output.eq(Cat(*ol))
 927
 928         return m
 929
 930
 931 class FinalOut(Elaboratable):
 932     """ selects the final output based on the partitioning.
 933
 934         each byte is selectable independently, i.e. it is possible
 935         that some partitions requested 8-bit computation whilst others
 936         requested 16 or 32 bit.
 937     """
 938     def __init__(self, out_wid):
 939         # inputs
 940         self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
 941         self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
 942         self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
 943
 944         self.i8 = Signal(out_wid, reset_less=True)
 945         self.i16 = Signal(out_wid, reset_less=True)
 946         self.i32 = Signal(out_wid, reset_less=True)
 947         self.i64 = Signal(out_wid, reset_less=True)
 948
 949         # output
 950         self.out = Signal(out_wid, reset_less=True)
 951
 952     def elaborate(self, platform):
 953         m = Module()
 954         ol = []
 955         for i in range(8):
 956             # select one of the outputs: d8 selects i8, d16 selects i16
 957             # d32 selects i32, and the default is i64.
 958             # d8 and d16 are ORed together in the first Mux
 959             # then the 2nd selects either i8 or i16.
 960             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
 961             op = Signal(8, reset_less=True, name="op_%d" % i)
 962             m.d.comb += op.eq(
 963                 Mux(self.d8[i] | self.d16[i // 2],
 964                     Mux(self.d8[i], self.i8.part(i * 8, 8),
 965                                      self.i16.part(i * 8, 8)),
 966                     Mux(self.d32[i // 4], self.i32.part(i * 8, 8),
 967                                           self.i64.part(i * 8, 8))))
 968             ol.append(op)
 969         m.d.comb += self.out.eq(Cat(*ol))
 970         return m
 971
 972
 973 class OrMod(Elaboratable):
 974     """ ORs four values together in a hierarchical tree
 975     """
 976     def __init__(self, wid):
 977         self.wid = wid
 978         self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
 979                      for i in range(4)]
 980         self.orout = Signal(wid, reset_less=True)
 981
 982     def elaborate(self, platform):
 983         m = Module()
 984         or1 = Signal(self.wid, reset_less=True)
 985         or2 = Signal(self.wid, reset_less=True)
 986         m.d.comb += or1.eq(self.orin[0] | self.orin[1])
 987         m.d.comb += or2.eq(self.orin[2] | self.orin[3])
 988         m.d.comb += self.orout.eq(or1 | or2)
 989
 990         return m
 991
 992
 993 class Signs(Elaboratable):
 994     """ determines whether a or b are signed numbers
 995         based on the required operation type (OP_MUL_*)
 996     """
 997
 998     def __init__(self):
 999         self.part_ops = Signal(2, reset_less=True)
1000         self.a_signed = Signal(reset_less=True)
1001         self.b_signed = Signal(reset_less=True)
1002
1003     def elaborate(self, platform):
1004
1005         m = Module()
1006
1007         asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1008         bsig = (self.part_ops == OP_MUL_LOW) \
1009                     | (self.part_ops == OP_MUL_SIGNED_HIGH)
1010         m.d.comb += self.a_signed.eq(asig)
1011         m.d.comb += self.b_signed.eq(bsig)
1012
1013         return m
1014
1015
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    The 64-bit operands can be split into any combination of 8, 16, 32 and
    64-bit partitions on naturally-aligned boundaries, and the operation is
    selectable for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    :attribute a: first 64-bit operand.
    :attribute b: second 64-bit operand.
    :attribute output: the 64-bit result.
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
            flip-flops are to be inserted (handed on to AddReduce).
        """

        # parameter(s)
        self.register_levels = list(register_levels)

        # inputs
        self.part_pts = PartitionPoints()
        for i in range(8, 64, 8):
            self.part_pts[i] = Signal(name=f"part_pts_{i}")
        self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
        self.a = Signal(64)
        self.b = Signal(64)

        # intermediates (needed for unit tests)
        self._intermediate_output = Signal(128)

        # output
        self.output = Signal(64)

    def elaborate(self, platform):
        m = Module()

        # collect part-bytes
        tl = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            m.d.comb += pb.eq(self.part_pts.part_byte(i))
            tl.append(pb)
        pbs = Signal(8, reset_less=True)
        m.d.comb += pbs.eq(Cat(*tl))

        # create (doubled) PartitionPoints (output is double input width)
        eps = PartitionPoints()
        for i, v in self.part_pts.items():
            ep = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
            eps[i * 2] = ep
            m.d.comb += ep.eq(v)

        # one sign decoder per byte
        signs = []
        for i in range(8):
            s = Signs()
            setattr(m.submodules, "signs%d" % i, s)
            m.d.comb += s.part_ops.eq(self.part_ops[i])
            signs.append(s)

        # sign-correction term generators, one per lane width.
        # keyed by lane width in bits; 64 // width lanes each.
        n_levels = len(self.register_levels)+1
        part_mods = {}
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for lane_bits in (8, 16, 32, 64):
            mod = Part(eps, 128, 64 // lane_bits, n_levels, 8)
            setattr(m.submodules, "part_%d" % lane_bits, mod)
            part_mods[lane_bits] = mod

            m.d.comb += mod.a.eq(self.a)
            m.d.comb += mod.b.eq(self.b)
            for i, s in enumerate(signs):
                m.d.comb += mod.a_signed[i].eq(s.a_signed)
                m.d.comb += mod.b_signed[i].eq(s.b_signed)
            m.d.comb += mod.pbs.eq(pbs)

            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        # the product terms: one bank per "a" index
        terms = []
        for a_index in range(8):
            t = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, t)

            m.d.comb += t.a.eq(self.a)
            m.d.comb += t.b.eq(self.b)
            m.d.comb += t.pb_en.eq(pbs)

            terms.extend(t.terms)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        for or_name, sources in (("nat_or", nat_l),
                                 ("nbt_or", nbt_l),
                                 ("nla_or", nla_l),
                                 ("nlb_or", nlb_l)):
            or_mod = OrMod(128)
            setattr(m.submodules, or_name, or_mod)
            for or_in, src in zip(or_mod.orin, sources):
                m.d.comb += or_in.eq(src)
            terms.append(or_mod.orout)

        # sum everything
        add_reduce = AddReduce(terms,
                               128,
                               self.register_levels,
                               eps,
                               self.part_ops)

        out_part_ops = add_reduce.o.part_ops
        out_part_pts = add_reduce.o.reg_partition_points

        m.submodules.add_reduce = add_reduce
        m.d.comb += self._intermediate_output.eq(add_reduce.o.output)

        # HI/LO selection for each lane width (io64, io32, io16, io8)
        io_mods = {}
        for lane_bits in (64, 32, 16, 8):
            io_mod = IntermediateOut(lane_bits, 128, 64 // lane_bits)
            setattr(m.submodules, "io%d" % lane_bits, io_mod)
            m.d.comb += io_mod.intermed.eq(self._intermediate_output)
            for i in range(8):
                m.d.comb += io_mod.part_ops[i].eq(out_part_ops[i])
            io_mods[lane_bits] = io_mod

        # partition decoders for the output side (p_8, p_16, p_32, p_64),
        # fed from the partition points that come out of add_reduce
        p_mods = {}
        for lane_bits in (8, 16, 32, 64):
            p_mod = Parts(8, eps, len(part_mods[lane_bits].parts))
            setattr(m.submodules, "p_%d" % lane_bits, p_mod)
            p_mods[lane_bits] = p_mod
        for lane_bits in (8, 16, 32, 64):
            m.d.comb += p_mods[lane_bits].epps.eq(out_part_pts)

        # final output
        m.submodules.finalout = finalout = FinalOut(64)
        for selects, lane_bits in ((finalout.d8, 8),
                                   (finalout.d16, 16),
                                   (finalout.d32, 32)):
            for select, part in zip(selects, p_mods[lane_bits].parts):
                m.d.comb += select.eq(part)
        m.d.comb += finalout.i8.eq(io_mods[8].output)
        m.d.comb += finalout.i16.eq(io_mods[16].output)
        m.d.comb += finalout.i32.eq(io_mods[32].output)
        m.d.comb += finalout.i64.eq(io_mods[64].output)
        m.d.comb += self.output.eq(finalout.out)

        return m
1198
1199
if __name__ == "__main__":
    # hand the multiplier and its ports to the nmigen command-line driver
    m = Mul8_16_32_64()
    ports = [m.a, m.b, m._intermediate_output, m.output]
    ports.extend(m.part_ops)
    ports.extend(m.part_pts.values())
    main(m, ports=ports)