src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         """
  82         bits = []
  83         for i in range(width):
  84             if i in self:
  85                 bits.append(~self[i])
  86             else:
  87                 bits.append(True)
  88         return Cat(*bits)
  89
  90     def get_max_partition_count(self, width):
  91         """Get the maximum number of partitions.
  92
  93         Gets the number of partitions when all partition points are enabled.
  94         """
  95         retval = 1
  96         for point in self.keys():
  97             if point < width:
  98                 retval += 1
  99         return retval
 100
 101     def fits_in_width(self, width):
 102         """Check if all partition points are smaller than `width`."""
 103         for point in self.keys():
 104             if point >= width:
 105                 return False
 106         return True
 107
 108     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 109         if index == -1 or index == 7:
 110             return C(True, 1)
 111         assert index >= 0 and index < 8
 112         return self[(index * 8 + 8)*mfactor]
 113
 114
 115 class FullAdder(Elaboratable):
 116     """Full Adder.
 117
 118     :attribute in0: the first input
 119     :attribute in1: the second input
 120     :attribute in2: the third input
 121     :attribute sum: the sum output
 122     :attribute carry: the carry output
 123
 124     Rather than do individual full adders (and have an array of them,
 125     which would be very slow to simulate), this module can specify the
 126     bit width of the inputs and outputs: in effect it performs multiple
 127     Full 3-2 Add operations "in parallel".
 128     """
 129
 130     def __init__(self, width):
 131         """Create a ``FullAdder``.
 132
 133         :param width: the bit width of the input and output
 134         """
 135         self.in0 = Signal(width)
 136         self.in1 = Signal(width)
 137         self.in2 = Signal(width)
 138         self.sum = Signal(width)
 139         self.carry = Signal(width)
 140
 141     def elaborate(self, platform):
 142         """Elaborate this module."""
 143         m = Module()
 144         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 145         m.d.comb += self.carry.eq((self.in0 & self.in1)
 146                                   | (self.in1 & self.in2)
 147                                   | (self.in2 & self.in0))
 148         return m
 149
 150
 151 class MaskedFullAdder(Elaboratable):
 152     """Masked Full Adder.
 153
 154     :attribute mask: the carry partition mask
 155     :attribute in0: the first input
 156     :attribute in1: the second input
 157     :attribute in2: the third input
 158     :attribute sum: the sum output
 159     :attribute mcarry: the masked carry output
 160
 161     FullAdders are always used with a "mask" on the output.  To keep
 162     the graphviz "clean", this class performs the masking here rather
 163     than inside a large for-loop.
 164
 165     See the following discussion as to why this is no longer derived
 166     from FullAdder.  Each carry is shifted here *before* being ANDed
 167     with the mask, so that an AOI cell may be used (which is more
 168     gate-efficient)
 169     https://en.wikipedia.org/wiki/AND-OR-Invert
 170     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 171     """
 172
 173     def __init__(self, width):
 174         """Create a ``MaskedFullAdder``.
 175
 176         :param width: the bit width of the input and output
 177         """
 178         self.width = width
 179         self.mask = Signal(width, reset_less=True)
 180         self.mcarry = Signal(width, reset_less=True)
 181         self.in0 = Signal(width, reset_less=True)
 182         self.in1 = Signal(width, reset_less=True)
 183         self.in2 = Signal(width, reset_less=True)
 184         self.sum = Signal(width, reset_less=True)
 185
 186     def elaborate(self, platform):
 187         """Elaborate this module."""
 188         m = Module()
 189         s1 = Signal(self.width, reset_less=True)
 190         s2 = Signal(self.width, reset_less=True)
 191         s3 = Signal(self.width, reset_less=True)
 192         c1 = Signal(self.width, reset_less=True)
 193         c2 = Signal(self.width, reset_less=True)
 194         c3 = Signal(self.width, reset_less=True)
 195         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 196         m.d.comb += s1.eq(Cat(0, self.in0))
 197         m.d.comb += s2.eq(Cat(0, self.in1))
 198         m.d.comb += s3.eq(Cat(0, self.in2))
 199         m.d.comb += c1.eq(s1 & s2 & self.mask)
 200         m.d.comb += c2.eq(s2 & s3 & self.mask)
 201         m.d.comb += c3.eq(s3 & s1 & self.mask)
 202         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 203         return m
 204
 205
 206 class PartitionedAdder(Elaboratable):
 207     """Partitioned Adder.
 208
 209     Performs the final add.  The partition points are included in the
 210     actual add (in one of the operands only), which causes a carry over
 211     to the next bit.  Then the final output *removes* the extra bits from
 212     the result.
 213
 214     partition: .... P... P... P... P... (32 bits)
 215     a        : .... .... .... .... .... (32 bits)
 216     b        : .... .... .... .... .... (32 bits)
 217     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 218     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 219     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 220     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 221
 222     :attribute width: the bit width of the input and output. Read-only.
 223     :attribute a: the first input to the adder
 224     :attribute b: the second input to the adder
 225     :attribute output: the sum output
 226     :attribute partition_points: the input partition points. Modification not
 227         supported, except for by ``Signal.eq``.
 228     """
 229
 230     def __init__(self, width, partition_points):
 231         """Create a ``PartitionedAdder``.
 232
 233         :param width: the bit width of the input and output
 234         :param partition_points: the input partition points
 235         """
 236         self.width = width
 237         self.a = Signal(width)
 238         self.b = Signal(width)
 239         self.output = Signal(width)
 240         self.partition_points = PartitionPoints(partition_points)
 241         if not self.partition_points.fits_in_width(width):
 242             raise ValueError("partition_points doesn't fit in width")
 243         expanded_width = 0
 244         for i in range(self.width):
 245             if i in self.partition_points:
 246                 expanded_width += 1
 247             expanded_width += 1
 248         self._expanded_width = expanded_width
 249         # XXX these have to remain here due to some horrible nmigen
 250         # simulation bugs involving sync.  it is *not* necessary to
 251         # have them here, they should (under normal circumstances)
 252         # be moved into elaborate, as they are entirely local
 253         self._expanded_a = Signal(expanded_width) # includes extra part-points
 254         self._expanded_b = Signal(expanded_width) # likewise.
 255         self._expanded_o = Signal(expanded_width) # likewise.
 256
 257     def elaborate(self, platform):
 258         """Elaborate this module."""
 259         m = Module()
 260         expanded_index = 0
 261         # store bits in a list, use Cat later.  graphviz is much cleaner
 262         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 263
 264         # partition points are "breaks" (extra zeros or 1s) in what would
 265         # otherwise be a massive long add.  when the "break" points are 0,
 266         # whatever is in it (in the output) is discarded.  however when
 267         # there is a "1", it causes a roll-over carry to the *next* bit.
 268         # we still ignore the "break" bit in the [intermediate] output,
 269         # however by that time we've got the effect that we wanted: the
 270         # carry has been carried *over* the break point.
 271
 272         for i in range(self.width):
 273             if i in self.partition_points:
 274                 # add extra bit set to 0 + 0 for enabled partition points
 275                 # and 1 + 0 for disabled partition points
 276                 ea.append(self._expanded_a[expanded_index])
 277                 al.append(~self.partition_points[i]) # add extra bit in a
 278                 eb.append(self._expanded_b[expanded_index])
 279                 bl.append(C(0)) # yes, add a zero
 280                 expanded_index += 1 # skip the extra point.  NOT in the output
 281             ea.append(self._expanded_a[expanded_index])
 282             eb.append(self._expanded_b[expanded_index])
 283             eo.append(self._expanded_o[expanded_index])
 284             al.append(self.a[i])
 285             bl.append(self.b[i])
 286             ol.append(self.output[i])
 287             expanded_index += 1
 288
 289         # combine above using Cat
 290         m.d.comb += Cat(*ea).eq(Cat(*al))
 291         m.d.comb += Cat(*eb).eq(Cat(*bl))
 292         m.d.comb += Cat(*ol).eq(Cat(*eo))
 293
 294         # use only one addition to take advantage of look-ahead carry and
 295         # special hardware on FPGAs
 296         m.d.comb += self._expanded_o.eq(
 297             self._expanded_a + self._expanded_b)
 298         return m
 299
 300
# number of inputs taken by each full adder: used as the group stride and
# as the modulus for the left-over inputs in the add-reduce levels
FULL_ADDER_INPUT_COUNT = 3
 302
 303 class AddReduceData:
 304
 305     def __init__(self, ppoints, n_inputs, output_width, n_parts):
 306         self.part_ops = [Signal(2, name=f"part_ops_{i}")
 307                           for i in range(n_parts)]
 308         self.inputs = [Signal(output_width, name=f"inputs[{i}]")
 309             for i in range(n_inputs)]
 310         self.reg_partition_points = ppoints.like()
 311
 312     def eq_from(self, reg_partition_points, inputs, part_ops):
 313         return [self.reg_partition_points.eq(reg_partition_points)] + \
 314                [self.inputs[i].eq(inputs[i])
 315                                      for i in range(len(self.inputs))] + \
 316                [self.part_ops[i].eq(part_ops[i])
 317                                      for i in range(len(self.part_ops))]
 318
 319     def eq(self, rhs):
 320         return self.eq_from(rhs.reg_partition_points, rhs.inputs, rhs.part_ops)
 321
 322
 323 class FinalReduceData:
 324
 325     def __init__(self, ppoints, output_width, n_parts):
 326         self.part_ops = [Signal(2, name=f"part_ops_{i}")
 327                           for i in range(n_parts)]
 328         self.output = Signal(output_width)
 329         self.reg_partition_points = ppoints.like()
 330
 331     def eq_from(self, reg_partition_points, output, part_ops):
 332         return [self.reg_partition_points.eq(reg_partition_points)] + \
 333                [self.output.eq(output)] + \
 334                [self.part_ops[i].eq(part_ops[i])
 335                                      for i in range(len(self.part_ops))]
 336
 337     def eq(self, rhs):
 338         return self.eq_from(rhs.reg_partition_points, rhs.output, rhs.part_ops)
 339
 340
 341 class FinalAdd(Elaboratable):
 342     """ Final stage of add reduce
 343     """
 344
 345     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 346                        partition_points):
 347         self.i = AddReduceData(partition_points, n_inputs,
 348                                output_width, n_parts)
 349         self.o = FinalReduceData(partition_points, output_width, n_parts)
 350         self.output_width = output_width
 351         self.n_inputs = n_inputs
 352         self.n_parts = n_parts
 353         self.register_levels = list(register_levels)
 354         self.partition_points = PartitionPoints(partition_points)
 355         if not self.partition_points.fits_in_width(output_width):
 356             raise ValueError("partition_points doesn't fit in output_width")
 357
 358     def elaborate(self, platform):
 359         """Elaborate this module."""
 360         m = Module()
 361
 362         output_width = self.output_width
 363         output = Signal(output_width)
 364         if self.n_inputs == 0:
 365             # use 0 as the default output value
 366             m.d.comb += output.eq(0)
 367         elif self.n_inputs == 1:
 368             # handle single input
 369             m.d.comb += output.eq(self.i.inputs[0])
 370         else:
 371             # base case for adding 2 inputs
 372             assert self.n_inputs == 2
 373             adder = PartitionedAdder(output_width, self.i.reg_partition_points)
 374             m.submodules.final_adder = adder
 375             m.d.comb += adder.a.eq(self.i.inputs[0])
 376             m.d.comb += adder.b.eq(self.i.inputs[1])
 377             m.d.comb += output.eq(adder.output)
 378
 379         # create output
 380         m.d.comb += self.o.eq_from(self.i.reg_partition_points, output,
 381                                    self.i.part_ops)
 382
 383         return m
 384
 385
 386 class AddReduceSingle(Elaboratable):
 387     """Add list of numbers together.
 388
 389     :attribute inputs: input ``Signal``s to be summed. Modification not
 390         supported, except for by ``Signal.eq``.
 391     :attribute register_levels: List of nesting levels that should have
 392         pipeline registers.
 393     :attribute output: output sum.
 394     :attribute partition_points: the input partition points. Modification not
 395         supported, except for by ``Signal.eq``.
 396     """
 397
 398     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 399                        partition_points):
 400         """Create an ``AddReduce``.
 401
 402         :param inputs: input ``Signal``s to be summed.
 403         :param output_width: bit-width of ``output``.
 404         :param register_levels: List of nesting levels that should have
 405             pipeline registers.
 406         :param partition_points: the input partition points.
 407         """
 408         self.n_inputs = n_inputs
 409         self.n_parts = n_parts
 410         self.output_width = output_width
 411         self.i = AddReduceData(partition_points, n_inputs,
 412                                output_width, n_parts)
 413         self.register_levels = list(register_levels)
 414         self.partition_points = PartitionPoints(partition_points)
 415         if not self.partition_points.fits_in_width(output_width):
 416             raise ValueError("partition_points doesn't fit in output_width")
 417
 418         max_level = AddReduceSingle.get_max_level(n_inputs)
 419         for level in self.register_levels:
 420             if level > max_level:
 421                 raise ValueError(
 422                     "not enough adder levels for specified register levels")
 423
 424         # this is annoying.  we have to create the modules (and terms)
 425         # because we need to know what they are (in order to set up the
 426         # interconnects back in AddReduce), but cannot do the m.d.comb +=
 427         # etc because this is not in elaboratable.
 428         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 429         self._intermediate_terms = []
 430         if len(self.groups) != 0:
 431             self.create_next_terms()
 432
 433         self.o = AddReduceData(partition_points, len(self._intermediate_terms),
 434                                output_width, n_parts)
 435
 436     @staticmethod
 437     def get_max_level(input_count):
 438         """Get the maximum level.
 439
 440         All ``register_levels`` must be less than or equal to the maximum
 441         level.
 442         """
 443         retval = 0
 444         while True:
 445             groups = AddReduceSingle.full_adder_groups(input_count)
 446             if len(groups) == 0:
 447                 return retval
 448             input_count %= FULL_ADDER_INPUT_COUNT
 449             input_count += 2 * len(groups)
 450             retval += 1
 451
 452     @staticmethod
 453     def full_adder_groups(input_count):
 454         """Get ``inputs`` indices for which a full adder should be built."""
 455         return range(0,
 456                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 457                      FULL_ADDER_INPUT_COUNT)
 458
 459     def elaborate(self, platform):
 460         """Elaborate this module."""
 461         m = Module()
 462
 463         # copy the intermediate terms to the output
 464         for i, value in enumerate(self._intermediate_terms):
 465             m.d.comb += self.o.inputs[i].eq(value)
 466
 467         # copy reg part points and part ops to output
 468         m.d.comb += self.o.reg_partition_points.eq(self.i.reg_partition_points)
 469         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 470                                      for i in range(len(self.i.part_ops))]
 471
 472         # set up the partition mask (for the adders)
 473         mask = self.i.reg_partition_points.as_mask(self.output_width)
 474         m.d.comb += self.part_mask.eq(mask)
 475
 476         # add and link the intermediate term modules
 477         for i, (iidx, adder_i) in enumerate(self.adders):
 478             setattr(m.submodules, f"adder_{i}", adder_i)
 479
 480             m.d.comb += adder_i.in0.eq(self.i.inputs[iidx])
 481             m.d.comb += adder_i.in1.eq(self.i.inputs[iidx + 1])
 482             m.d.comb += adder_i.in2.eq(self.i.inputs[iidx + 2])
 483             m.d.comb += adder_i.mask.eq(self.part_mask)
 484
 485         return m
 486
 487     def create_next_terms(self):
 488
 489         _intermediate_terms = []
 490
 491         def add_intermediate_term(value):
 492             _intermediate_terms.append(value)
 493
 494         # store mask in intermediary (simplifies graph)
 495         self.part_mask = Signal(self.output_width, reset_less=True)
 496
 497         # create full adders for this recursive level.
 498         # this shrinks N terms to 2 * (N // 3) plus the remainder
 499         self.adders = []
 500         for i in self.groups:
 501             adder_i = MaskedFullAdder(self.output_width)
 502             self.adders.append((i, adder_i))
 503             # add both the sum and the masked-carry to the next level.
 504             # 3 inputs have now been reduced to 2...
 505             add_intermediate_term(adder_i.sum)
 506             add_intermediate_term(adder_i.mcarry)
 507         # handle the remaining inputs.
 508         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 509             add_intermediate_term(self.i.inputs[-1])
 510         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 511             # Just pass the terms to the next layer, since we wouldn't gain
 512             # anything by using a half adder since there would still be 2 terms
 513             # and just passing the terms to the next layer saves gates.
 514             add_intermediate_term(self.i.inputs[-2])
 515             add_intermediate_term(self.i.inputs[-1])
 516         else:
 517             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 518
 519         self._intermediate_terms = _intermediate_terms
 520
 521
 522 class AddReduce(Elaboratable):
 523     """Recursively Add list of numbers together.
 524
 525     :attribute inputs: input ``Signal``s to be summed. Modification not
 526         supported, except for by ``Signal.eq``.
 527     :attribute register_levels: List of nesting levels that should have
 528         pipeline registers.
 529     :attribute output: output sum.
 530     :attribute partition_points: the input partition points. Modification not
 531         supported, except for by ``Signal.eq``.
 532     """
 533
 534     def __init__(self, inputs, output_width, register_levels, partition_points,
 535                        part_ops):
 536         """Create an ``AddReduce``.
 537
 538         :param inputs: input ``Signal``s to be summed.
 539         :param output_width: bit-width of ``output``.
 540         :param register_levels: List of nesting levels that should have
 541             pipeline registers.
 542         :param partition_points: the input partition points.
 543         """
 544         self.inputs = inputs
 545         self.part_ops = part_ops
 546         n_parts = len(part_ops)
 547         self.o = FinalReduceData(partition_points, output_width, n_parts)
 548         self.output_width = output_width
 549         self.register_levels = register_levels
 550         self.partition_points = partition_points
 551
 552         self.create_levels()
 553
 554     @staticmethod
 555     def get_max_level(input_count):
 556         return AddReduceSingle.get_max_level(input_count)
 557
 558     @staticmethod
 559     def next_register_levels(register_levels):
 560         """``Iterable`` of ``register_levels`` for next recursive level."""
 561         for level in register_levels:
 562             if level > 0:
 563                 yield level - 1
 564
 565     def create_levels(self):
 566         """creates reduction levels"""
 567
 568         mods = []
 569         next_levels = self.register_levels
 570         partition_points = self.partition_points
 571         part_ops = self.part_ops
 572         n_parts = len(part_ops)
 573         inputs = self.inputs
 574         ilen = len(inputs)
 575         while True:
 576             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 577                                          next_levels, partition_points)
 578             mods.append(next_level)
 579             next_levels = list(AddReduce.next_register_levels(next_levels))
 580             partition_points = next_level.i.reg_partition_points
 581             inputs = next_level.o.inputs
 582             ilen = len(inputs)
 583             part_ops = next_level.i.part_ops
 584             groups = AddReduceSingle.full_adder_groups(len(inputs))
 585             if len(groups) == 0:
 586                 break
 587
 588         next_level = FinalAdd(ilen, self.output_width, n_parts,
 589                               next_levels, partition_points)
 590         mods.append(next_level)
 591
 592         self.levels = mods
 593
 594     def elaborate(self, platform):
 595         """Elaborate this module."""
 596         m = Module()
 597
 598         for i, next_level in enumerate(self.levels):
 599             setattr(m.submodules, "next_level%d" % i, next_level)
 600
 601         partition_points = self.partition_points
 602         inputs = self.inputs
 603         part_ops = self.part_ops
 604         n_parts = len(part_ops)
 605         n_inputs = len(inputs)
 606         output_width = self.output_width
 607         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 608         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 609         for idx in range(len(self.levels)):
 610             mcur = self.levels[idx]
 611             if 0 in mcur.register_levels:
 612                 m.d.sync += mcur.i.eq(i)
 613             else:
 614                 m.d.comb += mcur.i.eq(i)
 615             i = mcur.o # for next loop
 616
 617         print ("levels", len(self.levels), i)
 618         # output comes from last module
 619         m.d.comb += self.o.eq(i)
 620
 621         return m
 622
 623
# multiply operation selectors.
# NOTE(review): presumably these are the values carried by the 2-bit
# part_ops signals -- confirm against the code that uses them.
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 628
 629
 630 def get_term(value, shift=0, enabled=None):
 631     if enabled is not None:
 632         value = Mux(enabled, value, 0)
 633     if shift > 0:
 634         value = Cat(Repl(C(0, 1), shift), value)
 635     else:
 636         assert shift == 0
 637     return value
 638
 639
 640 class ProductTerm(Elaboratable):
 641     """ this class creates a single product term (a[..]*b[..]).
 642         it has a design flaw in that is the *output* that is selected,
 643         where the multiplication(s) are combinatorially generated
 644         all the time.
 645     """
 646
 647     def __init__(self, width, twidth, pbwid, a_index, b_index):
 648         self.a_index = a_index
 649         self.b_index = b_index
 650         shift = 8 * (self.a_index + self.b_index)
 651         self.pwidth = width
 652         self.twidth = twidth
 653         self.width = width*2
 654         self.shift = shift
 655
 656         self.ti = Signal(self.width, reset_less=True)
 657         self.term = Signal(twidth, reset_less=True)
 658         self.a = Signal(twidth//2, reset_less=True)
 659         self.b = Signal(twidth//2, reset_less=True)
 660         self.pb_en = Signal(pbwid, reset_less=True)
 661
 662         self.tl = tl = []
 663         min_index = min(self.a_index, self.b_index)
 664         max_index = max(self.a_index, self.b_index)
 665         for i in range(min_index, max_index):
 666             tl.append(self.pb_en[i])
 667         name = "te_%d_%d" % (self.a_index, self.b_index)
 668         if len(tl) > 0:
 669             term_enabled = Signal(name=name, reset_less=True)
 670         else:
 671             term_enabled = None
 672         self.enabled = term_enabled
 673         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 674
 675     def elaborate(self, platform):
 676
 677         m = Module()
 678         if self.enabled is not None:
 679             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 680
 681         bsa = Signal(self.width, reset_less=True)
 682         bsb = Signal(self.width, reset_less=True)
 683         a_index, b_index = self.a_index, self.b_index
 684         pwidth = self.pwidth
 685         m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
 686         m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
 687         m.d.comb += self.ti.eq(bsa * bsb)
 688         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 689         """
 690         #TODO: sort out width issues, get inputs a/b switched on/off.
 691         #data going into Muxes is 1/2 the required width
 692
 693         pwidth = self.pwidth
 694         width = self.width
 695         bsa = Signal(self.twidth//2, reset_less=True)
 696         bsb = Signal(self.twidth//2, reset_less=True)
 697         asel = Signal(width, reset_less=True)
 698         bsel = Signal(width, reset_less=True)
 699         a_index, b_index = self.a_index, self.b_index
 700         m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
 701         m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
 702         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 703         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 704         m.d.comb += self.ti.eq(bsa * bsb)
 705         m.d.comb += self.term.eq(self.ti)
 706         """
 707
 708         return m
 709
 710
 711 class ProductTerms(Elaboratable):
 712     """ creates a bank of product terms.  also performs the actual bit-selection
 713         this class is to be wrapped with a for-loop on the "a" operand.
 714         it creates a second-level for-loop on the "b" operand.
 715     """
 716     def __init__(self, width, twidth, pbwid, a_index, blen):
 717         self.a_index = a_index
 718         self.blen = blen
 719         self.pwidth = width
 720         self.twidth = twidth
 721         self.pbwid = pbwid
 722         self.a = Signal(twidth//2, reset_less=True)
 723         self.b = Signal(twidth//2, reset_less=True)
 724         self.pb_en = Signal(pbwid, reset_less=True)
 725         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 726                             for i in range(blen)]
 727
 728     def elaborate(self, platform):
 729
 730         m = Module()
 731
 732         for b_index in range(self.blen):
 733             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 734                             self.a_index, b_index)
 735             setattr(m.submodules, "term_%d" % b_index, t)
 736
 737             m.d.comb += t.a.eq(self.a)
 738             m.d.comb += t.b.eq(self.b)
 739             m.d.comb += t.pb_en.eq(self.pb_en)
 740
 741             m.d.comb += self.terms[b_index].eq(t.term)
 742
 743         return m
 744
 745
 746 class LSBNegTerm(Elaboratable):
 747
 748     def __init__(self, bit_width):
 749         self.bit_width = bit_width
 750         self.part = Signal(reset_less=True)
 751         self.signed = Signal(reset_less=True)
 752         self.op = Signal(bit_width, reset_less=True)
 753         self.msb = Signal(reset_less=True)
 754         self.nt = Signal(bit_width*2, reset_less=True)
 755         self.nl = Signal(bit_width*2, reset_less=True)
 756
 757     def elaborate(self, platform):
 758         m = Module()
 759         comb = m.d.comb
 760         bit_wid = self.bit_width
 761         ext = Repl(0, bit_wid) # extend output to HI part
 762
 763         # determine sign of each incoming number *in this partition*
 764         enabled = Signal(reset_less=True)
 765         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 766
 767         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 768         # negation operation is split into a bitwise not and a +1.
 769         # likewise for 16, 32, and 64-bit values.
 770
 771         # width-extended 1s complement if a is signed, otherwise zero
 772         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 773
 774         # add 1 if signed, otherwise add zero
 775         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 776
 777         return m
 778
 779
 780 class Parts(Elaboratable):
 781
 782     def __init__(self, pbwid, epps, n_parts):
 783         self.pbwid = pbwid
 784         # inputs
 785         self.epps = PartitionPoints.like(epps, name="epps") # expanded points
 786         # outputs
 787         self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
 788
 789     def elaborate(self, platform):
 790         m = Module()
 791
 792         epps, parts = self.epps, self.parts
 793         # collect part-bytes (double factor because the input is extended)
 794         pbs = Signal(self.pbwid, reset_less=True)
 795         tl = []
 796         for i in range(self.pbwid):
 797             pb = Signal(name="pb%d" % i, reset_less=True)
 798             m.d.comb += pb.eq(epps.part_byte(i, mfactor=2)) # double
 799             tl.append(pb)
 800         m.d.comb += pbs.eq(Cat(*tl))
 801
 802         # negated-temporary copy of partition bits
 803         npbs = Signal.like(pbs, reset_less=True)
 804         m.d.comb += npbs.eq(~pbs)
 805         byte_count = 8 // len(parts)
 806         for i in range(len(parts)):
 807             pbl = []
 808             pbl.append(npbs[i * byte_count - 1])
 809             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 810                 pbl.append(pbs[j])
 811             pbl.append(npbs[(i + 1) * byte_count - 1])
 812             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 813             m.d.comb += value.eq(Cat(*pbl))
 814             m.d.comb += parts[i].eq(~(value).bool())
 815
 816         return m
 817
 818
 819 class Part(Elaboratable):
 820     """ a key class which, depending on the partitioning, will determine
 821         what action to take when parts of the output are signed or unsigned.
 822
 823         this requires 2 pieces of data *per operand, per partition*:
 824         whether the MSB is HI/LO (per partition!), and whether a signed
 825         or unsigned operation has been *requested*.
 826
 827         once that is determined, signed is basically carried out
 828         by splitting 2's complement into 1's complement plus one.
 829         1's complement is just a bit-inversion.
 830
 831         the extra terms - as separate terms - are then thrown at the
 832         AddReduce alongside the multiplication part-results.
 833     """
 834     def __init__(self, epps, width, n_parts, n_levels, pbwid):
 835
 836         self.pbwid = pbwid
 837         self.epps = epps
 838
 839         # inputs
 840         self.a = Signal(64)
 841         self.b = Signal(64)
 842         self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
 843         self.b_signed = [Signal(name=f"_b_signed_{i}") for i in range(8)]
 844         self.pbs = Signal(pbwid, reset_less=True)
 845
 846         # outputs
 847         self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
 848
 849         self.not_a_term = Signal(width)
 850         self.neg_lsb_a_term = Signal(width)
 851         self.not_b_term = Signal(width)
 852         self.neg_lsb_b_term = Signal(width)
 853
 854     def elaborate(self, platform):
 855         m = Module()
 856
 857         pbs, parts = self.pbs, self.parts
 858         epps = self.epps
 859         m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
 860         m.d.comb += p.epps.eq(epps)
 861         parts = p.parts
 862
 863         byte_count = 8 // len(parts)
 864
 865         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 866                 self.not_a_term, self.neg_lsb_a_term,
 867                 self.not_b_term, self.neg_lsb_b_term)
 868
 869         byte_width = 8 // len(parts) # byte width
 870         bit_wid = 8 * byte_width     # bit width
 871         nat, nbt, nla, nlb = [], [], [], []
 872         for i in range(len(parts)):
 873             # work out bit-inverted and +1 term for a.
 874             pa = LSBNegTerm(bit_wid)
 875             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 876             m.d.comb += pa.part.eq(parts[i])
 877             m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
 878             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 879             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 880             nat.append(pa.nt)
 881             nla.append(pa.nl)
 882
 883             # work out bit-inverted and +1 term for b
 884             pb = LSBNegTerm(bit_wid)
 885             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 886             m.d.comb += pb.part.eq(parts[i])
 887             m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
 888             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 889             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 890             nbt.append(pb.nt)
 891             nlb.append(pb.nl)
 892
 893         # concatenate together and return all 4 results.
 894         m.d.comb += [not_a_term.eq(Cat(*nat)),
 895                      not_b_term.eq(Cat(*nbt)),
 896                      neg_lsb_a_term.eq(Cat(*nla)),
 897                      neg_lsb_b_term.eq(Cat(*nlb)),
 898                     ]
 899
 900         return m
 901
 902
 903 class IntermediateOut(Elaboratable):
 904     """ selects the HI/LO part of the multiplication, for a given bit-width
 905         the output is also reconstructed in its SIMD (partition) lanes.
 906     """
 907     def __init__(self, width, out_wid, n_parts):
 908         self.width = width
 909         self.n_parts = n_parts
 910         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 911                                      for i in range(8)]
 912         self.intermed = Signal(out_wid, reset_less=True)
 913         self.output = Signal(out_wid//2, reset_less=True)
 914
 915     def elaborate(self, platform):
 916         m = Module()
 917
 918         ol = []
 919         w = self.width
 920         sel = w // 8
 921         for i in range(self.n_parts):
 922             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 923             m.d.comb += op.eq(
 924                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 925                     self.intermed.part(i * w*2, w),
 926                     self.intermed.part(i * w*2 + w, w)))
 927             ol.append(op)
 928         m.d.comb += self.output.eq(Cat(*ol))
 929
 930         return m
 931
 932
 933 class FinalOut(Elaboratable):
 934     """ selects the final output based on the partitioning.
 935
 936         each byte is selectable independently, i.e. it is possible
 937         that some partitions requested 8-bit computation whilst others
 938         requested 16 or 32 bit.
 939     """
 940     def __init__(self, out_wid):
 941         # inputs
 942         self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
 943         self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
 944         self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
 945
 946         self.i8 = Signal(out_wid, reset_less=True)
 947         self.i16 = Signal(out_wid, reset_less=True)
 948         self.i32 = Signal(out_wid, reset_less=True)
 949         self.i64 = Signal(out_wid, reset_less=True)
 950
 951         # output
 952         self.out = Signal(out_wid, reset_less=True)
 953
 954     def elaborate(self, platform):
 955         m = Module()
 956         ol = []
 957         for i in range(8):
 958             # select one of the outputs: d8 selects i8, d16 selects i16
 959             # d32 selects i32, and the default is i64.
 960             # d8 and d16 are ORed together in the first Mux
 961             # then the 2nd selects either i8 or i16.
 962             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
 963             op = Signal(8, reset_less=True, name="op_%d" % i)
 964             m.d.comb += op.eq(
 965                 Mux(self.d8[i] | self.d16[i // 2],
 966                     Mux(self.d8[i], self.i8.part(i * 8, 8),
 967                                      self.i16.part(i * 8, 8)),
 968                     Mux(self.d32[i // 4], self.i32.part(i * 8, 8),
 969                                           self.i64.part(i * 8, 8))))
 970             ol.append(op)
 971         m.d.comb += self.out.eq(Cat(*ol))
 972         return m
 973
 974
 975 class OrMod(Elaboratable):
 976     """ ORs four values together in a hierarchical tree
 977     """
 978     def __init__(self, wid):
 979         self.wid = wid
 980         self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
 981                      for i in range(4)]
 982         self.orout = Signal(wid, reset_less=True)
 983
 984     def elaborate(self, platform):
 985         m = Module()
 986         or1 = Signal(self.wid, reset_less=True)
 987         or2 = Signal(self.wid, reset_less=True)
 988         m.d.comb += or1.eq(self.orin[0] | self.orin[1])
 989         m.d.comb += or2.eq(self.orin[2] | self.orin[3])
 990         m.d.comb += self.orout.eq(or1 | or2)
 991
 992         return m
 993
 994
 995 class Signs(Elaboratable):
 996     """ determines whether a or b are signed numbers
 997         based on the required operation type (OP_MUL_*)
 998     """
 999
1000     def __init__(self):
1001         self.part_ops = Signal(2, reset_less=True)
1002         self.a_signed = Signal(reset_less=True)
1003         self.b_signed = Signal(reset_less=True)
1004
1005     def elaborate(self, platform):
1006
1007         m = Module()
1008
1009         asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1010         bsig = (self.part_ops == OP_MUL_LOW) \
1011                     | (self.part_ops == OP_MUL_SIGNED_HIGH)
1012         m.d.comb += self.a_signed.eq(asig)
1013         m.d.comb += self.b_signed.eq(bsig)
1014
1015         return m
1016
1017
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    Supports partitioning into any combination of 8, 16, 32, and 64-bit
    partitions on naturally-aligned boundaries. Supports the operation being
    set for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    :attribute a: first 64-bit operand.
    :attribute b: second 64-bit operand.
    :attribute output: the 64-bit result.
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
            flip-flops are to be inserted.
        """

        # parameter(s)
        self.register_levels = list(register_levels)

        # inputs: a partition point at every byte boundary inside 64 bits
        self.part_pts = PartitionPoints()
        for point in range(8, 64, 8):
            self.part_pts[point] = Signal(name=f"part_pts_{point}")
        self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
        self.a = Signal(64)
        self.b = Signal(64)

        # intermediates (needed for unit tests)
        self._intermediate_output = Signal(128)

        # output
        self.output = Signal(64)

    def elaborate(self, platform):
        m = Module()

        # collect part-bytes
        pbs = Signal(8, reset_less=True)
        byte_flags = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            m.d.comb += pb.eq(self.part_pts.part_byte(i))
            byte_flags.append(pb)
        m.d.comb += pbs.eq(Cat(*byte_flags))

        # create (doubled) PartitionPoints (output is double input width)
        expanded_part_pts = PartitionPoints()
        for point, enable in self.part_pts.items():
            doubled = point * 2
            ep = Signal(name=f"expanded_part_pts_{doubled}", reset_less=True)
            expanded_part_pts[doubled] = ep
            m.d.comb += ep.eq(enable)

        # one signedness decoder per byte's operation code
        signs = []
        for i in range(8):
            decoder = Signs()
            signs.append(decoder)
            setattr(m.submodules, "signs%d" % i, decoder)
            m.d.comb += decoder.part_ops.eq(self.part_ops[i])

        # sign-correction term generators, one per partition size
        n_levels = len(self.register_levels)+1
        m.submodules.part_8 = part_8 = Part(expanded_part_pts, 128, 8,
                                            n_levels, 8)
        m.submodules.part_16 = part_16 = Part(expanded_part_pts, 128, 4,
                                              n_levels, 8)
        m.submodules.part_32 = part_32 = Part(expanded_part_pts, 128, 2,
                                              n_levels, 8)
        m.submodules.part_64 = part_64 = Part(expanded_part_pts, 128, 1,
                                              n_levels, 8)
        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for mod in (part_8, part_16, part_32, part_64):
            m.d.comb += [mod.a.eq(self.a), mod.b.eq(self.b)]
            for decoder, a_sgn, b_sgn in zip(signs, mod.a_signed,
                                             mod.b_signed):
                m.d.comb += [a_sgn.eq(decoder.a_signed),
                             b_sgn.eq(decoder.b_signed)]
            m.d.comb += mod.pbs.eq(pbs)
            nat_l.append(mod.not_a_term)
            nbt_l.append(mod.not_b_term)
            nla_l.append(mod.neg_lsb_a_term)
            nlb_l.append(mod.neg_lsb_b_term)

        # product terms: one bank per "a" index, all fed into the adder
        terms = []
        for a_index in range(8):
            bank = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, bank)

            m.d.comb += [bank.a.eq(self.a),
                         bank.b.eq(self.b),
                         bank.pb_en.eq(pbs)]

            terms.extend(bank.terms)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        m.submodules.nat_or = nat_or = OrMod(128)
        m.submodules.nbt_or = nbt_or = OrMod(128)
        m.submodules.nla_or = nla_or = OrMod(128)
        m.submodules.nlb_or = nlb_or = OrMod(128)
        for sources, or_mod in ((nat_l, nat_or),
                                (nbt_l, nbt_or),
                                (nla_l, nla_or),
                                (nlb_l, nlb_or)):
            m.d.comb += [dst.eq(src)
                         for dst, src in zip(or_mod.orin, sources)]
            terms.append(or_mod.orout)

        # sum every term
        add_reduce = AddReduce(terms,
                               128,
                               self.register_levels,
                               expanded_part_pts,
                               self.part_ops)

        out_part_ops = add_reduce.o.part_ops
        out_part_pts = add_reduce.o.reg_partition_points

        m.submodules.add_reduce = add_reduce
        m.d.comb += self._intermediate_output.eq(add_reduce.o.output)

        # create _output_64, _output_32, _output_16 and _output_8: the
        # HI/LO selection of the summed result at each partition size
        io_by_width = {}
        for lane_w, n_lanes in ((64, 1), (32, 2), (16, 4), (8, 8)):
            io = IntermediateOut(lane_w, 128, n_lanes)
            setattr(m.submodules, "io%d" % lane_w, io)
            m.d.comb += io.intermed.eq(self._intermediate_output)
            for i in range(8):
                m.d.comb += io.part_ops[i].eq(out_part_ops[i])
            io_by_width[lane_w] = io

        # lane enables for the output stage, taken from the partition
        # points that come out of the adder
        m.submodules.p_8 = p_8 = Parts(8, expanded_part_pts,
                                       len(part_8.parts))
        m.submodules.p_16 = p_16 = Parts(8, expanded_part_pts,
                                         len(part_16.parts))
        m.submodules.p_32 = p_32 = Parts(8, expanded_part_pts,
                                         len(part_32.parts))
        m.submodules.p_64 = p_64 = Parts(8, expanded_part_pts,
                                         len(part_64.parts))

        for lane_enables in (p_8, p_16, p_32, p_64):
            m.d.comb += lane_enables.epps.eq(out_part_pts)

        # final output
        m.submodules.finalout = finalout = FinalOut(64)
        for selects, lane_enables in ((finalout.d8, p_8),
                                      (finalout.d16, p_16),
                                      (finalout.d32, p_32)):
            m.d.comb += [sel.eq(flag)
                         for sel, flag in zip(selects, lane_enables.parts)]
        m.d.comb += [finalout.i8.eq(io_by_width[8].output),
                     finalout.i16.eq(io_by_width[16].output),
                     finalout.i32.eq(io_by_width[32].output),
                     finalout.i64.eq(io_by_width[64].output)]
        m.d.comb += self.output.eq(finalout.out)

        return m
1200
1201
if __name__ == "__main__":
    # build a multiplier with the default register_levels and pass it to
    # the nmigen command-line entry point, listing every input/output
    # signal as a port.
    m = Mul8_16_32_64()
    ports = [m.a, m.b, m._intermediate_output, m.output]
    ports.extend(m.part_ops)
    ports.extend(m.part_pts.values())
    main(m, ports=ports)