1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
4
5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
6 from nmigen.hdl.ast import Assign
7 from abc import ABCMeta, abstractmethod
8 from nmigen.cli import main
9 from functools import reduce
10 from operator import or_
11
12
13 class PartitionPoints(dict):
14 """Partition points and corresponding ``Value``s.
15
16 The points at which an ALU is partitioned, along with ``Value``s that
17 specify whether the corresponding partition points are enabled.
18
19 For example: ``{1: True, 5: True, 10: True}`` with
20 ``width == 16`` specifies that the ALU is split into 4 sections:
21 * bits 0 <= ``i`` < 1
22 * bits 1 <= ``i`` < 5
23 * bits 5 <= ``i`` < 10
24 * bits 10 <= ``i`` < 16
25
26 If the partition_points were instead ``{1: True, 5: a, 10: True}``
27 where ``a`` is a 1-bit ``Signal``:
28 * If ``a`` is asserted:
29 * bits 0 <= ``i`` < 1
30 * bits 1 <= ``i`` < 5
31 * bits 5 <= ``i`` < 10
32 * bits 10 <= ``i`` < 16
33 * Otherwise
34 * bits 0 <= ``i`` < 1
35 * bits 1 <= ``i`` < 10
36 * bits 10 <= ``i`` < 16
37 """
38
39 def __init__(self, partition_points=None):
40 """Create a new ``PartitionPoints``.
41
42 :param partition_points: a mapping from partition-point positions to enable ``Value``s.
43 """
44 super().__init__()
45 if partition_points is not None:
46 for point, enabled in partition_points.items():
47 if not isinstance(point, int):
48 raise TypeError("point must be a non-negative integer")
49 if point < 0:
50 raise ValueError("point must be a non-negative integer")
51 self[point] = Value.wrap(enabled)
52
53 def like(self, name=None, src_loc_at=0, mul=1):
54 """Create a new ``PartitionPoints`` with ``Signal``s for all values.
55
56 :param name: the base name for the new ``Signal``s.
57 :param mul: a multiplication factor on the indices
58 """
59 if name is None:
60 name = Signal(src_loc_at=1+src_loc_at).name # get variable name
61 retval = PartitionPoints()
62 for point, enabled in self.items():
63 point *= mul
64 retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
65 return retval
66
67 def eq(self, rhs):
68 """Assign ``PartitionPoints`` using ``Signal.eq``."""
69 if set(self.keys()) != set(rhs.keys()):
70 raise ValueError("incompatible point set")
71 for point, enabled in self.items():
72 yield enabled.eq(rhs[point])
73
74 def as_mask(self, width):
75 """Create a bit-mask from `self`.
76
77 Each bit in the returned mask is clear only if the partition point at
78 the same bit-index is enabled.
79
80 :param width: the bit width of the resulting mask
81 """
82 bits = []
83 for i in range(width):
84 if i in self:
85 bits.append(~self[i])
86 else:
87 bits.append(True)
88 return Cat(*bits)
89
90 def get_max_partition_count(self, width):
91 """Get the maximum number of partitions.
92
93 Gets the number of partitions when all partition points are enabled.
94 """
95 retval = 1
96 for point in self.keys():
97 if point < width:
98 retval += 1
99 return retval
100
101 def fits_in_width(self, width):
102 """Check if all partition points are smaller than `width`."""
103 for point in self.keys():
104 if point >= width:
105 return False
106 return True
107
108 def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
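"""Get the enable ``Value`` for the partition point after byte ``index``.

``index`` -1 and 7 refer to the implicit boundaries at the very start
and end of the register, which are always treated as enabled.

:param index: byte index, in the range -1 to 7.
:param mfactor: multiplier on the stored bit positions, used when the
points have been "expanded" for a double-width output.
"""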
109 if index == -1 or index == 7:
110 return C(True, 1)
111 assert index >= 0 and index < 8
112 return self[(index * 8 + 8)*mfactor]
113
114
115 class FullAdder(Elaboratable):
116 """Full Adder.
117
118 :attribute in0: the first input
119 :attribute in1: the second input
120 :attribute in2: the third input
121 :attribute sum: the sum output
122 :attribute carry: the carry output
123
124 Rather than do individual full adders (and have an array of them,
125 which would be very slow to simulate), this module can specify the
126 bit width of the inputs and outputs: in effect it performs multiple
127 Full 3-2 Add operations "in parallel".
128 """
129
130 def __init__(self, width):
131 """Create a ``FullAdder``.
132
133 :param width: the bit width of the input and output
134 """
135 self.in0 = Signal(width)
136 self.in1 = Signal(width)
137 self.in2 = Signal(width)
138 self.sum = Signal(width)
139 self.carry = Signal(width)
140
141 def elaborate(self, platform):
142 """Elaborate this module."""
143 m = Module()
144 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
145 m.d.comb += self.carry.eq((self.in0 & self.in1)
146 | (self.in1 & self.in2)
147 | (self.in2 & self.in0))
148 return m
149
150
151 class MaskedFullAdder(Elaboratable):
152 """Masked Full Adder.
153
154 :attribute mask: the carry partition mask
155 :attribute in0: the first input
156 :attribute in1: the second input
157 :attribute in2: the third input
158 :attribute sum: the sum output
159 :attribute mcarry: the masked carry output
160
161 FullAdders are always used with a "mask" on the output. To keep
162 the graphviz "clean", this class performs the masking here rather
163 than inside a large for-loop.
164
165 See the following discussion as to why this is no longer derived
166 from FullAdder. Each carry is shifted here *before* being ANDed
167 with the mask, so that an AOI cell may be used (which is more
168 gate-efficient):
169 https://en.wikipedia.org/wiki/AND-OR-Invert
170 https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
171 """
172
173 def __init__(self, width):
174 """Create a ``MaskedFullAdder``.
175
176 :param width: the bit width of the input and output
177 """
178 self.width = width
179 self.mask = Signal(width, reset_less=True)
180 self.mcarry = Signal(width, reset_less=True)
181 self.in0 = Signal(width, reset_less=True)
182 self.in1 = Signal(width, reset_less=True)
183 self.in2 = Signal(width, reset_less=True)
184 self.sum = Signal(width, reset_less=True)
185
186 def elaborate(self, platform):
187 """Elaborate this module."""
188 m = Module()
189 s1 = Signal(self.width, reset_less=True)
190 s2 = Signal(self.width, reset_less=True)
191 s3 = Signal(self.width, reset_less=True)
192 c1 = Signal(self.width, reset_less=True)
193 c2 = Signal(self.width, reset_less=True)
194 c3 = Signal(self.width, reset_less=True)
195 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
196 m.d.comb += s1.eq(Cat(0, self.in0))
197 m.d.comb += s2.eq(Cat(0, self.in1))
198 m.d.comb += s3.eq(Cat(0, self.in2))
199 m.d.comb += c1.eq(s1 & s2 & self.mask)
200 m.d.comb += c2.eq(s2 & s3 & self.mask)
201 m.d.comb += c3.eq(s3 & s1 & self.mask)
202 m.d.comb += self.mcarry.eq(c1 | c2 | c3)
203 return m
204
205
206 class PartitionedAdder(Elaboratable):
207 """Partitioned Adder.
208
209 Performs the final add. The partition points are included in the
210 actual add (in one of the operands only), which causes a carry over
211 to the next bit. Then the final output *removes* the extra bits from
212 the result.
213
214 partition: .... P... P... P... P... (32 bits)
215 a : .... .... .... .... .... (32 bits)
216 b : .... .... .... .... .... (32 bits)
217 exp-a : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
218 exp-b : ....0....0....0....0.... (32 bits plus 4 zeros)
219 exp-o : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
220 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
221
222 :attribute width: the bit width of the input and output. Read-only.
223 :attribute a: the first input to the adder
224 :attribute b: the second input to the adder
225 :attribute output: the sum output
226 :attribute partition_points: the input partition points. Modification not
227 supported, except for by ``Signal.eq``.
228 """
229
230 def __init__(self, width, partition_points):
231 """Create a ``PartitionedAdder``.
232
233 :param width: the bit width of the input and output
234 :param partition_points: the input partition points
235 """
236 self.width = width
237 self.a = Signal(width)
238 self.b = Signal(width)
239 self.output = Signal(width)
240 self.partition_points = PartitionPoints(partition_points)
241 if not self.partition_points.fits_in_width(width):
242 raise ValueError("partition_points doesn't fit in width")
243 expanded_width = 0
244 for i in range(self.width):
245 if i in self.partition_points:
246 expanded_width += 1
247 expanded_width += 1
248 self._expanded_width = expanded_width
249 # XXX these have to remain here due to some horrible nmigen
250 # simulation bugs involving sync. it is *not* necessary to
251 # have them here, they should (under normal circumstances)
252 # be moved into elaborate, as they are entirely local
253 self._expanded_a = Signal(expanded_width) # includes extra part-points
254 self._expanded_b = Signal(expanded_width) # likewise.
255 self._expanded_o = Signal(expanded_width) # likewise.
256
257 def elaborate(self, platform):
258 """Elaborate this module."""
259 m = Module()
260 expanded_index = 0
261 # store bits in a list, use Cat later. graphviz is much cleaner
262 al, bl, ol, ea, eb, eo = [], [], [], [], [], []
263
264 # partition points are "breaks" (extra zeros or 1s) in what would
265 # otherwise be a massive long add. when the "break" points are 0,
266 # whatever is in it (in the output) is discarded. however when
267 # there is a "1", it causes a roll-over carry to the *next* bit.
268 # we still ignore the "break" bit in the [intermediate] output,
269 # however by that time we've got the effect that we wanted: the
270 # carry has been carried *over* the break point.
271
272 for i in range(self.width):
273 if i in self.partition_points:
274 # add extra bit set to 0 + 0 for enabled partition points
275 # and 1 + 0 for disabled partition points
276 ea.append(self._expanded_a[expanded_index])
277 al.append(~self.partition_points[i]) # add extra bit in a
278 eb.append(self._expanded_b[expanded_index])
279 bl.append(C(0)) # yes, add a zero
280 expanded_index += 1 # skip the extra point. NOT in the output
281 ea.append(self._expanded_a[expanded_index])
282 eb.append(self._expanded_b[expanded_index])
283 eo.append(self._expanded_o[expanded_index])
284 al.append(self.a[i])
285 bl.append(self.b[i])
286 ol.append(self.output[i])
287 expanded_index += 1
288
289 # combine above using Cat
290 m.d.comb += Cat(*ea).eq(Cat(*al))
291 m.d.comb += Cat(*eb).eq(Cat(*bl))
292 m.d.comb += Cat(*ol).eq(Cat(*eo))
293
294 # use only one addition to take advantage of look-ahead carry and
295 # special hardware on FPGAs
296 m.d.comb += self._expanded_o.eq(
297 self._expanded_a + self._expanded_b)
298 return m
299
300
301 FULL_ADDER_INPUT_COUNT = 3
302
303 class AddReduceData:
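"""Data passed between levels of the add-reduce tree.

:attribute inputs: the list of terms to be summed at this level.
:attribute reg_partition_points: the (registered) partition points.
:attribute part_ops: the per-byte operation codes, carried along unmodified.
"""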
304
305 def __init__(self, ppoints, n_inputs, output_width, n_parts):
306 self.part_ops = [Signal(2, name=f"part_ops_{i}")
307 for i in range(n_parts)]
308 self.inputs = [Signal(output_width, name=f"inputs[{i}]")
309 for i in range(n_inputs)]
310 self.reg_partition_points = ppoints.like()
311
312 def eq_from(self, reg_partition_points, inputs, part_ops):
313 return [self.reg_partition_points.eq(reg_partition_points)] + \
314 [self.inputs[i].eq(inputs[i])
315 for i in range(len(self.inputs))] + \
316 [self.part_ops[i].eq(part_ops[i])
317 for i in range(len(self.part_ops))]
318
319 def eq(self, rhs):
320 return self.eq_from(rhs.reg_partition_points, rhs.inputs, rhs.part_ops)
321
322
323 class FinalReduceData:
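"""Data produced by the final stage of the add-reduce tree.

:attribute output: the completed partitioned sum.
:attribute reg_partition_points: the (registered) partition points.
:attribute part_ops: the per-byte operation codes, carried along unmodified.
"""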
324
325 def __init__(self, ppoints, output_width, n_parts):
326 self.part_ops = [Signal(2, name=f"part_ops_{i}")
327 for i in range(n_parts)]
328 self.output = Signal(output_width)
329 self.reg_partition_points = ppoints.like()
330
331 def eq_from(self, reg_partition_points, output, part_ops):
332 return [self.reg_partition_points.eq(reg_partition_points)] + \
333 [self.output.eq(output)] + \
334 [self.part_ops[i].eq(part_ops[i])
335 for i in range(len(self.part_ops))]
336
337 def eq(self, rhs):
338 return self.eq_from(rhs.reg_partition_points, rhs.output, rhs.part_ops)
339
340
341 class FinalAdd(Elaboratable):
342 """ Final stage of add reduce
343 """
344
345 def __init__(self, n_inputs, output_width, n_parts, register_levels,
346 partition_points):
347 self.i = AddReduceData(partition_points, n_inputs,
348 output_width, n_parts)
349 self.o = FinalReduceData(partition_points, output_width, n_parts)
350 self.output_width = output_width
351 self.n_inputs = n_inputs
352 self.n_parts = n_parts
353 self.register_levels = list(register_levels)
354 self.partition_points = PartitionPoints(partition_points)
355 if not self.partition_points.fits_in_width(output_width):
356 raise ValueError("partition_points doesn't fit in output_width")
357
358 def elaborate(self, platform):
359 """Elaborate this module."""
360 m = Module()
361
362 output_width = self.output_width
363 output = Signal(output_width)
364 if self.n_inputs == 0:
365 # use 0 as the default output value
366 m.d.comb += output.eq(0)
367 elif self.n_inputs == 1:
368 # handle single input
369 m.d.comb += output.eq(self.i.inputs[0])
370 else:
371 # base case for adding 2 inputs
372 assert self.n_inputs == 2
373 adder = PartitionedAdder(output_width, self.i.reg_partition_points)
374 m.submodules.final_adder = adder
375 m.d.comb += adder.a.eq(self.i.inputs[0])
376 m.d.comb += adder.b.eq(self.i.inputs[1])
377 m.d.comb += output.eq(adder.output)
378
379 # create output
380 m.d.comb += self.o.eq_from(self.i.reg_partition_points, output,
381 self.i.part_ops)
382
383 return m
384
385
386 class AddReduceSingle(Elaboratable):
387 """Add list of numbers together.
388
389 :attribute inputs: input ``Signal``s to be summed. Modification not
390 supported, except for by ``Signal.eq``.
391 :attribute register_levels: List of nesting levels that should have
392 pipeline registers.
393 :attribute output: output sum.
394 :attribute partition_points: the input partition points. Modification not
395 supported, except for by ``Signal.eq``.
396 """
397
398 def __init__(self, n_inputs, output_width, n_parts, register_levels,
399 partition_points):
400 """Create an ``AddReduce``.
401
402 :param inputs: input ``Signal``s to be summed.
403 :param output_width: bit-width of ``output``.
404 :param register_levels: List of nesting levels that should have
405 pipeline registers.
406 :param partition_points: the input partition points.
407 """
408 self.n_inputs = n_inputs
409 self.n_parts = n_parts
410 self.output_width = output_width
411 self.i = AddReduceData(partition_points, n_inputs,
412 output_width, n_parts)
413 self.register_levels = list(register_levels)
414 self.partition_points = PartitionPoints(partition_points)
415 if not self.partition_points.fits_in_width(output_width):
416 raise ValueError("partition_points doesn't fit in output_width")
417
418 max_level = AddReduceSingle.get_max_level(n_inputs)
419 for level in self.register_levels:
420 if level > max_level:
421 raise ValueError(
422 "not enough adder levels for specified register levels")
423
424 # this is annoying. we have to create the modules (and terms)
425 # because we need to know what they are (in order to set up the
426 # interconnects back in AddReduce), but cannot do the m.d.comb +=
427 # etc because this is not in elaborate().
428 self.groups = AddReduceSingle.full_adder_groups(n_inputs)
429 self._intermediate_terms = []
430 if len(self.groups) != 0:
431 self.create_next_terms()
432
433 self.o = AddReduceData(partition_points, len(self._intermediate_terms),
434 output_width, n_parts)
435
436 @staticmethod
437 def get_max_level(input_count):
438 """Get the maximum level.
439
440 All ``register_levels`` must be less than or equal to the maximum
441 level.
442 """
443 retval = 0
444 while True:
445 groups = AddReduceSingle.full_adder_groups(input_count)
446 if len(groups) == 0:
447 return retval
448 input_count %= FULL_ADDER_INPUT_COUNT
449 input_count += 2 * len(groups)
450 retval += 1
451
452 @staticmethod
453 def full_adder_groups(input_count):
454 """Get ``inputs`` indices for which a full adder should be built."""
455 return range(0,
456 input_count - FULL_ADDER_INPUT_COUNT + 1,
457 FULL_ADDER_INPUT_COUNT)
458
459 def elaborate(self, platform):
460 """Elaborate this module."""
461 m = Module()
462
463 # copy the intermediate terms to the output
464 for i, value in enumerate(self._intermediate_terms):
465 m.d.comb += self.o.inputs[i].eq(value)
466
467 # copy reg part points and part ops to output
468 m.d.comb += self.o.reg_partition_points.eq(self.i.reg_partition_points)
469 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
470 for i in range(len(self.i.part_ops))]
471
472 # set up the partition mask (for the adders)
473 mask = self.i.reg_partition_points.as_mask(self.output_width)
474 m.d.comb += self.part_mask.eq(mask)
475
476 # add and link the intermediate term modules
477 for i, (iidx, adder_i) in enumerate(self.adders):
478 setattr(m.submodules, f"adder_{i}", adder_i)
479
480 m.d.comb += adder_i.in0.eq(self.i.inputs[iidx])
481 m.d.comb += adder_i.in1.eq(self.i.inputs[iidx + 1])
482 m.d.comb += adder_i.in2.eq(self.i.inputs[iidx + 2])
483 m.d.comb += adder_i.mask.eq(self.part_mask)
484
485 return m
486
487 def create_next_terms(self):
488
489 _intermediate_terms = []
490
491 def add_intermediate_term(value):
492 _intermediate_terms.append(value)
493
494 # store mask in intermediary (simplifies graph)
495 self.part_mask = Signal(self.output_width, reset_less=True)
496
497 # create full adders for this recursive level.
498 # this shrinks N terms to 2 * (N // 3) plus the remainder
499 self.adders = []
500 for i in self.groups:
501 adder_i = MaskedFullAdder(self.output_width)
502 self.adders.append((i, adder_i))
503 # add both the sum and the masked-carry to the next level.
504 # 3 inputs have now been reduced to 2...
505 add_intermediate_term(adder_i.sum)
506 add_intermediate_term(adder_i.mcarry)
507 # handle the remaining inputs.
508 if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
509 add_intermediate_term(self.i.inputs[-1])
510 elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
511 # Just pass the terms to the next layer, since we wouldn't gain
512 # anything by using a half adder since there would still be 2 terms
513 # and just passing the terms to the next layer saves gates.
514 add_intermediate_term(self.i.inputs[-2])
515 add_intermediate_term(self.i.inputs[-1])
516 else:
517 assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
518
519 self._intermediate_terms = _intermediate_terms
520
521
522 class AddReduce(Elaboratable):
523 """Recursively Add list of numbers together.
524
525 :attribute inputs: input ``Signal``s to be summed. Modification not
526 supported, except for by ``Signal.eq``.
527 :attribute register_levels: List of nesting levels that should have
528 pipeline registers.
529 :attribute o: the output data (a ``FinalReduceData``) containing the sum.
530 :attribute partition_points: the input partition points. Modification not
531 supported, except for by ``Signal.eq``.
532 """
533
534 def __init__(self, inputs, output_width, register_levels, partition_points,
535 part_ops):
536 """Create an ``AddReduce``.
537
538 :param inputs: input ``Signal``s to be summed.
539 :param output_width: bit-width of ``output``.
540 :param register_levels: List of nesting levels that should have
541 pipeline registers.
542 :param partition_points: the input partition points.
543 """
544 self.inputs = inputs
545 self.part_ops = part_ops
546 n_parts = len(part_ops)
547 self.o = FinalReduceData(partition_points, output_width, n_parts)
548 self.output_width = output_width
549 self.register_levels = register_levels
550 self.partition_points = partition_points
551
552 self.create_levels()
553
554 @staticmethod
555 def get_max_level(input_count):
556 return AddReduceSingle.get_max_level(input_count)
557
558 @staticmethod
559 def next_register_levels(register_levels):
560 """``Iterable`` of ``register_levels`` for next recursive level."""
561 for level in register_levels:
562 if level > 0:
563 yield level - 1
564
565 def create_levels(self):
566 """creates reduction levels"""
567
568 mods = []
569 next_levels = self.register_levels
570 partition_points = self.partition_points
571 part_ops = self.part_ops
572 n_parts = len(part_ops)
573 inputs = self.inputs
574 ilen = len(inputs)
575 while True:
576 next_level = AddReduceSingle(ilen, self.output_width, n_parts,
577 next_levels, partition_points)
578 mods.append(next_level)
579 next_levels = list(AddReduce.next_register_levels(next_levels))
580 partition_points = next_level.i.reg_partition_points
581 inputs = next_level.o.inputs
582 ilen = len(inputs)
583 part_ops = next_level.i.part_ops
584 groups = AddReduceSingle.full_adder_groups(len(inputs))
585 if len(groups) == 0:
586 break
587
588 if ilen != 0:
589 next_level = FinalAdd(ilen, self.output_width, n_parts,
590 next_levels, partition_points)
591 mods.append(next_level)
592
593 self.levels = mods
594
595 def elaborate(self, platform):
596 """Elaborate this module."""
597 m = Module()
598
599 for i, next_level in enumerate(self.levels):
600 setattr(m.submodules, "next_level%d" % i, next_level)
601
602 partition_points = self.partition_points
603 inputs = self.inputs
604 part_ops = self.part_ops
605 n_parts = len(part_ops)
606 n_inputs = len(inputs)
607 output_width = self.output_width
608 i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
609 m.d.comb += i.eq_from(partition_points, inputs, part_ops)
610 for idx in range(len(self.levels)):
611 mcur = self.levels[idx]
612 if 0 in mcur.register_levels:
613 m.d.sync += mcur.i.eq(i)
614 else:
615 m.d.comb += mcur.i.eq(i)
616 i = mcur.o # for next loop
617
618 print ("levels", len(self.levels), i)
619 # output comes from last module
620 m.d.comb += self.o.eq(i)
621
622 return m
623
624
625 OP_MUL_LOW = 0
626 OP_MUL_SIGNED_HIGH = 1
627 OP_MUL_SIGNED_UNSIGNED_HIGH = 2 # a is signed, b is unsigned
628 OP_MUL_UNSIGNED_HIGH = 3
629
630
631 def get_term(value, shift=0, enabled=None):
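"""Optionally gate ``value`` with ``enabled``, then shift it left by
``shift`` bits (by concatenating zeros below it), positioning a partial
product at its correct weight within the full-width term.
"""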
632 if enabled is not None:
633 value = Mux(enabled, value, 0)
634 if shift > 0:
635 value = Cat(Repl(C(0, 1), shift), value)
636 else:
637 assert shift == 0
638 return value
639
640
641 class ProductTerm(Elaboratable):
642 """ this class creates a single product term (a[..]*b[..]).
643 it has a design flaw in that it is the *output* that is selected,
644 whilst the multiplication(s) are combinatorially generated
645 all the time.
646 """
647
648 def __init__(self, width, twidth, pbwid, a_index, b_index):
649 self.a_index = a_index
650 self.b_index = b_index
651 shift = 8 * (self.a_index + self.b_index)
652 self.pwidth = width
653 self.twidth = twidth
654 self.width = width*2
655 self.shift = shift
656
657 self.ti = Signal(self.width, reset_less=True)
658 self.term = Signal(twidth, reset_less=True)
659 self.a = Signal(twidth//2, reset_less=True)
660 self.b = Signal(twidth//2, reset_less=True)
661 self.pb_en = Signal(pbwid, reset_less=True)
662
663 self.tl = tl = []
664 min_index = min(self.a_index, self.b_index)
665 max_index = max(self.a_index, self.b_index)
666 for i in range(min_index, max_index):
667 tl.append(self.pb_en[i])
668 name = "te_%d_%d" % (self.a_index, self.b_index)
669 if len(tl) > 0:
670 term_enabled = Signal(name=name, reset_less=True)
671 else:
672 term_enabled = None
673 self.enabled = term_enabled
674 self.term.name = "term_%d_%d" % (a_index, b_index) # rename
675
676 def elaborate(self, platform):
677
678 m = Module()
679 if self.enabled is not None:
680 m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
681
682 bsa = Signal(self.width, reset_less=True)
683 bsb = Signal(self.width, reset_less=True)
684 a_index, b_index = self.a_index, self.b_index
685 pwidth = self.pwidth
686 m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
687 m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
688 m.d.comb += self.ti.eq(bsa * bsb)
689 m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
690 """
691 #TODO: sort out width issues, get inputs a/b switched on/off.
692 #data going into Muxes is 1/2 the required width
693
694 pwidth = self.pwidth
695 width = self.width
696 bsa = Signal(self.twidth//2, reset_less=True)
697 bsb = Signal(self.twidth//2, reset_less=True)
698 asel = Signal(width, reset_less=True)
699 bsel = Signal(width, reset_less=True)
700 a_index, b_index = self.a_index, self.b_index
701 m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
702 m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
703 m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
704 m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
705 m.d.comb += self.ti.eq(bsa * bsb)
706 m.d.comb += self.term.eq(self.ti)
707 """
708
709 return m
710
711
712 class ProductTerms(Elaboratable):
713 """ creates a bank of product terms. also performs the actual bit-selection
714 this class is to be wrapped with a for-loop on the "a" operand.
715 it creates a second-level for-loop on the "b" operand.
716 """
717 def __init__(self, width, twidth, pbwid, a_index, blen):
718 self.a_index = a_index
719 self.blen = blen
720 self.pwidth = width
721 self.twidth = twidth
722 self.pbwid = pbwid
723 self.a = Signal(twidth//2, reset_less=True)
724 self.b = Signal(twidth//2, reset_less=True)
725 self.pb_en = Signal(pbwid, reset_less=True)
726 self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
727 for i in range(blen)]
728
729 def elaborate(self, platform):
730
731 m = Module()
732
733 for b_index in range(self.blen):
734 t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
735 self.a_index, b_index)
736 setattr(m.submodules, "term_%d" % b_index, t)
737
738 m.d.comb += t.a.eq(self.a)
739 m.d.comb += t.b.eq(self.b)
740 m.d.comb += t.pb_en.eq(self.pb_en)
741
742 m.d.comb += self.terms[b_index].eq(t.term)
743
744 return m
745
746
747 class LSBNegTerm(Elaboratable):
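"""Generates the "bit-inverted plus one" terms for one partition.

``part & msb & signed`` gates both outputs: ``nt`` is the one's
complement of ``op`` placed in the upper half of the double-width
result, and ``nl`` is the matching +1 in the LSB of that upper half.
Together they contribute (~op + 1) << bit_width, i.e. -op << bit_width.
See ``Part`` for how these are combined with the partial products.
"""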
748
749 def __init__(self, bit_width):
750 self.bit_width = bit_width
751 self.part = Signal(reset_less=True)
752 self.signed = Signal(reset_less=True)
753 self.op = Signal(bit_width, reset_less=True)
754 self.msb = Signal(reset_less=True)
755 self.nt = Signal(bit_width*2, reset_less=True)
756 self.nl = Signal(bit_width*2, reset_less=True)
757
758 def elaborate(self, platform):
759 m = Module()
760 comb = m.d.comb
761 bit_wid = self.bit_width
762 ext = Repl(0, bit_wid) # extend output to HI part
763
764 # determine sign of each incoming number *in this partition*
765 enabled = Signal(reset_less=True)
766 m.d.comb += enabled.eq(self.part & self.msb & self.signed)
767
768 # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
769 # negation operation is split into a bitwise not and a +1.
770 # likewise for 16, 32, and 64-bit values.
771
772 # width-extended 1s complement if a is signed, otherwise zero
773 comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
774
775 # add 1 if signed, otherwise add zero
776 comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
777
778 return m
779
780
781 class Parts(Elaboratable):
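"""Decodes which lanes of a given element width are actual partitions.

Given the (expanded) partition points, ``parts[i]`` is asserted when
the i-th lane of ``8 // n_parts`` bytes is bounded by enabled partition
points at both ends and has no enabled point inside it, i.e. when that
lane is operating at exactly this element width.
"""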
782
783 def __init__(self, pbwid, epps, n_parts):
784 self.pbwid = pbwid
785 # inputs
786 self.epps = PartitionPoints.like(epps, name="epps") # expanded points
787 # outputs
788 self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
789
790 def elaborate(self, platform):
791 m = Module()
792
793 epps, parts = self.epps, self.parts
794 # collect part-bytes (double factor because the input is extended)
795 pbs = Signal(self.pbwid, reset_less=True)
796 tl = []
797 for i in range(self.pbwid):
798 pb = Signal(name="pb%d" % i, reset_less=True)
799 m.d.comb += pb.eq(epps.part_byte(i, mfactor=2)) # double
800 tl.append(pb)
801 m.d.comb += pbs.eq(Cat(*tl))
802
803 # negated-temporary copy of partition bits
804 npbs = Signal.like(pbs, reset_less=True)
805 m.d.comb += npbs.eq(~pbs)
806 byte_count = 8 // len(parts)
807 for i in range(len(parts)):
808 pbl = []
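# note: for i == 0 this wraps round to npbs[-1], i.e. npbs[7];
# part_byte(7) is always 1 (the implicit end boundary), so the
# inverted bit contributes 0, exactly as the implicit boundary
# before byte 0 should.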
809 pbl.append(npbs[i * byte_count - 1])
810 for j in range(i * byte_count, (i + 1) * byte_count - 1):
811 pbl.append(pbs[j])
812 pbl.append(npbs[(i + 1) * byte_count - 1])
813 value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
814 m.d.comb += value.eq(Cat(*pbl))
815 m.d.comb += parts[i].eq(~(value).bool())
816
817 return m
818
819
820 class Part(Elaboratable):
821 """ a key class which, depending on the partitioning, will determine
822 what action to take when parts of the output are signed or unsigned.
823
824 this requires 2 pieces of data *per operand, per partition*:
825 whether the MSB is HI/LO (per partition!), and whether a signed
826 or unsigned operation has been *requested*.
827
828 once that is determined, signed is basically carried out
829 by splitting 2's complement into 1's complement plus one.
830 1's complement is just a bit-inversion.
831
832 the extra terms - as separate terms - are then thrown at the
833 AddReduce alongside the multiplication part-results.
834 """
835 def __init__(self, epps, width, n_parts, n_levels, pbwid):
836
837 self.pbwid = pbwid
838 self.epps = epps
839
840 # inputs
841 self.a = Signal(64)
842 self.b = Signal(64)
843 self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
844 self.b_signed = [Signal(name=f"b_signed_{i}") for i in range(8)]
845 self.pbs = Signal(pbwid, reset_less=True)
846
847 # outputs
848 self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
849
850 self.not_a_term = Signal(width)
851 self.neg_lsb_a_term = Signal(width)
852 self.not_b_term = Signal(width)
853 self.neg_lsb_b_term = Signal(width)
854
855 def elaborate(self, platform):
856 m = Module()
857
858 pbs, parts = self.pbs, self.parts
859 epps = self.epps
860 m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
861 m.d.comb += p.epps.eq(epps)
862 parts = p.parts
863
864 byte_count = 8 // len(parts)
865
866 not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
867 self.not_a_term, self.neg_lsb_a_term,
868 self.not_b_term, self.neg_lsb_b_term)
869
870 byte_width = 8 // len(parts) # byte width
871 bit_wid = 8 * byte_width # bit width
872 nat, nbt, nla, nlb = [], [], [], []
873 for i in range(len(parts)):
874 # work out bit-inverted and +1 term for a.
875 pa = LSBNegTerm(bit_wid)
876 setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
877 m.d.comb += pa.part.eq(parts[i])
878 m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
879 m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
880 m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
881 nat.append(pa.nt)
882 nla.append(pa.nl)
883
884 # work out bit-inverted and +1 term for b
885 pb = LSBNegTerm(bit_wid)
886 setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
887 m.d.comb += pb.part.eq(parts[i])
888 m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
889 m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
890 m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
891 nbt.append(pb.nt)
892 nlb.append(pb.nl)
893
894 # concatenate together and return all 4 results.
895 m.d.comb += [not_a_term.eq(Cat(*nat)),
896 not_b_term.eq(Cat(*nbt)),
897 neg_lsb_a_term.eq(Cat(*nla)),
898 neg_lsb_b_term.eq(Cat(*nlb)),
899 ]
900
901 return m
902
903
904 class IntermediateOut(Elaboratable):
905 """ selects the HI/LO part of the multiplication, for a given bit-width
906 the output is also reconstructed in its SIMD (partition) lanes.
907 """
908 def __init__(self, width, out_wid, n_parts):
909 self.width = width
910 self.n_parts = n_parts
911 self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
912 for i in range(8)]
913 self.intermed = Signal(out_wid, reset_less=True)
914 self.output = Signal(out_wid//2, reset_less=True)
915
916 def elaborate(self, platform):
917 m = Module()
918
919 ol = []
920 w = self.width
921 sel = w // 8
922 for i in range(self.n_parts):
923 op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
924 m.d.comb += op.eq(
925 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
926 self.intermed.part(i * w*2, w),
927 self.intermed.part(i * w*2 + w, w)))
928 ol.append(op)
929 m.d.comb += self.output.eq(Cat(*ol))
930
931 return m
932
933
934 class FinalOut(Elaboratable):
935 """ selects the final output based on the partitioning.
936
937 each byte is selectable independently, i.e. it is possible
938 that some partitions requested 8-bit computation whilst others
939 requested 16 or 32 bit.
940 """
941 def __init__(self, out_wid):
942 # inputs
943 self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
944 self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
945 self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
946
947 self.i8 = Signal(out_wid, reset_less=True)
948 self.i16 = Signal(out_wid, reset_less=True)
949 self.i32 = Signal(out_wid, reset_less=True)
950 self.i64 = Signal(out_wid, reset_less=True)
951
952 # output
953 self.out = Signal(out_wid, reset_less=True)
954
955 def elaborate(self, platform):
956 m = Module()
957 ol = []
958 for i in range(8):
959 # select one of the outputs: d8 selects i8, d16 selects i16
960 # d32 selects i32, and the default is i64.
961 # d8 and d16 are ORed together in the first Mux
962 # then the 2nd selects either i8 or i16.
963 # if neither d8 nor d16 are set, d32 selects either i32 or i64.
964 op = Signal(8, reset_less=True, name="op_%d" % i)
965 m.d.comb += op.eq(
966 Mux(self.d8[i] | self.d16[i // 2],
967 Mux(self.d8[i], self.i8.part(i * 8, 8),
968 self.i16.part(i * 8, 8)),
969 Mux(self.d32[i // 4], self.i32.part(i * 8, 8),
970 self.i64.part(i * 8, 8))))
971 ol.append(op)
972 m.d.comb += self.out.eq(Cat(*ol))
973 return m
974
975
976 class OrMod(Elaboratable):
977 """ ORs four values together in a hierarchical tree
978 """
979 def __init__(self, wid):
980 self.wid = wid
981 self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
982 for i in range(4)]
983 self.orout = Signal(wid, reset_less=True)
984
985 def elaborate(self, platform):
986 m = Module()
987 or1 = Signal(self.wid, reset_less=True)
988 or2 = Signal(self.wid, reset_less=True)
989 m.d.comb += or1.eq(self.orin[0] | self.orin[1])
990 m.d.comb += or2.eq(self.orin[2] | self.orin[3])
991 m.d.comb += self.orout.eq(or1 | or2)
992
993 return m
994
995
996 class Signs(Elaboratable):
997 """ determines whether a or b are signed numbers
998 based on the required operation type (OP_MUL_*)
999 """
1000
1001 def __init__(self):
1002 self.part_ops = Signal(2, reset_less=True)
1003 self.a_signed = Signal(reset_less=True)
1004 self.b_signed = Signal(reset_less=True)
1005
1006 def elaborate(self, platform):
1007
1008 m = Module()
1009
1010 asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1011 bsig = (self.part_ops == OP_MUL_LOW) \
1012 | (self.part_ops == OP_MUL_SIGNED_HIGH)
1013 m.d.comb += self.a_signed.eq(asig)
1014 m.d.comb += self.b_signed.eq(bsig)
1015
1016 return m
1017
1018
1019 class Mul8_16_32_64(Elaboratable):
1020 """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1021
1022 Supports partitioning into any combination of 8, 16, 32, and 64-bit
1023 partitions on naturally-aligned boundaries. Supports the operation being
1024 set for each partition independently.
1025
1026 :attribute part_pts: the input partition points. Has a partition point at
1027 multiples of 8 in 0 < i < 64. Each partition point's associated
1028 ``Value`` is a ``Signal``. Modification not supported, except for by
1029 ``Signal.eq``.
1030 :attribute part_ops: the operation for each byte. The operation for a
1031 particular partition is selected by assigning the selected operation
1032 code to each byte in the partition. The allowed operation codes are:
1033
1034 :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1035 RISC-V's `mul` instruction.
1036 :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1037 ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1038 instruction.
1039 :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1040 where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1041 `mulhsu` instruction.
1042 :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1043 ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1044 instruction.
1045 """
1046
1047 def __init__(self, register_levels=()):
1048 """ register_levels: specifies the points in the cascade at which
1049 flip-flops are to be inserted.
1050 """
1051
1052 # parameter(s)
1053 self.register_levels = list(register_levels)
1054
1055 # inputs
1056 self.part_pts = PartitionPoints()
1057 for i in range(8, 64, 8):
1058 self.part_pts[i] = Signal(name=f"part_pts_{i}")
1059 self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1060 self.a = Signal(64)
1061 self.b = Signal(64)
1062
1063 # intermediates (needed for unit tests)
1064 self._intermediate_output = Signal(128)
1065
1066 # output
1067 self.output = Signal(64)
1068
1069 def elaborate(self, platform):
1070 m = Module()
1071
1072 # collect part-bytes
1073 pbs = Signal(8, reset_less=True)
1074 tl = []
1075 for i in range(8):
1076 pb = Signal(name="pb%d" % i, reset_less=True)
1077 m.d.comb += pb.eq(self.part_pts.part_byte(i))
1078 tl.append(pb)
1079 m.d.comb += pbs.eq(Cat(*tl))
1080
1081 # create (doubled) PartitionPoints (output is double input width)
1082 expanded_part_pts = eps = PartitionPoints()
1083 for i, v in self.part_pts.items():
1084 ep = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
1085 expanded_part_pts[i * 2] = ep
1086 m.d.comb += ep.eq(v)
1087
1088 # local variables
1089 signs = []
1090 for i in range(8):
1091 s = Signs()
1092 signs.append(s)
1093 setattr(m.submodules, "signs%d" % i, s)
1094 m.d.comb += s.part_ops.eq(self.part_ops[i])
1095
1096 n_levels = len(self.register_levels)+1
1097 m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1098 m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1099 m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1100 m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1101 nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1102 for mod in [part_8, part_16, part_32, part_64]:
1103 m.d.comb += mod.a.eq(self.a)
1104 m.d.comb += mod.b.eq(self.b)
1105 for i in range(len(signs)):
1106 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1107 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1108 m.d.comb += mod.pbs.eq(pbs)
1109 nat_l.append(mod.not_a_term)
1110 nbt_l.append(mod.not_b_term)
1111 nla_l.append(mod.neg_lsb_a_term)
1112 nlb_l.append(mod.neg_lsb_b_term)
1113
1114 terms = []
1115
1116 for a_index in range(8):
1117 t = ProductTerms(8, 128, 8, a_index, 8)
1118 setattr(m.submodules, "terms_%d" % a_index, t)
1119
1120 m.d.comb += t.a.eq(self.a)
1121 m.d.comb += t.b.eq(self.b)
1122 m.d.comb += t.pb_en.eq(pbs)
1123
1124 for term in t.terms:
1125 terms.append(term)
1126
1127 # it's fine to bitwise-or data together since they are never enabled
1128 # at the same time
1129 m.submodules.nat_or = nat_or = OrMod(128)
1130 m.submodules.nbt_or = nbt_or = OrMod(128)
1131 m.submodules.nla_or = nla_or = OrMod(128)
1132 m.submodules.nlb_or = nlb_or = OrMod(128)
1133 for l, mod in [(nat_l, nat_or),
1134 (nbt_l, nbt_or),
1135 (nla_l, nla_or),
1136 (nlb_l, nlb_or)]:
1137 for i in range(len(l)):
1138 m.d.comb += mod.orin[i].eq(l[i])
1139 terms.append(mod.orout)
1140
1141 add_reduce = AddReduce(terms,
1142 128,
1143 self.register_levels,
1144 expanded_part_pts,
1145 self.part_ops)
1146
1147 out_part_ops = add_reduce.o.part_ops
1148 out_part_pts = add_reduce.o.reg_partition_points
1149
1150 m.submodules.add_reduce = add_reduce
1151 m.d.comb += self._intermediate_output.eq(add_reduce.o.output)
1152 # create _output_64
1153 m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1154 m.d.comb += io64.intermed.eq(self._intermediate_output)
1155 for i in range(8):
1156 m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1157
1158 # create _output_32
1159 m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1160 m.d.comb += io32.intermed.eq(self._intermediate_output)
1161 for i in range(8):
1162 m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1163
1164 # create _output_16
1165 m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1166 m.d.comb += io16.intermed.eq(self._intermediate_output)
1167 for i in range(8):
1168 m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1169
1170 # create _output_8
1171 m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1172 m.d.comb += io8.intermed.eq(self._intermediate_output)
1173 for i in range(8):
1174 m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1175
1176 m.submodules.p_8 = p_8 = Parts(8, eps, len(part_8.parts))
1177 m.submodules.p_16 = p_16 = Parts(8, eps, len(part_16.parts))
1178 m.submodules.p_32 = p_32 = Parts(8, eps, len(part_32.parts))
1179 m.submodules.p_64 = p_64 = Parts(8, eps, len(part_64.parts))
1180
1181 m.d.comb += p_8.epps.eq(out_part_pts)
1182 m.d.comb += p_16.epps.eq(out_part_pts)
1183 m.d.comb += p_32.epps.eq(out_part_pts)
1184 m.d.comb += p_64.epps.eq(out_part_pts)
1185
1186 # final output
1187 m.submodules.finalout = finalout = FinalOut(64)
1188 for i in range(len(part_8.parts)):
1189 m.d.comb += finalout.d8[i].eq(p_8.parts[i])
1190 for i in range(len(part_16.parts)):
1191 m.d.comb += finalout.d16[i].eq(p_16.parts[i])
1192 for i in range(len(part_32.parts)):
1193 m.d.comb += finalout.d32[i].eq(p_32.parts[i])
1194 m.d.comb += finalout.i8.eq(io8.output)
1195 m.d.comb += finalout.i16.eq(io16.output)
1196 m.d.comb += finalout.i32.eq(io32.output)
1197 m.d.comb += finalout.i64.eq(io64.output)
1198 m.d.comb += self.output.eq(finalout.out)
1199
1200 return m
1201
1202
1203 if __name__ == "__main__":
1204 m = Mul8_16_32_64()
1205 main(m, ports=[m.a,
1206 m.b,
1207 m._intermediate_output,
1208 m.output,
1209 *m.part_ops,
1210 *m.part_pts.values()])