src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         """
  82         bits = []
  83         for i in range(width):
  84             if i in self:
  85                 bits.append(~self[i])
  86             else:
  87                 bits.append(True)
  88         return Cat(*bits)
  89
  90     def get_max_partition_count(self, width):
  91         """Get the maximum number of partitions.
  92
  93         Gets the number of partitions when all partition points are enabled.
  94         """
  95         retval = 1
  96         for point in self.keys():
  97             if point < width:
  98                 retval += 1
  99         return retval
 100
 101     def fits_in_width(self, width):
 102         """Check if all partition points are smaller than `width`."""
 103         for point in self.keys():
 104             if point >= width:
 105                 return False
 106         return True
 107
 108     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 109         if index == -1 or index == 7:
 110             return C(True, 1)
 111         assert index >= 0 and index < 8
 112         return self[(index * 8 + 8)*mfactor]
 113
 114
 115 class FullAdder(Elaboratable):
 116     """Full Adder.
 117
 118     :attribute in0: the first input
 119     :attribute in1: the second input
 120     :attribute in2: the third input
 121     :attribute sum: the sum output
 122     :attribute carry: the carry output
 123
 124     Rather than do individual full adders (and have an array of them,
 125     which would be very slow to simulate), this module can specify the
 126     bit width of the inputs and outputs: in effect it performs multiple
 127     Full 3-2 Add operations "in parallel".
 128     """
 129
 130     def __init__(self, width):
 131         """Create a ``FullAdder``.
 132
 133         :param width: the bit width of the input and output
 134         """
 135         self.in0 = Signal(width)
 136         self.in1 = Signal(width)
 137         self.in2 = Signal(width)
 138         self.sum = Signal(width)
 139         self.carry = Signal(width)
 140
 141     def elaborate(self, platform):
 142         """Elaborate this module."""
 143         m = Module()
 144         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 145         m.d.comb += self.carry.eq((self.in0 & self.in1)
 146                                   | (self.in1 & self.in2)
 147                                   | (self.in2 & self.in0))
 148         return m
 149
 150
 151 class MaskedFullAdder(Elaboratable):
 152     """Masked Full Adder.
 153
 154     :attribute mask: the carry partition mask
 155     :attribute in0: the first input
 156     :attribute in1: the second input
 157     :attribute in2: the third input
 158     :attribute sum: the sum output
 159     :attribute mcarry: the masked carry output
 160
 161     FullAdders are always used with a "mask" on the output.  To keep
 162     the graphviz "clean", this class performs the masking here rather
 163     than inside a large for-loop.
 164
 165     See the following discussion as to why this is no longer derived
 166     from FullAdder.  Each carry is shifted here *before* being ANDed
 167     with the mask, so that an AOI cell may be used (which is more
 168     gate-efficient)
 169     https://en.wikipedia.org/wiki/AND-OR-Invert
 170     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 171     """
 172
 173     def __init__(self, width):
 174         """Create a ``MaskedFullAdder``.
 175
 176         :param width: the bit width of the input and output
 177         """
 178         self.width = width
 179         self.mask = Signal(width, reset_less=True)
 180         self.mcarry = Signal(width, reset_less=True)
 181         self.in0 = Signal(width, reset_less=True)
 182         self.in1 = Signal(width, reset_less=True)
 183         self.in2 = Signal(width, reset_less=True)
 184         self.sum = Signal(width, reset_less=True)
 185
 186     def elaborate(self, platform):
 187         """Elaborate this module."""
 188         m = Module()
 189         s1 = Signal(self.width, reset_less=True)
 190         s2 = Signal(self.width, reset_less=True)
 191         s3 = Signal(self.width, reset_less=True)
 192         c1 = Signal(self.width, reset_less=True)
 193         c2 = Signal(self.width, reset_less=True)
 194         c3 = Signal(self.width, reset_less=True)
 195         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 196         m.d.comb += s1.eq(Cat(0, self.in0))
 197         m.d.comb += s2.eq(Cat(0, self.in1))
 198         m.d.comb += s3.eq(Cat(0, self.in2))
 199         m.d.comb += c1.eq(s1 & s2 & self.mask)
 200         m.d.comb += c2.eq(s2 & s3 & self.mask)
 201         m.d.comb += c3.eq(s3 & s1 & self.mask)
 202         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 203         return m
 204
 205
 206 class PartitionedAdder(Elaboratable):
 207     """Partitioned Adder.
 208
 209     Performs the final add.  The partition points are included in the
 210     actual add (in one of the operands only), which causes a carry over
 211     to the next bit.  Then the final output *removes* the extra bits from
 212     the result.
 213
 214     partition: .... P... P... P... P... (32 bits)
 215     a        : .... .... .... .... .... (32 bits)
 216     b        : .... .... .... .... .... (32 bits)
 217     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 218     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 219     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 220     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 221
 222     :attribute width: the bit width of the input and output. Read-only.
 223     :attribute a: the first input to the adder
 224     :attribute b: the second input to the adder
 225     :attribute output: the sum output
 226     :attribute partition_points: the input partition points. Modification not
 227         supported, except for by ``Signal.eq``.
 228     """
 229
 230     def __init__(self, width, partition_points):
 231         """Create a ``PartitionedAdder``.
 232
 233         :param width: the bit width of the input and output
 234         :param partition_points: the input partition points
 235         """
 236         self.width = width
 237         self.a = Signal(width)
 238         self.b = Signal(width)
 239         self.output = Signal(width)
 240         self.partition_points = PartitionPoints(partition_points)
 241         if not self.partition_points.fits_in_width(width):
 242             raise ValueError("partition_points doesn't fit in width")
 243         expanded_width = 0
 244         for i in range(self.width):
 245             if i in self.partition_points:
 246                 expanded_width += 1
 247             expanded_width += 1
 248         self._expanded_width = expanded_width
 249         # XXX these have to remain here due to some horrible nmigen
 250         # simulation bugs involving sync.  it is *not* necessary to
 251         # have them here, they should (under normal circumstances)
 252         # be moved into elaborate, as they are entirely local
 253         self._expanded_a = Signal(expanded_width) # includes extra part-points
 254         self._expanded_b = Signal(expanded_width) # likewise.
 255         self._expanded_o = Signal(expanded_width) # likewise.
 256
 257     def elaborate(self, platform):
 258         """Elaborate this module."""
 259         m = Module()
 260         expanded_index = 0
 261         # store bits in a list, use Cat later.  graphviz is much cleaner
 262         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 263
 264         # partition points are "breaks" (extra zeros or 1s) in what would
 265         # otherwise be a massive long add.  when the "break" points are 0,
 266         # whatever is in it (in the output) is discarded.  however when
 267         # there is a "1", it causes a roll-over carry to the *next* bit.
 268         # we still ignore the "break" bit in the [intermediate] output,
 269         # however by that time we've got the effect that we wanted: the
 270         # carry has been carried *over* the break point.
 271
 272         for i in range(self.width):
 273             if i in self.partition_points:
 274                 # add extra bit set to 0 + 0 for enabled partition points
 275                 # and 1 + 0 for disabled partition points
 276                 ea.append(self._expanded_a[expanded_index])
 277                 al.append(~self.partition_points[i]) # add extra bit in a
 278                 eb.append(self._expanded_b[expanded_index])
 279                 bl.append(C(0)) # yes, add a zero
 280                 expanded_index += 1 # skip the extra point.  NOT in the output
 281             ea.append(self._expanded_a[expanded_index])
 282             eb.append(self._expanded_b[expanded_index])
 283             eo.append(self._expanded_o[expanded_index])
 284             al.append(self.a[i])
 285             bl.append(self.b[i])
 286             ol.append(self.output[i])
 287             expanded_index += 1
 288
 289         # combine above using Cat
 290         m.d.comb += Cat(*ea).eq(Cat(*al))
 291         m.d.comb += Cat(*eb).eq(Cat(*bl))
 292         m.d.comb += Cat(*ol).eq(Cat(*eo))
 293
 294         # use only one addition to take advantage of look-ahead carry and
 295         # special hardware on FPGAs
 296         m.d.comb += self._expanded_o.eq(
 297             self._expanded_a + self._expanded_b)
 298         return m
 299
 300
 301 FULL_ADDER_INPUT_COUNT = 3
 302
 303 class AddReduceData:
 304
 305     def __init__(self, ppoints, n_inputs, output_width, n_parts):
 306         self.part_ops = [Signal(2, name=f"part_ops_{i}")
 307                           for i in range(n_parts)]
 308         self.inputs = [Signal(output_width, name=f"inputs[{i}]")
 309             for i in range(n_inputs)]
 310         self.reg_partition_points = ppoints.like()
 311
 312     def eq_from(self, reg_partition_points, inputs, part_ops):
 313         return [self.reg_partition_points.eq(reg_partition_points)] + \
 314                [self.inputs[i].eq(inputs[i])
 315                                      for i in range(len(self.inputs))] + \
 316                [self.part_ops[i].eq(part_ops[i])
 317                                      for i in range(len(self.part_ops))]
 318
 319     def eq(self, rhs):
 320         return self.eq_from(rhs.reg_partition_points, rhs.inputs, rhs.part_ops)
 321
 322
 323 class FinalReduceData:
 324
 325     def __init__(self, ppoints, output_width, n_parts):
 326         self.part_ops = [Signal(2, name=f"part_ops_{i}")
 327                           for i in range(n_parts)]
 328         self.output = Signal(output_width)
 329         self.reg_partition_points = ppoints.like()
 330
 331     def eq_from(self, reg_partition_points, output, part_ops):
 332         return [self.reg_partition_points.eq(reg_partition_points)] + \
 333                [self.output.eq(output)] + \
 334                [self.part_ops[i].eq(part_ops[i])
 335                                      for i in range(len(self.part_ops))]
 336
 337     def eq(self, rhs):
 338         return self.eq_from(rhs.reg_partition_points, rhs.output, rhs.part_ops)
 339
 340
 341 class FinalAdd(Elaboratable):
 342     """ Final stage of add reduce
 343     """
 344
 345     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 346                        partition_points):
 347         self.i = AddReduceData(partition_points, n_inputs,
 348                                output_width, n_parts)
 349         self.o = FinalReduceData(partition_points, output_width, n_parts)
 350         self.output_width = output_width
 351         self.n_inputs = n_inputs
 352         self.n_parts = n_parts
 353         self.register_levels = list(register_levels)
 354         self.partition_points = PartitionPoints(partition_points)
 355         if not self.partition_points.fits_in_width(output_width):
 356             raise ValueError("partition_points doesn't fit in output_width")
 357
 358     def elaborate(self, platform):
 359         """Elaborate this module."""
 360         m = Module()
 361
 362         output_width = self.output_width
 363         output = Signal(output_width)
 364         if self.n_inputs == 0:
 365             # use 0 as the default output value
 366             m.d.comb += output.eq(0)
 367         elif self.n_inputs == 1:
 368             # handle single input
 369             m.d.comb += output.eq(self.i.inputs[0])
 370         else:
 371             # base case for adding 2 inputs
 372             assert self.n_inputs == 2
 373             adder = PartitionedAdder(output_width, self.i.reg_partition_points)
 374             m.submodules.final_adder = adder
 375             m.d.comb += adder.a.eq(self.i.inputs[0])
 376             m.d.comb += adder.b.eq(self.i.inputs[1])
 377             m.d.comb += output.eq(adder.output)
 378
 379         # create output
 380         m.d.comb += self.o.eq_from(self.i.reg_partition_points, output,
 381                                    self.i.part_ops)
 382
 383         return m
 384
 385
 386 class AddReduceSingle(Elaboratable):
 387     """Add list of numbers together.
 388
 389     :attribute inputs: input ``Signal``s to be summed. Modification not
 390         supported, except for by ``Signal.eq``.
 391     :attribute register_levels: List of nesting levels that should have
 392         pipeline registers.
 393     :attribute output: output sum.
 394     :attribute partition_points: the input partition points. Modification not
 395         supported, except for by ``Signal.eq``.
 396     """
 397
 398     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 399                        partition_points):
 400         """Create an ``AddReduce``.
 401
 402         :param inputs: input ``Signal``s to be summed.
 403         :param output_width: bit-width of ``output``.
 404         :param register_levels: List of nesting levels that should have
 405             pipeline registers.
 406         :param partition_points: the input partition points.
 407         """
 408         self.n_inputs = n_inputs
 409         self.n_parts = n_parts
 410         self.output_width = output_width
 411         self.i = AddReduceData(partition_points, n_inputs,
 412                                output_width, n_parts)
 413         self.register_levels = list(register_levels)
 414         self.partition_points = PartitionPoints(partition_points)
 415         if not self.partition_points.fits_in_width(output_width):
 416             raise ValueError("partition_points doesn't fit in output_width")
 417
 418         max_level = AddReduceSingle.get_max_level(n_inputs)
 419         for level in self.register_levels:
 420             if level > max_level:
 421                 raise ValueError(
 422                     "not enough adder levels for specified register levels")
 423
 424         # this is annoying.  we have to create the modules (and terms)
 425         # because we need to know what they are (in order to set up the
 426         # interconnects back in AddReduce), but cannot do the m.d.comb +=
 427         # etc because this is not in elaboratable.
 428         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 429         self._intermediate_terms = []
 430         if len(self.groups) != 0:
 431             self.create_next_terms()
 432
 433         self.o = AddReduceData(partition_points, len(self._intermediate_terms),
 434                                output_width, n_parts)
 435
 436     @staticmethod
 437     def get_max_level(input_count):
 438         """Get the maximum level.
 439
 440         All ``register_levels`` must be less than or equal to the maximum
 441         level.
 442         """
 443         retval = 0
 444         while True:
 445             groups = AddReduceSingle.full_adder_groups(input_count)
 446             if len(groups) == 0:
 447                 return retval
 448             input_count %= FULL_ADDER_INPUT_COUNT
 449             input_count += 2 * len(groups)
 450             retval += 1
 451
 452     @staticmethod
 453     def full_adder_groups(input_count):
 454         """Get ``inputs`` indices for which a full adder should be built."""
 455         return range(0,
 456                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 457                      FULL_ADDER_INPUT_COUNT)
 458
 459     def elaborate(self, platform):
 460         """Elaborate this module."""
 461         m = Module()
 462
 463         # copy the intermediate terms to the output
 464         for i, value in enumerate(self._intermediate_terms):
 465             m.d.comb += self.o.inputs[i].eq(value)
 466
 467         # copy reg part points and part ops to output
 468         m.d.comb += self.o.reg_partition_points.eq(self.i.reg_partition_points)
 469         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 470                                      for i in range(len(self.i.part_ops))]
 471
 472         # set up the partition mask (for the adders)
 473         mask = self.i.reg_partition_points.as_mask(self.output_width)
 474         m.d.comb += self.part_mask.eq(mask)
 475
 476         # add and link the intermediate term modules
 477         for i, (iidx, adder_i) in enumerate(self.adders):
 478             setattr(m.submodules, f"adder_{i}", adder_i)
 479
 480             m.d.comb += adder_i.in0.eq(self.i.inputs[iidx])
 481             m.d.comb += adder_i.in1.eq(self.i.inputs[iidx + 1])
 482             m.d.comb += adder_i.in2.eq(self.i.inputs[iidx + 2])
 483             m.d.comb += adder_i.mask.eq(self.part_mask)
 484
 485         return m
 486
 487     def create_next_terms(self):
 488
 489         _intermediate_terms = []
 490
 491         def add_intermediate_term(value):
 492             _intermediate_terms.append(value)
 493
 494         # store mask in intermediary (simplifies graph)
 495         self.part_mask = Signal(self.output_width, reset_less=True)
 496
 497         # create full adders for this recursive level.
 498         # this shrinks N terms to 2 * (N // 3) plus the remainder
 499         self.adders = []
 500         for i in self.groups:
 501             adder_i = MaskedFullAdder(self.output_width)
 502             self.adders.append((i, adder_i))
 503             # add both the sum and the masked-carry to the next level.
 504             # 3 inputs have now been reduced to 2...
 505             add_intermediate_term(adder_i.sum)
 506             add_intermediate_term(adder_i.mcarry)
 507         # handle the remaining inputs.
 508         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 509             add_intermediate_term(self.i.inputs[-1])
 510         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 511             # Just pass the terms to the next layer, since we wouldn't gain
 512             # anything by using a half adder since there would still be 2 terms
 513             # and just passing the terms to the next layer saves gates.
 514             add_intermediate_term(self.i.inputs[-2])
 515             add_intermediate_term(self.i.inputs[-1])
 516         else:
 517             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 518
 519         self._intermediate_terms = _intermediate_terms
 520
 521
 522 class AddReduce(Elaboratable):
 523     """Recursively Add list of numbers together.
 524
 525     :attribute inputs: input ``Signal``s to be summed. Modification not
 526         supported, except for by ``Signal.eq``.
 527     :attribute register_levels: List of nesting levels that should have
 528         pipeline registers.
 529     :attribute output: output sum.
 530     :attribute partition_points: the input partition points. Modification not
 531         supported, except for by ``Signal.eq``.
 532     """
 533
 534     def __init__(self, inputs, output_width, register_levels, partition_points,
 535                        part_ops):
 536         """Create an ``AddReduce``.
 537
 538         :param inputs: input ``Signal``s to be summed.
 539         :param output_width: bit-width of ``output``.
 540         :param register_levels: List of nesting levels that should have
 541             pipeline registers.
 542         :param partition_points: the input partition points.
 543         """
 544         self.inputs = inputs
 545         self.part_ops = part_ops
 546         self.out_part_ops = [Signal(2, name=f"out_part_ops_{i}")
 547                           for i in range(len(part_ops))]
 548         self.output = Signal(output_width)
 549         self.output_width = output_width
 550         self.register_levels = register_levels
 551         self.partition_points = partition_points
 552
 553         self.create_levels()
 554
 555     @staticmethod
 556     def get_max_level(input_count):
 557         return AddReduceSingle.get_max_level(input_count)
 558
 559     @staticmethod
 560     def next_register_levels(register_levels):
 561         """``Iterable`` of ``register_levels`` for next recursive level."""
 562         for level in register_levels:
 563             if level > 0:
 564                 yield level - 1
 565
 566     def create_levels(self):
 567         """creates reduction levels"""
 568
 569         mods = []
 570         next_levels = self.register_levels
 571         partition_points = self.partition_points
 572         part_ops = self.part_ops
 573         n_parts = len(part_ops)
 574         inputs = self.inputs
 575         ilen = len(inputs)
 576         while True:
 577             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 578                                          next_levels, partition_points)
 579             mods.append(next_level)
 580             next_levels = list(AddReduce.next_register_levels(next_levels))
 581             partition_points = next_level.i.reg_partition_points
 582             inputs = next_level.o.inputs
 583             ilen = len(inputs)
 584             part_ops = next_level.i.part_ops
 585             groups = AddReduceSingle.full_adder_groups(len(inputs))
 586             if len(groups) == 0:
 587                 break
 588
 589         if ilen != 0:
 590             next_level = FinalAdd(ilen, self.output_width, n_parts,
 591                                   next_levels, partition_points)
 592             mods.append(next_level)
 593
 594         self.levels = mods
 595
 596     def elaborate(self, platform):
 597         """Elaborate this module."""
 598         m = Module()
 599
 600         for i, next_level in enumerate(self.levels):
 601             setattr(m.submodules, "next_level%d" % i, next_level)
 602
 603         partition_points = self.partition_points
 604         inputs = self.inputs
 605         part_ops = self.part_ops
 606         n_parts = len(part_ops)
 607         n_inputs = len(inputs)
 608         output_width = self.output_width
 609         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 610         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 611         for idx in range(len(self.levels)):
 612             mcur = self.levels[idx]
 613             if 0 in mcur.register_levels:
 614                 m.d.sync += mcur.i.eq(i)
 615             else:
 616                 m.d.comb += mcur.i.eq(i)
 617             i = mcur.o # for next loop
 618
 619         # output comes from last module
 620         m.d.comb += self.output.eq(i.output)
 621         copy_part_ops = [self.out_part_ops[idx].eq(i.part_ops[idx])
 622                                      for idx in range(len(self.part_ops))]
 623         m.d.comb += copy_part_ops
 624
 625         return m
 626
 627
 628 OP_MUL_LOW = 0
 629 OP_MUL_SIGNED_HIGH = 1
 630 OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
 631 OP_MUL_UNSIGNED_HIGH = 3
 632
 633
 634 def get_term(value, shift=0, enabled=None):
 635     if enabled is not None:
 636         value = Mux(enabled, value, 0)
 637     if shift > 0:
 638         value = Cat(Repl(C(0, 1), shift), value)
 639     else:
 640         assert shift == 0
 641     return value
 642
 643
 644 class ProductTerm(Elaboratable):
 645     """ this class creates a single product term (a[..]*b[..]).
 646         it has a design flaw in that is the *output* that is selected,
 647         where the multiplication(s) are combinatorially generated
 648         all the time.
 649     """
 650
 651     def __init__(self, width, twidth, pbwid, a_index, b_index):
 652         self.a_index = a_index
 653         self.b_index = b_index
 654         shift = 8 * (self.a_index + self.b_index)
 655         self.pwidth = width
 656         self.twidth = twidth
 657         self.width = width*2
 658         self.shift = shift
 659
 660         self.ti = Signal(self.width, reset_less=True)
 661         self.term = Signal(twidth, reset_less=True)
 662         self.a = Signal(twidth//2, reset_less=True)
 663         self.b = Signal(twidth//2, reset_less=True)
 664         self.pb_en = Signal(pbwid, reset_less=True)
 665
 666         self.tl = tl = []
 667         min_index = min(self.a_index, self.b_index)
 668         max_index = max(self.a_index, self.b_index)
 669         for i in range(min_index, max_index):
 670             tl.append(self.pb_en[i])
 671         name = "te_%d_%d" % (self.a_index, self.b_index)
 672         if len(tl) > 0:
 673             term_enabled = Signal(name=name, reset_less=True)
 674         else:
 675             term_enabled = None
 676         self.enabled = term_enabled
 677         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 678
 679     def elaborate(self, platform):
 680
 681         m = Module()
 682         if self.enabled is not None:
 683             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 684
 685         bsa = Signal(self.width, reset_less=True)
 686         bsb = Signal(self.width, reset_less=True)
 687         a_index, b_index = self.a_index, self.b_index
 688         pwidth = self.pwidth
 689         m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
 690         m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
 691         m.d.comb += self.ti.eq(bsa * bsb)
 692         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 693         """
 694         #TODO: sort out width issues, get inputs a/b switched on/off.
 695         #data going into Muxes is 1/2 the required width
 696
 697         pwidth = self.pwidth
 698         width = self.width
 699         bsa = Signal(self.twidth//2, reset_less=True)
 700         bsb = Signal(self.twidth//2, reset_less=True)
 701         asel = Signal(width, reset_less=True)
 702         bsel = Signal(width, reset_less=True)
 703         a_index, b_index = self.a_index, self.b_index
 704         m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
 705         m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
 706         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 707         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 708         m.d.comb += self.ti.eq(bsa * bsb)
 709         m.d.comb += self.term.eq(self.ti)
 710         """
 711
 712         return m
 713
 714
 715 class ProductTerms(Elaboratable):
 716     """ creates a bank of product terms.  also performs the actual bit-selection
 717         this class is to be wrapped with a for-loop on the "a" operand.
 718         it creates a second-level for-loop on the "b" operand.
 719     """
 720     def __init__(self, width, twidth, pbwid, a_index, blen):
 721         self.a_index = a_index
 722         self.blen = blen
 723         self.pwidth = width
 724         self.twidth = twidth
 725         self.pbwid = pbwid
 726         self.a = Signal(twidth//2, reset_less=True)
 727         self.b = Signal(twidth//2, reset_less=True)
 728         self.pb_en = Signal(pbwid, reset_less=True)
 729         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 730                             for i in range(blen)]
 731
 732     def elaborate(self, platform):
 733
 734         m = Module()
 735
 736         for b_index in range(self.blen):
 737             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 738                             self.a_index, b_index)
 739             setattr(m.submodules, "term_%d" % b_index, t)
 740
 741             m.d.comb += t.a.eq(self.a)
 742             m.d.comb += t.b.eq(self.b)
 743             m.d.comb += t.pb_en.eq(self.pb_en)
 744
 745             m.d.comb += self.terms[b_index].eq(t.term)
 746
 747         return m
 748
 749
 750 class LSBNegTerm(Elaboratable):
 751
 752     def __init__(self, bit_width):
 753         self.bit_width = bit_width
 754         self.part = Signal(reset_less=True)
 755         self.signed = Signal(reset_less=True)
 756         self.op = Signal(bit_width, reset_less=True)
 757         self.msb = Signal(reset_less=True)
 758         self.nt = Signal(bit_width*2, reset_less=True)
 759         self.nl = Signal(bit_width*2, reset_less=True)
 760
 761     def elaborate(self, platform):
 762         m = Module()
 763         comb = m.d.comb
 764         bit_wid = self.bit_width
 765         ext = Repl(0, bit_wid) # extend output to HI part
 766
 767         # determine sign of each incoming number *in this partition*
 768         enabled = Signal(reset_less=True)
 769         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 770
 771         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 772         # negation operation is split into a bitwise not and a +1.
 773         # likewise for 16, 32, and 64-bit values.
 774
 775         # width-extended 1s complement if a is signed, otherwise zero
 776         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 777
 778         # add 1 if signed, otherwise add zero
 779         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 780
 781         return m
 782
 783
 784 class Parts(Elaboratable):
 785
 786     def __init__(self, pbwid, epps, n_parts):
 787         self.pbwid = pbwid
 788         # inputs
 789         self.epps = PartitionPoints.like(epps, name="epps") # expanded points
 790         # outputs
 791         self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
 792
 793     def elaborate(self, platform):
 794         m = Module()
 795
 796         epps, parts = self.epps, self.parts
 797         # collect part-bytes (double factor because the input is extended)
 798         pbs = Signal(self.pbwid, reset_less=True)
 799         tl = []
 800         for i in range(self.pbwid):
 801             pb = Signal(name="pb%d" % i, reset_less=True)
 802             m.d.comb += pb.eq(epps.part_byte(i, mfactor=2)) # double
 803             tl.append(pb)
 804         m.d.comb += pbs.eq(Cat(*tl))
 805
 806         # negated-temporary copy of partition bits
 807         npbs = Signal.like(pbs, reset_less=True)
 808         m.d.comb += npbs.eq(~pbs)
 809         byte_count = 8 // len(parts)
 810         for i in range(len(parts)):
 811             pbl = []
 812             pbl.append(npbs[i * byte_count - 1])
 813             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 814                 pbl.append(pbs[j])
 815             pbl.append(npbs[(i + 1) * byte_count - 1])
 816             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 817             m.d.comb += value.eq(Cat(*pbl))
 818             m.d.comb += parts[i].eq(~(value).bool())
 819
 820         return m
 821
 822
 823 class Part(Elaboratable):
 824     """ a key class which, depending on the partitioning, will determine
 825         what action to take when parts of the output are signed or unsigned.
 826
 827         this requires 2 pieces of data *per operand, per partition*:
 828         whether the MSB is HI/LO (per partition!), and whether a signed
 829         or unsigned operation has been *requested*.
 830
 831         once that is determined, signed is basically carried out
 832         by splitting 2's complement into 1's complement plus one.
 833         1's complement is just a bit-inversion.
 834
 835         the extra terms - as separate terms - are then thrown at the
 836         AddReduce alongside the multiplication part-results.
 837     """
 838     def __init__(self, epps, width, n_parts, n_levels, pbwid):
 839
 840         self.pbwid = pbwid
 841         self.epps = epps
 842
 843         # inputs
 844         self.a = Signal(64)
 845         self.b = Signal(64)
 846         self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
 847         self.b_signed = [Signal(name=f"_b_signed_{i}") for i in range(8)]
 848         self.pbs = Signal(pbwid, reset_less=True)
 849
 850         # outputs
 851         self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
 852
 853         self.not_a_term = Signal(width)
 854         self.neg_lsb_a_term = Signal(width)
 855         self.not_b_term = Signal(width)
 856         self.neg_lsb_b_term = Signal(width)
 857
 858     def elaborate(self, platform):
 859         m = Module()
 860
 861         pbs, parts = self.pbs, self.parts
 862         epps = self.epps
 863         m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
 864         m.d.comb += p.epps.eq(epps)
 865         parts = p.parts
 866
 867         byte_count = 8 // len(parts)
 868
 869         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 870                 self.not_a_term, self.neg_lsb_a_term,
 871                 self.not_b_term, self.neg_lsb_b_term)
 872
 873         byte_width = 8 // len(parts) # byte width
 874         bit_wid = 8 * byte_width     # bit width
 875         nat, nbt, nla, nlb = [], [], [], []
 876         for i in range(len(parts)):
 877             # work out bit-inverted and +1 term for a.
 878             pa = LSBNegTerm(bit_wid)
 879             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 880             m.d.comb += pa.part.eq(parts[i])
 881             m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
 882             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 883             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 884             nat.append(pa.nt)
 885             nla.append(pa.nl)
 886
 887             # work out bit-inverted and +1 term for b
 888             pb = LSBNegTerm(bit_wid)
 889             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 890             m.d.comb += pb.part.eq(parts[i])
 891             m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
 892             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 893             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 894             nbt.append(pb.nt)
 895             nlb.append(pb.nl)
 896
 897         # concatenate together and return all 4 results.
 898         m.d.comb += [not_a_term.eq(Cat(*nat)),
 899                      not_b_term.eq(Cat(*nbt)),
 900                      neg_lsb_a_term.eq(Cat(*nla)),
 901                      neg_lsb_b_term.eq(Cat(*nlb)),
 902                     ]
 903
 904         return m
 905
 906
 907 class IntermediateOut(Elaboratable):
 908     """ selects the HI/LO part of the multiplication, for a given bit-width
 909         the output is also reconstructed in its SIMD (partition) lanes.
 910     """
 911     def __init__(self, width, out_wid, n_parts):
 912         self.width = width
 913         self.n_parts = n_parts
 914         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 915                                      for i in range(8)]
 916         self.intermed = Signal(out_wid, reset_less=True)
 917         self.output = Signal(out_wid//2, reset_less=True)
 918
 919     def elaborate(self, platform):
 920         m = Module()
 921
 922         ol = []
 923         w = self.width
 924         sel = w // 8
 925         for i in range(self.n_parts):
 926             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 927             m.d.comb += op.eq(
 928                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 929                     self.intermed.part(i * w*2, w),
 930                     self.intermed.part(i * w*2 + w, w)))
 931             ol.append(op)
 932         m.d.comb += self.output.eq(Cat(*ol))
 933
 934         return m
 935
 936
 937 class FinalOut(Elaboratable):
 938     """ selects the final output based on the partitioning.
 939
 940         each byte is selectable independently, i.e. it is possible
 941         that some partitions requested 8-bit computation whilst others
 942         requested 16 or 32 bit.
 943     """
 944     def __init__(self, out_wid):
 945         # inputs
 946         self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
 947         self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
 948         self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
 949
 950         self.i8 = Signal(out_wid, reset_less=True)
 951         self.i16 = Signal(out_wid, reset_less=True)
 952         self.i32 = Signal(out_wid, reset_less=True)
 953         self.i64 = Signal(out_wid, reset_less=True)
 954
 955         # output
 956         self.out = Signal(out_wid, reset_less=True)
 957
 958     def elaborate(self, platform):
 959         m = Module()
 960         ol = []
 961         for i in range(8):
 962             # select one of the outputs: d8 selects i8, d16 selects i16
 963             # d32 selects i32, and the default is i64.
 964             # d8 and d16 are ORed together in the first Mux
 965             # then the 2nd selects either i8 or i16.
 966             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
 967             op = Signal(8, reset_less=True, name="op_%d" % i)
 968             m.d.comb += op.eq(
 969                 Mux(self.d8[i] | self.d16[i // 2],
 970                     Mux(self.d8[i], self.i8.part(i * 8, 8),
 971                                      self.i16.part(i * 8, 8)),
 972                     Mux(self.d32[i // 4], self.i32.part(i * 8, 8),
 973                                           self.i64.part(i * 8, 8))))
 974             ol.append(op)
 975         m.d.comb += self.out.eq(Cat(*ol))
 976         return m
 977
 978
 979 class OrMod(Elaboratable):
 980     """ ORs four values together in a hierarchical tree
 981     """
 982     def __init__(self, wid):
 983         self.wid = wid
 984         self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
 985                      for i in range(4)]
 986         self.orout = Signal(wid, reset_less=True)
 987
 988     def elaborate(self, platform):
 989         m = Module()
 990         or1 = Signal(self.wid, reset_less=True)
 991         or2 = Signal(self.wid, reset_less=True)
 992         m.d.comb += or1.eq(self.orin[0] | self.orin[1])
 993         m.d.comb += or2.eq(self.orin[2] | self.orin[3])
 994         m.d.comb += self.orout.eq(or1 | or2)
 995
 996         return m
 997
 998
 999 class Signs(Elaboratable):
1000     """ determines whether a or b are signed numbers
1001         based on the required operation type (OP_MUL_*)
1002     """
1003
1004     def __init__(self):
1005         self.part_ops = Signal(2, reset_less=True)
1006         self.a_signed = Signal(reset_less=True)
1007         self.b_signed = Signal(reset_less=True)
1008
1009     def elaborate(self, platform):
1010
1011         m = Module()
1012
1013         asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1014         bsig = (self.part_ops == OP_MUL_LOW) \
1015                     | (self.part_ops == OP_MUL_SIGNED_HIGH)
1016         m.d.comb += self.a_signed.eq(asig)
1017         m.d.comb += self.b_signed.eq(bsig)
1018
1019         return m
1020
1021
1022 class Mul8_16_32_64(Elaboratable):
1023     """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1024
1025     Supports partitioning into any combination of 8, 16, 32, and 64-bit
1026     partitions on naturally-aligned boundaries. Supports the operation being
1027     set for each partition independently.
1028
1029     :attribute part_pts: the input partition points. Has a partition point at
1030         multiples of 8 in 0 < i < 64. Each partition point's associated
1031         ``Value`` is a ``Signal``. Modification not supported, except for by
1032         ``Signal.eq``.
1033     :attribute part_ops: the operation for each byte. The operation for a
1034         particular partition is selected by assigning the selected operation
1035         code to each byte in the partition. The allowed operation codes are:
1036
1037         :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1038             RISC-V's `mul` instruction.
1039         :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1040             ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1041             instruction.
1042         :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1043             where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1044             `mulhsu` instruction.
1045         :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1046             ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1047             instruction.
1048     """
1049
1050     def __init__(self, register_levels=()):
1051         """ register_levels: specifies the points in the cascade at which
1052             flip-flops are to be inserted.
1053         """
1054
1055         # parameter(s)
1056         self.register_levels = list(register_levels)
1057
1058         # inputs
1059         self.part_pts = PartitionPoints()
1060         for i in range(8, 64, 8):
1061             self.part_pts[i] = Signal(name=f"part_pts_{i}")
1062         self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1063         self.a = Signal(64)
1064         self.b = Signal(64)
1065
1066         # intermediates (needed for unit tests)
1067         self._intermediate_output = Signal(128)
1068
1069         # output
1070         self.output = Signal(64)
1071
1072     def elaborate(self, platform):
1073         m = Module()
1074
1075         # collect part-bytes
1076         pbs = Signal(8, reset_less=True)
1077         tl = []
1078         for i in range(8):
1079             pb = Signal(name="pb%d" % i, reset_less=True)
1080             m.d.comb += pb.eq(self.part_pts.part_byte(i))
1081             tl.append(pb)
1082         m.d.comb += pbs.eq(Cat(*tl))
1083
1084         # create (doubled) PartitionPoints (output is double input width)
1085         expanded_part_pts = eps = PartitionPoints()
1086         for i, v in self.part_pts.items():
1087             ep = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
1088             expanded_part_pts[i * 2] = ep
1089             m.d.comb += ep.eq(v)
1090
1091         # local variables
1092         signs = []
1093         for i in range(8):
1094             s = Signs()
1095             signs.append(s)
1096             setattr(m.submodules, "signs%d" % i, s)
1097             m.d.comb += s.part_ops.eq(self.part_ops[i])
1098
1099         n_levels = len(self.register_levels)+1
1100         m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1101         m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1102         m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1103         m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1104         nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1105         for mod in [part_8, part_16, part_32, part_64]:
1106             m.d.comb += mod.a.eq(self.a)
1107             m.d.comb += mod.b.eq(self.b)
1108             for i in range(len(signs)):
1109                 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1110                 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1111             m.d.comb += mod.pbs.eq(pbs)
1112             nat_l.append(mod.not_a_term)
1113             nbt_l.append(mod.not_b_term)
1114             nla_l.append(mod.neg_lsb_a_term)
1115             nlb_l.append(mod.neg_lsb_b_term)
1116
1117         terms = []
1118
1119         for a_index in range(8):
1120             t = ProductTerms(8, 128, 8, a_index, 8)
1121             setattr(m.submodules, "terms_%d" % a_index, t)
1122
1123             m.d.comb += t.a.eq(self.a)
1124             m.d.comb += t.b.eq(self.b)
1125             m.d.comb += t.pb_en.eq(pbs)
1126
1127             for term in t.terms:
1128                 terms.append(term)
1129
1130         # it's fine to bitwise-or data together since they are never enabled
1131         # at the same time
1132         m.submodules.nat_or = nat_or = OrMod(128)
1133         m.submodules.nbt_or = nbt_or = OrMod(128)
1134         m.submodules.nla_or = nla_or = OrMod(128)
1135         m.submodules.nlb_or = nlb_or = OrMod(128)
1136         for l, mod in [(nat_l, nat_or),
1137                              (nbt_l, nbt_or),
1138                              (nla_l, nla_or),
1139                              (nlb_l, nlb_or)]:
1140             for i in range(len(l)):
1141                 m.d.comb += mod.orin[i].eq(l[i])
1142             terms.append(mod.orout)
1143
1144         add_reduce = AddReduce(terms,
1145                                128,
1146                                self.register_levels,
1147                                expanded_part_pts,
1148                                self.part_ops)
1149
1150         out_part_ops = add_reduce.out_part_ops
1151         out_part_pts = add_reduce.levels[-1].o.reg_partition_points
1152
1153         m.submodules.add_reduce = add_reduce
1154         m.d.comb += self._intermediate_output.eq(add_reduce.output)
1155         # create _output_64
1156         m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1157         m.d.comb += io64.intermed.eq(self._intermediate_output)
1158         for i in range(8):
1159             m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1160
1161         # create _output_32
1162         m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1163         m.d.comb += io32.intermed.eq(self._intermediate_output)
1164         for i in range(8):
1165             m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1166
1167         # create _output_16
1168         m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1169         m.d.comb += io16.intermed.eq(self._intermediate_output)
1170         for i in range(8):
1171             m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1172
1173         # create _output_8
1174         m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1175         m.d.comb += io8.intermed.eq(self._intermediate_output)
1176         for i in range(8):
1177             m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1178
1179         m.submodules.p_8 = p_8 = Parts(8, eps, len(part_8.parts))
1180         m.submodules.p_16 = p_16 = Parts(8, eps, len(part_16.parts))
1181         m.submodules.p_32 = p_32 = Parts(8, eps, len(part_32.parts))
1182         m.submodules.p_64 = p_64 = Parts(8, eps, len(part_64.parts))
1183
1184         m.d.comb += p_8.epps.eq(out_part_pts)
1185         m.d.comb += p_16.epps.eq(out_part_pts)
1186         m.d.comb += p_32.epps.eq(out_part_pts)
1187         m.d.comb += p_64.epps.eq(out_part_pts)
1188
1189         # final output
1190         m.submodules.finalout = finalout = FinalOut(64)
1191         for i in range(len(part_8.parts)):
1192             m.d.comb += finalout.d8[i].eq(p_8.parts[i])
1193         for i in range(len(part_16.parts)):
1194             m.d.comb += finalout.d16[i].eq(p_16.parts[i])
1195         for i in range(len(part_32.parts)):
1196             m.d.comb += finalout.d32[i].eq(p_32.parts[i])
1197         m.d.comb += finalout.i8.eq(io8.output)
1198         m.d.comb += finalout.i16.eq(io16.output)
1199         m.d.comb += finalout.i32.eq(io32.output)
1200         m.d.comb += finalout.i64.eq(io64.output)
1201         m.d.comb += self.output.eq(finalout.out)
1202
1203         return m
1204
1205
1206 if __name__ == "__main__":
1207     m = Mul8_16_32_64()
1208     main(m, ports=[m.a,
1209                    m.b,
1210                    m._intermediate_output,
1211                    m.output,
1212                    *m.part_ops,
1213                    *m.part_pts.values()])