src/ieee754/part_mul_add/multiply.py

   1 # SPDX-License-Identifier: LGPL-2.1-or-later
   2 # See Notices.txt for copyright information
   3 """Integer Multiplication."""
   4
   5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
   6 from nmigen.hdl.ast import Assign
   7 from abc import ABCMeta, abstractmethod
   8 from nmigen.cli import main
   9 from functools import reduce
  10 from operator import or_
  11
  12
  13 class PartitionPoints(dict):
  14     """Partition points and corresponding ``Value``s.
  15
  16     The points at where an ALU is partitioned along with ``Value``s that
  17     specify if the corresponding partition points are enabled.
  18
  19     For example: ``{1: True, 5: True, 10: True}`` with
  20     ``width == 16`` specifies that the ALU is split into 4 sections:
  21     * bits 0 <= ``i`` < 1
  22     * bits 1 <= ``i`` < 5
  23     * bits 5 <= ``i`` < 10
  24     * bits 10 <= ``i`` < 16
  25
  26     If the partition_points were instead ``{1: True, 5: a, 10: True}``
  27     where ``a`` is a 1-bit ``Signal``:
  28     * If ``a`` is asserted:
  29         * bits 0 <= ``i`` < 1
  30         * bits 1 <= ``i`` < 5
  31         * bits 5 <= ``i`` < 10
  32         * bits 10 <= ``i`` < 16
  33     * Otherwise
  34         * bits 0 <= ``i`` < 1
  35         * bits 1 <= ``i`` < 10
  36         * bits 10 <= ``i`` < 16
  37     """
  38
  39     def __init__(self, partition_points=None):
  40         """Create a new ``PartitionPoints``.
  41
  42         :param partition_points: the input partition points to values mapping.
  43         """
  44         super().__init__()
  45         if partition_points is not None:
  46             for point, enabled in partition_points.items():
  47                 if not isinstance(point, int):
  48                     raise TypeError("point must be a non-negative integer")
  49                 if point < 0:
  50                     raise ValueError("point must be a non-negative integer")
  51                 self[point] = Value.wrap(enabled)
  52
  53     def like(self, name=None, src_loc_at=0, mul=1):
  54         """Create a new ``PartitionPoints`` with ``Signal``s for all values.
  55
  56         :param name: the base name for the new ``Signal``s.
  57         :param mul: a multiplication factor on the indices
  58         """
  59         if name is None:
  60             name = Signal(src_loc_at=1+src_loc_at).name  # get variable name
  61         retval = PartitionPoints()
  62         for point, enabled in self.items():
  63             point *= mul
  64             retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
  65         return retval
  66
  67     def eq(self, rhs):
  68         """Assign ``PartitionPoints`` using ``Signal.eq``."""
  69         if set(self.keys()) != set(rhs.keys()):
  70             raise ValueError("incompatible point set")
  71         for point, enabled in self.items():
  72             yield enabled.eq(rhs[point])
  73
  74     def as_mask(self, width):
  75         """Create a bit-mask from `self`.
  76
  77         Each bit in the returned mask is clear only if the partition point at
  78         the same bit-index is enabled.
  79
  80         :param width: the bit width of the resulting mask
  81         """
  82         bits = []
  83         for i in range(width):
  84             if i in self:
  85                 bits.append(~self[i])
  86             else:
  87                 bits.append(True)
  88         return Cat(*bits)
  89
  90     def get_max_partition_count(self, width):
  91         """Get the maximum number of partitions.
  92
  93         Gets the number of partitions when all partition points are enabled.
  94         """
  95         retval = 1
  96         for point in self.keys():
  97             if point < width:
  98                 retval += 1
  99         return retval
 100
 101     def fits_in_width(self, width):
 102         """Check if all partition points are smaller than `width`."""
 103         for point in self.keys():
 104             if point >= width:
 105                 return False
 106         return True
 107
 108     def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
 109         if index == -1 or index == 7:
 110             return C(True, 1)
 111         assert index >= 0 and index < 8
 112         return self[(index * 8 + 8)*mfactor]
 113
 114
 115 class FullAdder(Elaboratable):
 116     """Full Adder.
 117
 118     :attribute in0: the first input
 119     :attribute in1: the second input
 120     :attribute in2: the third input
 121     :attribute sum: the sum output
 122     :attribute carry: the carry output
 123
 124     Rather than do individual full adders (and have an array of them,
 125     which would be very slow to simulate), this module can specify the
 126     bit width of the inputs and outputs: in effect it performs multiple
 127     Full 3-2 Add operations "in parallel".
 128     """
 129
 130     def __init__(self, width):
 131         """Create a ``FullAdder``.
 132
 133         :param width: the bit width of the input and output
 134         """
 135         self.in0 = Signal(width, reset_less=True)
 136         self.in1 = Signal(width, reset_less=True)
 137         self.in2 = Signal(width, reset_less=True)
 138         self.sum = Signal(width, reset_less=True)
 139         self.carry = Signal(width, reset_less=True)
 140
 141     def elaborate(self, platform):
 142         """Elaborate this module."""
 143         m = Module()
 144         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 145         m.d.comb += self.carry.eq((self.in0 & self.in1)
 146                                   | (self.in1 & self.in2)
 147                                   | (self.in2 & self.in0))
 148         return m
 149
 150
 151 class MaskedFullAdder(Elaboratable):
 152     """Masked Full Adder.
 153
 154     :attribute mask: the carry partition mask
 155     :attribute in0: the first input
 156     :attribute in1: the second input
 157     :attribute in2: the third input
 158     :attribute sum: the sum output
 159     :attribute mcarry: the masked carry output
 160
 161     FullAdders are always used with a "mask" on the output.  To keep
 162     the graphviz "clean", this class performs the masking here rather
 163     than inside a large for-loop.
 164
 165     See the following discussion as to why this is no longer derived
 166     from FullAdder.  Each carry is shifted here *before* being ANDed
 167     with the mask, so that an AOI cell may be used (which is more
 168     gate-efficient)
 169     https://en.wikipedia.org/wiki/AND-OR-Invert
 170     https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
 171     """
 172
 173     def __init__(self, width):
 174         """Create a ``MaskedFullAdder``.
 175
 176         :param width: the bit width of the input and output
 177         """
 178         self.width = width
 179         self.mask = Signal(width, reset_less=True)
 180         self.mcarry = Signal(width, reset_less=True)
 181         self.in0 = Signal(width, reset_less=True)
 182         self.in1 = Signal(width, reset_less=True)
 183         self.in2 = Signal(width, reset_less=True)
 184         self.sum = Signal(width, reset_less=True)
 185
 186     def elaborate(self, platform):
 187         """Elaborate this module."""
 188         m = Module()
 189         s1 = Signal(self.width, reset_less=True)
 190         s2 = Signal(self.width, reset_less=True)
 191         s3 = Signal(self.width, reset_less=True)
 192         c1 = Signal(self.width, reset_less=True)
 193         c2 = Signal(self.width, reset_less=True)
 194         c3 = Signal(self.width, reset_less=True)
 195         m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
 196         m.d.comb += s1.eq(Cat(0, self.in0))
 197         m.d.comb += s2.eq(Cat(0, self.in1))
 198         m.d.comb += s3.eq(Cat(0, self.in2))
 199         m.d.comb += c1.eq(s1 & s2 & self.mask)
 200         m.d.comb += c2.eq(s2 & s3 & self.mask)
 201         m.d.comb += c3.eq(s3 & s1 & self.mask)
 202         m.d.comb += self.mcarry.eq(c1 | c2 | c3)
 203         return m
 204
 205
 206 class PartitionedAdder(Elaboratable):
 207     """Partitioned Adder.
 208
 209     Performs the final add.  The partition points are included in the
 210     actual add (in one of the operands only), which causes a carry over
 211     to the next bit.  Then the final output *removes* the extra bits from
 212     the result.
 213
 214     partition: .... P... P... P... P... (32 bits)
 215     a        : .... .... .... .... .... (32 bits)
 216     b        : .... .... .... .... .... (32 bits)
 217     exp-a    : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
 218     exp-b    : ....0....0....0....0.... (32 bits plus 4 zeros)
 219     exp-o    : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
 220     o        : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
 221
 222     :attribute width: the bit width of the input and output. Read-only.
 223     :attribute a: the first input to the adder
 224     :attribute b: the second input to the adder
 225     :attribute output: the sum output
 226     :attribute partition_points: the input partition points. Modification not
 227         supported, except for by ``Signal.eq``.
 228     """
 229
 230     def __init__(self, width, partition_points):
 231         """Create a ``PartitionedAdder``.
 232
 233         :param width: the bit width of the input and output
 234         :param partition_points: the input partition points
 235         """
 236         self.width = width
 237         self.a = Signal(width)
 238         self.b = Signal(width)
 239         self.output = Signal(width)
 240         self.partition_points = PartitionPoints(partition_points)
 241         if not self.partition_points.fits_in_width(width):
 242             raise ValueError("partition_points doesn't fit in width")
 243         expanded_width = 0
 244         for i in range(self.width):
 245             if i in self.partition_points:
 246                 expanded_width += 1
 247             expanded_width += 1
 248         self._expanded_width = expanded_width
 249
 250     def elaborate(self, platform):
 251         """Elaborate this module."""
 252         m = Module()
 253         expanded_a = Signal(self._expanded_width, reset_less=True)
 254         expanded_b = Signal(self._expanded_width, reset_less=True)
 255         expanded_o = Signal(self._expanded_width, reset_less=True)
 256
 257         expanded_index = 0
 258         # store bits in a list, use Cat later.  graphviz is much cleaner
 259         al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
 260
 261         # partition points are "breaks" (extra zeros or 1s) in what would
 262         # otherwise be a massive long add.  when the "break" points are 0,
 263         # whatever is in it (in the output) is discarded.  however when
 264         # there is a "1", it causes a roll-over carry to the *next* bit.
 265         # we still ignore the "break" bit in the [intermediate] output,
 266         # however by that time we've got the effect that we wanted: the
 267         # carry has been carried *over* the break point.
 268
 269         for i in range(self.width):
 270             if i in self.partition_points:
 271                 # add extra bit set to 0 + 0 for enabled partition points
 272                 # and 1 + 0 for disabled partition points
 273                 ea.append(expanded_a[expanded_index])
 274                 al.append(~self.partition_points[i]) # add extra bit in a
 275                 eb.append(expanded_b[expanded_index])
 276                 bl.append(C(0)) # yes, add a zero
 277                 expanded_index += 1 # skip the extra point.  NOT in the output
 278             ea.append(expanded_a[expanded_index])
 279             eb.append(expanded_b[expanded_index])
 280             eo.append(expanded_o[expanded_index])
 281             al.append(self.a[i])
 282             bl.append(self.b[i])
 283             ol.append(self.output[i])
 284             expanded_index += 1
 285
 286         # combine above using Cat
 287         m.d.comb += Cat(*ea).eq(Cat(*al))
 288         m.d.comb += Cat(*eb).eq(Cat(*bl))
 289         m.d.comb += Cat(*ol).eq(Cat(*eo))
 290
 291         # use only one addition to take advantage of look-ahead carry and
 292         # special hardware on FPGAs
 293         m.d.comb += expanded_o.eq(expanded_a + expanded_b)
 294         return m
 295
 296
# Number of terms consumed by one full adder; used as the group size when
# AddReduceSingle splits its inputs into full-adder groups.
FULL_ADDER_INPUT_COUNT = 3
 298
 299 class AddReduceData:
 300
 301     def __init__(self, ppoints, n_inputs, output_width, n_parts):
 302         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 303                           for i in range(n_parts)]
 304         self.inputs = [Signal(output_width, name=f"inputs[{i}]", reset_less=True)
 305             for i in range(n_inputs)]
 306         self.reg_partition_points = ppoints.like()
 307
 308     def eq_from(self, reg_partition_points, inputs, part_ops):
 309         return [self.reg_partition_points.eq(reg_partition_points)] + \
 310                [self.inputs[i].eq(inputs[i])
 311                                      for i in range(len(self.inputs))] + \
 312                [self.part_ops[i].eq(part_ops[i])
 313                                      for i in range(len(self.part_ops))]
 314
 315     def eq(self, rhs):
 316         return self.eq_from(rhs.reg_partition_points, rhs.inputs, rhs.part_ops)
 317
 318
 319 class FinalReduceData:
 320
 321     def __init__(self, ppoints, output_width, n_parts):
 322         self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
 323                           for i in range(n_parts)]
 324         self.output = Signal(output_width, reset_less=True)
 325         self.reg_partition_points = ppoints.like()
 326
 327     def eq_from(self, reg_partition_points, output, part_ops):
 328         return [self.reg_partition_points.eq(reg_partition_points)] + \
 329                [self.output.eq(output)] + \
 330                [self.part_ops[i].eq(part_ops[i])
 331                                      for i in range(len(self.part_ops))]
 332
 333     def eq(self, rhs):
 334         return self.eq_from(rhs.reg_partition_points, rhs.output, rhs.part_ops)
 335
 336
 337 class FinalAdd(Elaboratable):
 338     """ Final stage of add reduce
 339     """
 340
 341     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 342                        partition_points):
 343         self.i = AddReduceData(partition_points, n_inputs,
 344                                output_width, n_parts)
 345         self.o = FinalReduceData(partition_points, output_width, n_parts)
 346         self.output_width = output_width
 347         self.n_inputs = n_inputs
 348         self.n_parts = n_parts
 349         self.register_levels = list(register_levels)
 350         self.partition_points = PartitionPoints(partition_points)
 351         if not self.partition_points.fits_in_width(output_width):
 352             raise ValueError("partition_points doesn't fit in output_width")
 353
 354     def elaborate(self, platform):
 355         """Elaborate this module."""
 356         m = Module()
 357
 358         output_width = self.output_width
 359         output = Signal(output_width, reset_less=True)
 360         if self.n_inputs == 0:
 361             # use 0 as the default output value
 362             m.d.comb += output.eq(0)
 363         elif self.n_inputs == 1:
 364             # handle single input
 365             m.d.comb += output.eq(self.i.inputs[0])
 366         else:
 367             # base case for adding 2 inputs
 368             assert self.n_inputs == 2
 369             adder = PartitionedAdder(output_width, self.i.reg_partition_points)
 370             m.submodules.final_adder = adder
 371             m.d.comb += adder.a.eq(self.i.inputs[0])
 372             m.d.comb += adder.b.eq(self.i.inputs[1])
 373             m.d.comb += output.eq(adder.output)
 374
 375         # create output
 376         m.d.comb += self.o.eq_from(self.i.reg_partition_points, output,
 377                                    self.i.part_ops)
 378
 379         return m
 380
 381
 382 class AddReduceSingle(Elaboratable):
 383     """Add list of numbers together.
 384
 385     :attribute inputs: input ``Signal``s to be summed. Modification not
 386         supported, except for by ``Signal.eq``.
 387     :attribute register_levels: List of nesting levels that should have
 388         pipeline registers.
 389     :attribute output: output sum.
 390     :attribute partition_points: the input partition points. Modification not
 391         supported, except for by ``Signal.eq``.
 392     """
 393
 394     def __init__(self, n_inputs, output_width, n_parts, register_levels,
 395                        partition_points):
 396         """Create an ``AddReduce``.
 397
 398         :param inputs: input ``Signal``s to be summed.
 399         :param output_width: bit-width of ``output``.
 400         :param register_levels: List of nesting levels that should have
 401             pipeline registers.
 402         :param partition_points: the input partition points.
 403         """
 404         self.n_inputs = n_inputs
 405         self.n_parts = n_parts
 406         self.output_width = output_width
 407         self.i = AddReduceData(partition_points, n_inputs,
 408                                output_width, n_parts)
 409         self.register_levels = list(register_levels)
 410         self.partition_points = PartitionPoints(partition_points)
 411         if not self.partition_points.fits_in_width(output_width):
 412             raise ValueError("partition_points doesn't fit in output_width")
 413
 414         max_level = AddReduceSingle.get_max_level(n_inputs)
 415         for level in self.register_levels:
 416             if level > max_level:
 417                 raise ValueError(
 418                     "not enough adder levels for specified register levels")
 419
 420         # this is annoying.  we have to create the modules (and terms)
 421         # because we need to know what they are (in order to set up the
 422         # interconnects back in AddReduce), but cannot do the m.d.comb +=
 423         # etc because this is not in elaboratable.
 424         self.groups = AddReduceSingle.full_adder_groups(n_inputs)
 425         self._intermediate_terms = []
 426         self.adders = []
 427         if len(self.groups) != 0:
 428             self.create_next_terms()
 429
 430         self.o = AddReduceData(partition_points, len(self._intermediate_terms),
 431                                output_width, n_parts)
 432
 433     @staticmethod
 434     def get_max_level(input_count):
 435         """Get the maximum level.
 436
 437         All ``register_levels`` must be less than or equal to the maximum
 438         level.
 439         """
 440         retval = 0
 441         while True:
 442             groups = AddReduceSingle.full_adder_groups(input_count)
 443             if len(groups) == 0:
 444                 return retval
 445             input_count %= FULL_ADDER_INPUT_COUNT
 446             input_count += 2 * len(groups)
 447             retval += 1
 448
 449     @staticmethod
 450     def full_adder_groups(input_count):
 451         """Get ``inputs`` indices for which a full adder should be built."""
 452         return range(0,
 453                      input_count - FULL_ADDER_INPUT_COUNT + 1,
 454                      FULL_ADDER_INPUT_COUNT)
 455
 456     def elaborate(self, platform):
 457         """Elaborate this module."""
 458         m = Module()
 459
 460         # copy the intermediate terms to the output
 461         for i, value in enumerate(self._intermediate_terms):
 462             m.d.comb += self.o.inputs[i].eq(value)
 463
 464         # copy reg part points and part ops to output
 465         m.d.comb += self.o.reg_partition_points.eq(self.i.reg_partition_points)
 466         m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
 467                                      for i in range(len(self.i.part_ops))]
 468
 469         # set up the partition mask (for the adders)
 470         part_mask = Signal(self.output_width, reset_less=True)
 471
 472         mask = self.i.reg_partition_points.as_mask(self.output_width)
 473         m.d.comb += part_mask.eq(mask)
 474
 475         # add and link the intermediate term modules
 476         for i, (iidx, adder_i) in enumerate(self.adders):
 477             setattr(m.submodules, f"adder_{i}", adder_i)
 478
 479             m.d.comb += adder_i.in0.eq(self.i.inputs[iidx])
 480             m.d.comb += adder_i.in1.eq(self.i.inputs[iidx + 1])
 481             m.d.comb += adder_i.in2.eq(self.i.inputs[iidx + 2])
 482             m.d.comb += adder_i.mask.eq(part_mask)
 483
 484         return m
 485
 486     def create_next_terms(self):
 487
 488         _intermediate_terms = []
 489
 490         def add_intermediate_term(value):
 491             _intermediate_terms.append(value)
 492
 493         # create full adders for this recursive level.
 494         # this shrinks N terms to 2 * (N // 3) plus the remainder
 495         for i in self.groups:
 496             adder_i = MaskedFullAdder(self.output_width)
 497             self.adders.append((i, adder_i))
 498             # add both the sum and the masked-carry to the next level.
 499             # 3 inputs have now been reduced to 2...
 500             add_intermediate_term(adder_i.sum)
 501             add_intermediate_term(adder_i.mcarry)
 502         # handle the remaining inputs.
 503         if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
 504             add_intermediate_term(self.i.inputs[-1])
 505         elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
 506             # Just pass the terms to the next layer, since we wouldn't gain
 507             # anything by using a half adder since there would still be 2 terms
 508             # and just passing the terms to the next layer saves gates.
 509             add_intermediate_term(self.i.inputs[-2])
 510             add_intermediate_term(self.i.inputs[-1])
 511         else:
 512             assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
 513
 514         self._intermediate_terms = _intermediate_terms
 515
 516
 517 class AddReduce(Elaboratable):
 518     """Recursively Add list of numbers together.
 519
 520     :attribute inputs: input ``Signal``s to be summed. Modification not
 521         supported, except for by ``Signal.eq``.
 522     :attribute register_levels: List of nesting levels that should have
 523         pipeline registers.
 524     :attribute output: output sum.
 525     :attribute partition_points: the input partition points. Modification not
 526         supported, except for by ``Signal.eq``.
 527     """
 528
 529     def __init__(self, inputs, output_width, register_levels, partition_points,
 530                        part_ops):
 531         """Create an ``AddReduce``.
 532
 533         :param inputs: input ``Signal``s to be summed.
 534         :param output_width: bit-width of ``output``.
 535         :param register_levels: List of nesting levels that should have
 536             pipeline registers.
 537         :param partition_points: the input partition points.
 538         """
 539         self.inputs = inputs
 540         self.part_ops = part_ops
 541         n_parts = len(part_ops)
 542         self.o = FinalReduceData(partition_points, output_width, n_parts)
 543         self.output_width = output_width
 544         self.register_levels = register_levels
 545         self.partition_points = partition_points
 546
 547         self.create_levels()
 548
 549     @staticmethod
 550     def get_max_level(input_count):
 551         return AddReduceSingle.get_max_level(input_count)
 552
 553     @staticmethod
 554     def next_register_levels(register_levels):
 555         """``Iterable`` of ``register_levels`` for next recursive level."""
 556         for level in register_levels:
 557             if level > 0:
 558                 yield level - 1
 559
 560     def create_levels(self):
 561         """creates reduction levels"""
 562
 563         mods = []
 564         next_levels = self.register_levels
 565         partition_points = self.partition_points
 566         part_ops = self.part_ops
 567         n_parts = len(part_ops)
 568         inputs = self.inputs
 569         ilen = len(inputs)
 570         while True:
 571             next_level = AddReduceSingle(ilen, self.output_width, n_parts,
 572                                          next_levels, partition_points)
 573             mods.append(next_level)
 574             next_levels = list(AddReduce.next_register_levels(next_levels))
 575             partition_points = next_level.i.reg_partition_points
 576             inputs = next_level.o.inputs
 577             ilen = len(inputs)
 578             part_ops = next_level.i.part_ops
 579             groups = AddReduceSingle.full_adder_groups(len(inputs))
 580             if len(groups) == 0:
 581                 break
 582
 583         next_level = FinalAdd(ilen, self.output_width, n_parts,
 584                               next_levels, partition_points)
 585         mods.append(next_level)
 586
 587         self.levels = mods
 588
 589     def elaborate(self, platform):
 590         """Elaborate this module."""
 591         m = Module()
 592
 593         for i, next_level in enumerate(self.levels):
 594             setattr(m.submodules, "next_level%d" % i, next_level)
 595
 596         partition_points = self.partition_points
 597         inputs = self.inputs
 598         part_ops = self.part_ops
 599         n_parts = len(part_ops)
 600         n_inputs = len(inputs)
 601         output_width = self.output_width
 602         i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
 603         m.d.comb += i.eq_from(partition_points, inputs, part_ops)
 604         for idx in range(len(self.levels)):
 605             mcur = self.levels[idx]
 606             if 0 in mcur.register_levels:
 607                 m.d.sync += mcur.i.eq(i)
 608             else:
 609                 m.d.comb += mcur.i.eq(i)
 610             i = mcur.o # for next loop
 611
 612         # output comes from last module
 613         m.d.comb += self.o.eq(i)
 614
 615         return m
 616
 617
# Multiply operation codes.  NOTE(review): these are presumably the values
# carried by the 2-bit ``part_ops`` signals -- confirm against the code that
# consumes them (not visible in this part of the file).
OP_MUL_LOW = 0
OP_MUL_SIGNED_HIGH = 1
OP_MUL_SIGNED_UNSIGNED_HIGH = 2  # a is signed, b is unsigned
OP_MUL_UNSIGNED_HIGH = 3
 622
 623
 624 def get_term(value, shift=0, enabled=None):
 625     if enabled is not None:
 626         value = Mux(enabled, value, 0)
 627     if shift > 0:
 628         value = Cat(Repl(C(0, 1), shift), value)
 629     else:
 630         assert shift == 0
 631     return value
 632
 633
 634 class ProductTerm(Elaboratable):
 635     """ this class creates a single product term (a[..]*b[..]).
 636         it has a design flaw in that is the *output* that is selected,
 637         where the multiplication(s) are combinatorially generated
 638         all the time.
 639     """
 640
 641     def __init__(self, width, twidth, pbwid, a_index, b_index):
 642         self.a_index = a_index
 643         self.b_index = b_index
 644         shift = 8 * (self.a_index + self.b_index)
 645         self.pwidth = width
 646         self.twidth = twidth
 647         self.width = width*2
 648         self.shift = shift
 649
 650         self.ti = Signal(self.width, reset_less=True)
 651         self.term = Signal(twidth, reset_less=True)
 652         self.a = Signal(twidth//2, reset_less=True)
 653         self.b = Signal(twidth//2, reset_less=True)
 654         self.pb_en = Signal(pbwid, reset_less=True)
 655
 656         self.tl = tl = []
 657         min_index = min(self.a_index, self.b_index)
 658         max_index = max(self.a_index, self.b_index)
 659         for i in range(min_index, max_index):
 660             tl.append(self.pb_en[i])
 661         name = "te_%d_%d" % (self.a_index, self.b_index)
 662         if len(tl) > 0:
 663             term_enabled = Signal(name=name, reset_less=True)
 664         else:
 665             term_enabled = None
 666         self.enabled = term_enabled
 667         self.term.name = "term_%d_%d" % (a_index, b_index) # rename
 668
 669     def elaborate(self, platform):
 670
 671         m = Module()
 672         if self.enabled is not None:
 673             m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
 674
 675         bsa = Signal(self.width, reset_less=True)
 676         bsb = Signal(self.width, reset_less=True)
 677         a_index, b_index = self.a_index, self.b_index
 678         pwidth = self.pwidth
 679         m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
 680         m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
 681         m.d.comb += self.ti.eq(bsa * bsb)
 682         m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
 683         """
 684         #TODO: sort out width issues, get inputs a/b switched on/off.
 685         #data going into Muxes is 1/2 the required width
 686
 687         pwidth = self.pwidth
 688         width = self.width
 689         bsa = Signal(self.twidth//2, reset_less=True)
 690         bsb = Signal(self.twidth//2, reset_less=True)
 691         asel = Signal(width, reset_less=True)
 692         bsel = Signal(width, reset_less=True)
 693         a_index, b_index = self.a_index, self.b_index
 694         m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
 695         m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
 696         m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
 697         m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
 698         m.d.comb += self.ti.eq(bsa * bsb)
 699         m.d.comb += self.term.eq(self.ti)
 700         """
 701
 702         return m
 703
 704
 705 class ProductTerms(Elaboratable):
 706     """ creates a bank of product terms.  also performs the actual bit-selection
 707         this class is to be wrapped with a for-loop on the "a" operand.
 708         it creates a second-level for-loop on the "b" operand.
 709     """
 710     def __init__(self, width, twidth, pbwid, a_index, blen):
 711         self.a_index = a_index
 712         self.blen = blen
 713         self.pwidth = width
 714         self.twidth = twidth
 715         self.pbwid = pbwid
 716         self.a = Signal(twidth//2, reset_less=True)
 717         self.b = Signal(twidth//2, reset_less=True)
 718         self.pb_en = Signal(pbwid, reset_less=True)
 719         self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
 720                             for i in range(blen)]
 721
 722     def elaborate(self, platform):
 723
 724         m = Module()
 725
 726         for b_index in range(self.blen):
 727             t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
 728                             self.a_index, b_index)
 729             setattr(m.submodules, "term_%d" % b_index, t)
 730
 731             m.d.comb += t.a.eq(self.a)
 732             m.d.comb += t.b.eq(self.b)
 733             m.d.comb += t.pb_en.eq(self.pb_en)
 734
 735             m.d.comb += self.terms[b_index].eq(t.term)
 736
 737         return m
 738
 739
 740 class LSBNegTerm(Elaboratable):
 741
 742     def __init__(self, bit_width):
 743         self.bit_width = bit_width
 744         self.part = Signal(reset_less=True)
 745         self.signed = Signal(reset_less=True)
 746         self.op = Signal(bit_width, reset_less=True)
 747         self.msb = Signal(reset_less=True)
 748         self.nt = Signal(bit_width*2, reset_less=True)
 749         self.nl = Signal(bit_width*2, reset_less=True)
 750
 751     def elaborate(self, platform):
 752         m = Module()
 753         comb = m.d.comb
 754         bit_wid = self.bit_width
 755         ext = Repl(0, bit_wid) # extend output to HI part
 756
 757         # determine sign of each incoming number *in this partition*
 758         enabled = Signal(reset_less=True)
 759         m.d.comb += enabled.eq(self.part & self.msb & self.signed)
 760
 761         # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
 762         # negation operation is split into a bitwise not and a +1.
 763         # likewise for 16, 32, and 64-bit values.
 764
 765         # width-extended 1s complement if a is signed, otherwise zero
 766         comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
 767
 768         # add 1 if signed, otherwise add zero
 769         comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
 770
 771         return m
 772
 773
 774 class Parts(Elaboratable):
 775
 776     def __init__(self, pbwid, epps, n_parts):
 777         self.pbwid = pbwid
 778         # inputs
 779         self.epps = PartitionPoints.like(epps, name="epps") # expanded points
 780         # outputs
 781         self.parts = [Signal(name=f"part_{i}", reset_less=True)
 782                       for i in range(n_parts)]
 783
 784     def elaborate(self, platform):
 785         m = Module()
 786
 787         epps, parts = self.epps, self.parts
 788         # collect part-bytes (double factor because the input is extended)
 789         pbs = Signal(self.pbwid, reset_less=True)
 790         tl = []
 791         for i in range(self.pbwid):
 792             pb = Signal(name="pb%d" % i, reset_less=True)
 793             m.d.comb += pb.eq(epps.part_byte(i, mfactor=2)) # double
 794             tl.append(pb)
 795         m.d.comb += pbs.eq(Cat(*tl))
 796
 797         # negated-temporary copy of partition bits
 798         npbs = Signal.like(pbs, reset_less=True)
 799         m.d.comb += npbs.eq(~pbs)
 800         byte_count = 8 // len(parts)
 801         for i in range(len(parts)):
 802             pbl = []
 803             pbl.append(npbs[i * byte_count - 1])
 804             for j in range(i * byte_count, (i + 1) * byte_count - 1):
 805                 pbl.append(pbs[j])
 806             pbl.append(npbs[(i + 1) * byte_count - 1])
 807             value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
 808             m.d.comb += value.eq(Cat(*pbl))
 809             m.d.comb += parts[i].eq(~(value).bool())
 810
 811         return m
 812
 813
 814 class Part(Elaboratable):
 815     """ a key class which, depending on the partitioning, will determine
 816         what action to take when parts of the output are signed or unsigned.
 817
 818         this requires 2 pieces of data *per operand, per partition*:
 819         whether the MSB is HI/LO (per partition!), and whether a signed
 820         or unsigned operation has been *requested*.
 821
 822         once that is determined, signed is basically carried out
 823         by splitting 2's complement into 1's complement plus one.
 824         1's complement is just a bit-inversion.
 825
 826         the extra terms - as separate terms - are then thrown at the
 827         AddReduce alongside the multiplication part-results.
 828     """
 829     def __init__(self, epps, width, n_parts, n_levels, pbwid):
 830
 831         self.pbwid = pbwid
 832         self.epps = epps
 833
 834         # inputs
 835         self.a = Signal(64)
 836         self.b = Signal(64)
 837         self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
 838         self.b_signed = [Signal(name=f"_b_signed_{i}") for i in range(8)]
 839         self.pbs = Signal(pbwid, reset_less=True)
 840
 841         # outputs
 842         self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
 843
 844         self.not_a_term = Signal(width)
 845         self.neg_lsb_a_term = Signal(width)
 846         self.not_b_term = Signal(width)
 847         self.neg_lsb_b_term = Signal(width)
 848
 849     def elaborate(self, platform):
 850         m = Module()
 851
 852         pbs, parts = self.pbs, self.parts
 853         epps = self.epps
 854         m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
 855         m.d.comb += p.epps.eq(epps)
 856         parts = p.parts
 857
 858         byte_count = 8 // len(parts)
 859
 860         not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
 861                 self.not_a_term, self.neg_lsb_a_term,
 862                 self.not_b_term, self.neg_lsb_b_term)
 863
 864         byte_width = 8 // len(parts) # byte width
 865         bit_wid = 8 * byte_width     # bit width
 866         nat, nbt, nla, nlb = [], [], [], []
 867         for i in range(len(parts)):
 868             # work out bit-inverted and +1 term for a.
 869             pa = LSBNegTerm(bit_wid)
 870             setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
 871             m.d.comb += pa.part.eq(parts[i])
 872             m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
 873             m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
 874             m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
 875             nat.append(pa.nt)
 876             nla.append(pa.nl)
 877
 878             # work out bit-inverted and +1 term for b
 879             pb = LSBNegTerm(bit_wid)
 880             setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
 881             m.d.comb += pb.part.eq(parts[i])
 882             m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
 883             m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
 884             m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
 885             nbt.append(pb.nt)
 886             nlb.append(pb.nl)
 887
 888         # concatenate together and return all 4 results.
 889         m.d.comb += [not_a_term.eq(Cat(*nat)),
 890                      not_b_term.eq(Cat(*nbt)),
 891                      neg_lsb_a_term.eq(Cat(*nla)),
 892                      neg_lsb_b_term.eq(Cat(*nlb)),
 893                     ]
 894
 895         return m
 896
 897
 898 class IntermediateOut(Elaboratable):
 899     """ selects the HI/LO part of the multiplication, for a given bit-width
 900         the output is also reconstructed in its SIMD (partition) lanes.
 901     """
 902     def __init__(self, width, out_wid, n_parts):
 903         self.width = width
 904         self.n_parts = n_parts
 905         self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
 906                                      for i in range(8)]
 907         self.intermed = Signal(out_wid, reset_less=True)
 908         self.output = Signal(out_wid//2, reset_less=True)
 909
 910     def elaborate(self, platform):
 911         m = Module()
 912
 913         ol = []
 914         w = self.width
 915         sel = w // 8
 916         for i in range(self.n_parts):
 917             op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
 918             m.d.comb += op.eq(
 919                 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
 920                     self.intermed.part(i * w*2, w),
 921                     self.intermed.part(i * w*2 + w, w)))
 922             ol.append(op)
 923         m.d.comb += self.output.eq(Cat(*ol))
 924
 925         return m
 926
 927
 928 class FinalOut(Elaboratable):
 929     """ selects the final output based on the partitioning.
 930
 931         each byte is selectable independently, i.e. it is possible
 932         that some partitions requested 8-bit computation whilst others
 933         requested 16 or 32 bit.
 934     """
 935     def __init__(self, out_wid):
 936         # inputs
 937         self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
 938         self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
 939         self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
 940
 941         self.i8 = Signal(out_wid, reset_less=True)
 942         self.i16 = Signal(out_wid, reset_less=True)
 943         self.i32 = Signal(out_wid, reset_less=True)
 944         self.i64 = Signal(out_wid, reset_less=True)
 945
 946         # output
 947         self.out = Signal(out_wid, reset_less=True)
 948
 949     def elaborate(self, platform):
 950         m = Module()
 951         ol = []
 952         for i in range(8):
 953             # select one of the outputs: d8 selects i8, d16 selects i16
 954             # d32 selects i32, and the default is i64.
 955             # d8 and d16 are ORed together in the first Mux
 956             # then the 2nd selects either i8 or i16.
 957             # if neither d8 nor d16 are set, d32 selects either i32 or i64.
 958             op = Signal(8, reset_less=True, name="op_%d" % i)
 959             m.d.comb += op.eq(
 960                 Mux(self.d8[i] | self.d16[i // 2],
 961                     Mux(self.d8[i], self.i8.part(i * 8, 8),
 962                                      self.i16.part(i * 8, 8)),
 963                     Mux(self.d32[i // 4], self.i32.part(i * 8, 8),
 964                                           self.i64.part(i * 8, 8))))
 965             ol.append(op)
 966         m.d.comb += self.out.eq(Cat(*ol))
 967         return m
 968
 969
 970 class OrMod(Elaboratable):
 971     """ ORs four values together in a hierarchical tree
 972     """
 973     def __init__(self, wid):
 974         self.wid = wid
 975         self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
 976                      for i in range(4)]
 977         self.orout = Signal(wid, reset_less=True)
 978
 979     def elaborate(self, platform):
 980         m = Module()
 981         or1 = Signal(self.wid, reset_less=True)
 982         or2 = Signal(self.wid, reset_less=True)
 983         m.d.comb += or1.eq(self.orin[0] | self.orin[1])
 984         m.d.comb += or2.eq(self.orin[2] | self.orin[3])
 985         m.d.comb += self.orout.eq(or1 | or2)
 986
 987         return m
 988
 989
 990 class Signs(Elaboratable):
 991     """ determines whether a or b are signed numbers
 992         based on the required operation type (OP_MUL_*)
 993     """
 994
 995     def __init__(self):
 996         self.part_ops = Signal(2, reset_less=True)
 997         self.a_signed = Signal(reset_less=True)
 998         self.b_signed = Signal(reset_less=True)
 999
1000     def elaborate(self, platform):
1001
1002         m = Module()
1003
1004         asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1005         bsig = (self.part_ops == OP_MUL_LOW) \
1006                     | (self.part_ops == OP_MUL_SIGNED_HIGH)
1007         m.d.comb += self.a_signed.eq(asig)
1008         m.d.comb += self.b_signed.eq(bsig)
1009
1010         return m
1011
1012
class Mul8_16_32_64(Elaboratable):
    """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.

    The 64-bit operands may be split into any combination of 8, 16, 32 and
    64-bit partitions on naturally-aligned boundaries, and the operation is
    chosen for each partition independently.

    :attribute part_pts: the input partition points. Has a partition point at
        multiples of 8 in 0 < i < 64. Each partition point's associated
        ``Value`` is a ``Signal``. Modification not supported, except for by
        ``Signal.eq``.
    :attribute part_ops: the operation for each byte. The operation for a
        particular partition is selected by assigning the selected operation
        code to each byte in the partition. The allowed operation codes are:

        :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
            RISC-V's `mul` instruction.
        :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
            instruction.
        :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
            where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
            `mulhsu` instruction.
        :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
            ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
            instruction.
    """

    def __init__(self, register_levels=()):
        """ register_levels: specifies the points in the cascade at which
            flip-flops are to be inserted.
        """

        # parameter(s)
        self.register_levels = list(register_levels)

        # inputs: a partition point at every byte boundary inside 64 bits,
        # one 2-bit operation per byte, and the two operands
        self.part_pts = PartitionPoints()
        for point in range(8, 64, 8):
            self.part_pts[point] = Signal(name=f"part_pts_{point}")
        self.part_ops = []
        for i in range(8):
            self.part_ops.append(Signal(2, name=f"part_ops_{i}"))
        self.a = Signal(64)
        self.b = Signal(64)

        # intermediates (needed for unit tests)
        self._intermediate_output = Signal(128)

        # output
        self.output = Signal(64)

    def elaborate(self, platform):
        """Build the multiplier: product terms and sign-correction terms are
        summed by ``AddReduce``, then the per-lane-size results are muxed
        into the final output according to the partitioning.
        """
        m = Module()
        comb = m.d.comb

        # collect part-bytes
        pbs = Signal(8, reset_less=True)
        byte_bits = []
        for i in range(8):
            pb = Signal(name="pb%d" % i, reset_less=True)
            comb += pb.eq(self.part_pts.part_byte(i))
            byte_bits.append(pb)
        comb += pbs.eq(Cat(*byte_bits))

        # create (doubled) PartitionPoints (output is double input width)
        eps = PartitionPoints()
        for point, enable in self.part_pts.items():
            doubled = point * 2
            ep = Signal(name=f"expanded_part_pts_{doubled}", reset_less=True)
            eps[doubled] = ep
            comb += ep.eq(enable)

        # per-byte decode of the operation into signed/unsigned flags
        signs = []
        for i in range(8):
            sign_mod = Signs()
            signs.append(sign_mod)
            setattr(m.submodules, "signs%d" % i, sign_mod)
            comb += sign_mod.part_ops.eq(self.part_ops[i])

        # one sign-correction generator per lane size (8/16/32/64 bits)
        n_levels = len(self.register_levels)+1
        part_mods = []
        for n_parts in (8, 4, 2, 1):
            part_mod = Part(eps, 128, n_parts, n_levels, 8)
            setattr(m.submodules, "part_%d" % (64 // n_parts), part_mod)
            part_mods.append(part_mod)
        part_8, part_16, part_32, part_64 = part_mods

        nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
        for part_mod in part_mods:
            comb += part_mod.a.eq(self.a)
            comb += part_mod.b.eq(self.b)
            for sign_mod, a_sgn, b_sgn in zip(signs, part_mod.a_signed,
                                              part_mod.b_signed):
                comb += a_sgn.eq(sign_mod.a_signed)
                comb += b_sgn.eq(sign_mod.b_signed)
            comb += part_mod.pbs.eq(pbs)
            nat_l.append(part_mod.not_a_term)
            nbt_l.append(part_mod.not_b_term)
            nla_l.append(part_mod.neg_lsb_a_term)
            nlb_l.append(part_mod.neg_lsb_b_term)

        # the byte-by-byte product terms: outer loop over "a" here, the
        # inner loop over "b" lives inside ProductTerms
        terms = []
        for a_index in range(8):
            pterms = ProductTerms(8, 128, 8, a_index, 8)
            setattr(m.submodules, "terms_%d" % a_index, pterms)

            comb += pterms.a.eq(self.a)
            comb += pterms.b.eq(self.b)
            comb += pterms.pb_en.eq(pbs)

            terms.extend(pterms.terms)

        # it's fine to bitwise-or data together since they are never enabled
        # at the same time
        or_mods = []
        for or_name in ("nat_or", "nbt_or", "nla_or", "nlb_or"):
            or_mod = OrMod(128)
            setattr(m.submodules, or_name, or_mod)
            or_mods.append(or_mod)
        for sources, or_mod in zip((nat_l, nbt_l, nla_l, nlb_l), or_mods):
            for or_in, source in zip(or_mod.orin, sources):
                comb += or_in.eq(source)
            terms.append(or_mod.orout)

        # sum everything up
        add_reduce = AddReduce(terms,
                               128,
                               self.register_levels,
                               eps,
                               self.part_ops)

        out_part_ops = add_reduce.o.part_ops
        out_part_pts = add_reduce.o.reg_partition_points

        m.submodules.add_reduce = add_reduce
        comb += self._intermediate_output.eq(add_reduce.o.output)

        # create _output_64, _output_32, _output_16 and _output_8: the
        # HI/LO selection of the 128-bit sum for each lane size
        lane_out = {}
        for lane_bits in (64, 32, 16, 8):
            io = IntermediateOut(lane_bits, 128, 64 // lane_bits)
            setattr(m.submodules, "io%d" % lane_bits, io)
            comb += io.intermed.eq(self._intermediate_output)
            for i in range(8):
                comb += io.part_ops[i].eq(out_part_ops[i])
            lane_out[lane_bits] = io

        # decode the partition points that come out of AddReduce into
        # per-lane "active" flags for each lane size
        lane_parts = {}
        for lane_bits, part_mod in ((8, part_8), (16, part_16),
                                    (32, part_32), (64, part_64)):
            decoder = Parts(8, eps, len(part_mod.parts))
            setattr(m.submodules, "p_%d" % lane_bits, decoder)
            lane_parts[lane_bits] = decoder
        for lane_bits in (8, 16, 32, 64):
            comb += lane_parts[lane_bits].epps.eq(out_part_pts)

        # final output
        m.submodules.finalout = finalout = FinalOut(64)
        for flag, part in zip(finalout.d8, lane_parts[8].parts):
            comb += flag.eq(part)
        for flag, part in zip(finalout.d16, lane_parts[16].parts):
            comb += flag.eq(part)
        for flag, part in zip(finalout.d32, lane_parts[32].parts):
            comb += flag.eq(part)
        comb += finalout.i8.eq(lane_out[8].output)
        comb += finalout.i16.eq(lane_out[16].output)
        comb += finalout.i32.eq(lane_out[32].output)
        comb += finalout.i64.eq(lane_out[64].output)
        comb += self.output.eq(finalout.out)

        return m
1195
1196
if __name__ == "__main__":
    # Build the multiplier with its default register_levels and hand it to
    # the nmigen command-line driver, exposing operands, results, the
    # per-byte operations and the partition points as ports.
    dut = Mul8_16_32_64()
    ports = [dut.a, dut.b, dut._intermediate_output, dut.output]
    ports.extend(dut.part_ops)
    ports.extend(dut.part_pts.values())
    main(dut, ports=ports)
1203                    *m.part_ops,
1204                    *m.part_pts.values()])