[ieee754fpu.git] / src / ieee754 / part_mul_add / multiply.py
1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
4
5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
6 from nmigen.hdl.ast import Assign
7 from abc import ABCMeta, abstractmethod
8 from nmigen.cli import main
9 from functools import reduce
10 from operator import or_
11
12
13 class PartitionPoints(dict):
14 """Partition points and corresponding ``Value``s.
15
16     The points at which an ALU is partitioned, along with ``Value``s that
17     specify whether the corresponding partition points are enabled.
18
19 For example: ``{1: True, 5: True, 10: True}`` with
20 ``width == 16`` specifies that the ALU is split into 4 sections:
21 * bits 0 <= ``i`` < 1
22 * bits 1 <= ``i`` < 5
23 * bits 5 <= ``i`` < 10
24 * bits 10 <= ``i`` < 16
25
26 If the partition_points were instead ``{1: True, 5: a, 10: True}``
27 where ``a`` is a 1-bit ``Signal``:
28 * If ``a`` is asserted:
29 * bits 0 <= ``i`` < 1
30 * bits 1 <= ``i`` < 5
31 * bits 5 <= ``i`` < 10
32 * bits 10 <= ``i`` < 16
33 * Otherwise
34 * bits 0 <= ``i`` < 1
35 * bits 1 <= ``i`` < 10
36 * bits 10 <= ``i`` < 16
37 """
38
39 def __init__(self, partition_points=None):
40 """Create a new ``PartitionPoints``.
41
42 :param partition_points: the input partition points to values mapping.
43 """
44 super().__init__()
45 if partition_points is not None:
46 for point, enabled in partition_points.items():
47 if not isinstance(point, int):
48 raise TypeError("point must be a non-negative integer")
49 if point < 0:
50 raise ValueError("point must be a non-negative integer")
51 self[point] = Value.wrap(enabled)
52
53 def like(self, name=None, src_loc_at=0, mul=1):
54 """Create a new ``PartitionPoints`` with ``Signal``s for all values.
55
56 :param name: the base name for the new ``Signal``s.
57 :param mul: a multiplication factor on the indices
58 """
59 if name is None:
60 name = Signal(src_loc_at=1+src_loc_at).name # get variable name
61 retval = PartitionPoints()
62 for point, enabled in self.items():
63 point *= mul
64 retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
65 return retval
66
67 def eq(self, rhs):
68 """Assign ``PartitionPoints`` using ``Signal.eq``."""
69 if set(self.keys()) != set(rhs.keys()):
70 raise ValueError("incompatible point set")
71 for point, enabled in self.items():
72 yield enabled.eq(rhs[point])
73
74 def as_mask(self, width):
75 """Create a bit-mask from `self`.
76
77 Each bit in the returned mask is clear only if the partition point at
78 the same bit-index is enabled.
79
80 :param width: the bit width of the resulting mask
81 """
82 bits = []
83 for i in range(width):
84 if i in self:
85 bits.append(~self[i])
86 else:
87 bits.append(True)
88 return Cat(*bits)
89
90 def get_max_partition_count(self, width):
91 """Get the maximum number of partitions.
92
93 Gets the number of partitions when all partition points are enabled.
94 """
95 retval = 1
96 for point in self.keys():
97 if point < width:
98 retval += 1
99 return retval
100
101 def fits_in_width(self, width):
102 """Check if all partition points are smaller than `width`."""
103 for point in self.keys():
104 if point >= width:
105 return False
106 return True
107
108 def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
109 if index == -1 or index == 7:
110 return C(True, 1)
111 assert index >= 0 and index < 8
112 return self[(index * 8 + 8)*mfactor]
113
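# Editor's usage sketch (not part of the module): building and querying a
# PartitionPoints mapping.  The name ``en_16`` below is illustrative only.
#
#     en_16 = Signal()                       # run-time break at bit 16
#     pp = PartitionPoints({8: True, 16: en_16, 24: True})
#     # with width == 32 this gives four 8-bit lanes when en_16 is set,
#     # or lanes of 8/16/8 bits when it is clear.
#     mask = pp.as_mask(32)                  # bits 8, 16, 24 carry ~enable
#     pp.get_max_partition_count(32)         # -> 4 (all points below 32)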
114
115 class FullAdder(Elaboratable):
116 """Full Adder.
117
118 :attribute in0: the first input
119 :attribute in1: the second input
120 :attribute in2: the third input
121 :attribute sum: the sum output
122 :attribute carry: the carry output
123
124 Rather than do individual full adders (and have an array of them,
125 which would be very slow to simulate), this module can specify the
126 bit width of the inputs and outputs: in effect it performs multiple
127 Full 3-2 Add operations "in parallel".
128 """
129
130 def __init__(self, width):
131 """Create a ``FullAdder``.
132
133 :param width: the bit width of the input and output
134 """
135 self.in0 = Signal(width)
136 self.in1 = Signal(width)
137 self.in2 = Signal(width)
138 self.sum = Signal(width)
139 self.carry = Signal(width)
140
141 def elaborate(self, platform):
142 """Elaborate this module."""
143 m = Module()
144 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
145 m.d.comb += self.carry.eq((self.in0 & self.in1)
146 | (self.in1 & self.in2)
147 | (self.in2 & self.in0))
148 return m
149
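# Worked example (editor's note): with width == 4 this is four independent
# 3:2 compressions performed at once.  For in0=0b1010, in1=0b0110, in2=0b0011:
#
#     sum   = in0 ^ in1 ^ in2            = 0b1111
#     carry = per-bit majority of inputs = 0b0010
#
# and in0 + in1 + in2 == sum + (carry << 1) == 19, as expected.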
150
151 class MaskedFullAdder(Elaboratable):
152 """Masked Full Adder.
153
154 :attribute mask: the carry partition mask
155 :attribute in0: the first input
156 :attribute in1: the second input
157 :attribute in2: the third input
158 :attribute sum: the sum output
159 :attribute mcarry: the masked carry output
160
161 FullAdders are always used with a "mask" on the output. To keep
162 the graphviz "clean", this class performs the masking here rather
163 than inside a large for-loop.
164
165 See the following discussion as to why this is no longer derived
166 from FullAdder. Each carry is shifted here *before* being ANDed
167 with the mask, so that an AOI cell may be used (which is more
168 gate-efficient)
169 https://en.wikipedia.org/wiki/AND-OR-Invert
170 https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
171 """
172
173 def __init__(self, width):
174 """Create a ``MaskedFullAdder``.
175
176 :param width: the bit width of the input and output
177 """
178 self.width = width
179 self.mask = Signal(width, reset_less=True)
180 self.mcarry = Signal(width, reset_less=True)
181 self.in0 = Signal(width, reset_less=True)
182 self.in1 = Signal(width, reset_less=True)
183 self.in2 = Signal(width, reset_less=True)
184 self.sum = Signal(width, reset_less=True)
185
186 def elaborate(self, platform):
187 """Elaborate this module."""
188 m = Module()
189 s1 = Signal(self.width, reset_less=True)
190 s2 = Signal(self.width, reset_less=True)
191 s3 = Signal(self.width, reset_less=True)
192 c1 = Signal(self.width, reset_less=True)
193 c2 = Signal(self.width, reset_less=True)
194 c3 = Signal(self.width, reset_less=True)
195 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
196 m.d.comb += s1.eq(Cat(0, self.in0))
197 m.d.comb += s2.eq(Cat(0, self.in1))
198 m.d.comb += s3.eq(Cat(0, self.in2))
199 m.d.comb += c1.eq(s1 & s2 & self.mask)
200 m.d.comb += c2.eq(s2 & s3 & self.mask)
201 m.d.comb += c3.eq(s3 & s1 & self.mask)
202 m.d.comb += self.mcarry.eq(c1 | c2 | c3)
203 return m
204
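# Editor's note (illustrative): because each input is prepended with a zero
# bit (Cat(0, ...)) before the AND/OR network, the result is equivalent to
#
#     mcarry == (FullAdder.carry << 1) & mask
#
# i.e. the carry already sits at its final bit position, and carries that
# would cross a partition boundary are knocked out by ``mask``, so the next
# level can add ``sum`` and ``mcarry`` directly with no further shifting.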
205
206 class PartitionedAdder(Elaboratable):
207 """Partitioned Adder.
208
209 Performs the final add. The partition points are included in the
210 actual add (in one of the operands only), which causes a carry over
211 to the next bit. Then the final output *removes* the extra bits from
212 the result.
213
214 partition: .... P... P... P... P... (32 bits)
215 a : .... .... .... .... .... (32 bits)
216 b : .... .... .... .... .... (32 bits)
217 exp-a : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
218 exp-b : ....0....0....0....0.... (32 bits plus 4 zeros)
219 exp-o : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
220 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
221
222 :attribute width: the bit width of the input and output. Read-only.
223 :attribute a: the first input to the adder
224 :attribute b: the second input to the adder
225 :attribute output: the sum output
226 :attribute partition_points: the input partition points. Modification not
227 supported, except for by ``Signal.eq``.
228 """
229
230 def __init__(self, width, partition_points):
231 """Create a ``PartitionedAdder``.
232
233 :param width: the bit width of the input and output
234 :param partition_points: the input partition points
235 """
236 self.width = width
237 self.a = Signal(width)
238 self.b = Signal(width)
239 self.output = Signal(width)
240 self.partition_points = PartitionPoints(partition_points)
241 if not self.partition_points.fits_in_width(width):
242 raise ValueError("partition_points doesn't fit in width")
243 expanded_width = 0
244 for i in range(self.width):
245 if i in self.partition_points:
246 expanded_width += 1
247 expanded_width += 1
248 self._expanded_width = expanded_width
249 # XXX these have to remain here due to some horrible nmigen
250 # simulation bugs involving sync. it is *not* necessary to
251 # have them here, they should (under normal circumstances)
252 # be moved into elaborate, as they are entirely local
253 self._expanded_a = Signal(expanded_width) # includes extra part-points
254 self._expanded_b = Signal(expanded_width) # likewise.
255 self._expanded_o = Signal(expanded_width) # likewise.
256
257 def elaborate(self, platform):
258 """Elaborate this module."""
259 m = Module()
260 expanded_index = 0
261 # store bits in a list, use Cat later. graphviz is much cleaner
262 al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
263
264 # partition points are "breaks" (extra zeros or 1s) in what would
265 # otherwise be a massive long add. when the "break" points are 0,
266 # whatever is in it (in the output) is discarded. however when
267 # there is a "1", it causes a roll-over carry to the *next* bit.
268 # we still ignore the "break" bit in the [intermediate] output,
269 # however by that time we've got the effect that we wanted: the
270 # carry has been carried *over* the break point.
271
272 for i in range(self.width):
273 if i in self.partition_points:
274 # add extra bit set to 0 + 0 for enabled partition points
275 # and 1 + 0 for disabled partition points
276 ea.append(self._expanded_a[expanded_index])
277 al.append(~self.partition_points[i]) # add extra bit in a
278 eb.append(self._expanded_b[expanded_index])
279 bl.append(C(0)) # yes, add a zero
280 expanded_index += 1 # skip the extra point. NOT in the output
281 ea.append(self._expanded_a[expanded_index])
282 eb.append(self._expanded_b[expanded_index])
283 eo.append(self._expanded_o[expanded_index])
284 al.append(self.a[i])
285 bl.append(self.b[i])
286 ol.append(self.output[i])
287 expanded_index += 1
288
289 # combine above using Cat
290 m.d.comb += Cat(*ea).eq(Cat(*al))
291 m.d.comb += Cat(*eb).eq(Cat(*bl))
292 m.d.comb += Cat(*ol).eq(Cat(*eo))
293
294 # use only one addition to take advantage of look-ahead carry and
295 # special hardware on FPGAs
296 m.d.comb += self._expanded_o.eq(
297 self._expanded_a + self._expanded_b)
298 return m
299
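# Worked example (editor's sketch): an 8-bit PartitionedAdder with one
# partition point at bit 4.  The names below are illustrative only.
#
#     part = Signal()
#     adder = PartitionedAdder(8, {4: part})
#     # with a == 0x1F and b == 0x01:
#     #   part == 0 (one 8-bit add)  -> output == 0x20
#     #   part == 1 (two 4-bit adds) -> output == 0x10
#     #     low lane:  0xF + 0x1 -> 0x0 (carry stopped at the break bit)
#     #     high lane: 0x1 + 0x0 -> 0x1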
300
301 FULL_ADDER_INPUT_COUNT = 3
302
303 class AddReduceData:
304
305     def __init__(self, ppoints, n_inputs, output_width, n_parts):
306 self.part_ops = [Signal(2, name=f"part_ops_{i}")
307 for i in range(n_parts)]
308 self.inputs = [Signal(output_width, name=f"inputs[{i}]")
309                        for i in range(n_inputs)]
310         self.reg_partition_points = ppoints.like()
311
312 def eq(self, rhs):
313 return [self.reg_partition_points.eq(rhs.reg_partition_points)] + \
314 [self.inputs[i].eq(rhs.inputs[i])
315 for i in range(len(self.inputs))] + \
316 [self.part_ops[i].eq(rhs.part_ops[i])
317 for i in range(len(self.part_ops))]
318
319
320 class FinalAdd(Elaboratable):
321     """ Final stage of add reduce: a PartitionedAdder over the last two terms
322 """
323
324 def __init__(self, n_inputs, output_width, n_parts, register_levels,
325 partition_points):
326 self.n_inputs = n_inputs
327 self.n_parts = n_parts
328 self.out_part_ops = [Signal(2, name=f"out_part_ops_{i}")
329 for i in range(n_parts)]
330 self._resized_inputs = [
331 Signal(output_width, name=f"resized_inputs[{i}]")
332 for i in range(n_inputs)]
333 self.register_levels = list(register_levels)
334 self.output = Signal(output_width)
335 self.partition_points = PartitionPoints(partition_points)
336 if not self.partition_points.fits_in_width(output_width):
337 raise ValueError("partition_points doesn't fit in output_width")
338 self._reg_partition_points = self.partition_points.like()
339 self.intermediate_terms = []
340
341 def elaborate(self, platform):
342 """Elaborate this module."""
343 m = Module()
344
345 if self.n_inputs == 0:
346 # use 0 as the default output value
347 m.d.comb += self.output.eq(0)
348 elif self.n_inputs == 1:
349 # handle single input
350 m.d.comb += self.output.eq(self._resized_inputs[0])
351 else:
352 # base case for adding 2 inputs
353 assert self.n_inputs == 2
354 adder = PartitionedAdder(len(self.output),
355 self._reg_partition_points)
356 m.submodules.final_adder = adder
357 m.d.comb += adder.a.eq(self._resized_inputs[0])
358 m.d.comb += adder.b.eq(self._resized_inputs[1])
359 m.d.comb += self.output.eq(adder.output)
360 return m
361
362
363 class AddReduceSingle(Elaboratable):
364     """Perform one level of the add-reduction tree.
365
366     Reduces the incoming terms three at a time with masked full adders;
367     any remaining one or two terms pass through unchanged to the next level.
368
369     :attribute register_levels: List of nesting levels that should have
370         pipeline registers.
371     :attribute partition_points: the input partition points. Modification not
372         supported, except for by ``Signal.eq``.
373     """
374
375 def __init__(self, n_inputs, output_width, n_parts, register_levels,
376 partition_points):
377         """Create an ``AddReduceSingle``.
378
379         :param n_inputs: number of input terms to be summed.
380 :param output_width: bit-width of ``output``.
381 :param register_levels: List of nesting levels that should have
382 pipeline registers.
383 :param partition_points: the input partition points.
384 """
385 self.n_inputs = n_inputs
386 self.n_parts = n_parts
387 self.output_width = output_width
388 self.out_part_ops = [Signal(2, name=f"out_part_ops_{i}")
389 for i in range(n_parts)]
390 self._resized_inputs = [
391 Signal(output_width, name=f"resized_inputs[{i}]")
392 for i in range(n_inputs)]
393 self.register_levels = list(register_levels)
394 self.partition_points = PartitionPoints(partition_points)
395 if not self.partition_points.fits_in_width(output_width):
396 raise ValueError("partition_points doesn't fit in output_width")
397 self._reg_partition_points = self.partition_points.like()
398
399 max_level = AddReduceSingle.get_max_level(n_inputs)
400 for level in self.register_levels:
401 if level > max_level:
402 raise ValueError(
403 "not enough adder levels for specified register levels")
404
405 # this is annoying. we have to create the modules (and terms)
406 # because we need to know what they are (in order to set up the
407 # interconnects back in AddReduce), but cannot do the m.d.comb +=
408         # etc because this is not inside elaborate().
409 self.groups = AddReduceSingle.full_adder_groups(n_inputs)
410 self._intermediate_terms = []
411 if len(self.groups) != 0:
412 self.create_next_terms()
413
414 @staticmethod
415 def get_max_level(input_count):
416 """Get the maximum level.
417
418 All ``register_levels`` must be less than or equal to the maximum
419 level.
420 """
421 retval = 0
422 while True:
423 groups = AddReduceSingle.full_adder_groups(input_count)
424 if len(groups) == 0:
425 return retval
426 input_count %= FULL_ADDER_INPUT_COUNT
427 input_count += 2 * len(groups)
428 retval += 1
429
430 @staticmethod
431 def full_adder_groups(input_count):
432 """Get ``inputs`` indices for which a full adder should be built."""
433 return range(0,
434 input_count - FULL_ADDER_INPUT_COUNT + 1,
435 FULL_ADDER_INPUT_COUNT)
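    # Worked example (editor's note): how the carry-save reduction shrinks
    # the number of terms, starting from 9 inputs:
    #
    #     9 -> 3 groups of 3 -> 9 % 3 + 2*3 = 6
    #     6 -> 2 groups      -> 0     + 2*2 = 4
    #     4 -> 1 group       -> 1     + 2*1 = 3
    #     3 -> 1 group       -> 0     + 2*1 = 2   (FinalAdd sums the last 2)
    #
    # so AddReduceSingle.get_max_level(9) == 4.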
436
437 def elaborate(self, platform):
438 """Elaborate this module."""
439 m = Module()
440
441 for (value, term) in self._intermediate_terms:
442 m.d.comb += term.eq(value)
443
444 mask = self._reg_partition_points.as_mask(self.output_width)
445 m.d.comb += self.part_mask.eq(mask)
446
447 # add and link the intermediate term modules
448 for i, (iidx, adder_i) in enumerate(self.adders):
449 setattr(m.submodules, f"adder_{i}", adder_i)
450
451 m.d.comb += adder_i.in0.eq(self._resized_inputs[iidx])
452 m.d.comb += adder_i.in1.eq(self._resized_inputs[iidx + 1])
453 m.d.comb += adder_i.in2.eq(self._resized_inputs[iidx + 2])
454 m.d.comb += adder_i.mask.eq(self.part_mask)
455
456 return m
457
458 def create_next_terms(self):
459
460 # go on to prepare recursive case
461 intermediate_terms = []
462 _intermediate_terms = []
463
464 def add_intermediate_term(value):
465 intermediate_term = Signal(
466 self.output_width,
467 name=f"intermediate_terms[{len(intermediate_terms)}]")
468 _intermediate_terms.append((value, intermediate_term))
469 intermediate_terms.append(intermediate_term)
470
471 # store mask in intermediary (simplifies graph)
472 self.part_mask = Signal(self.output_width, reset_less=True)
473
474 # create full adders for this recursive level.
475 # this shrinks N terms to 2 * (N // 3) plus the remainder
476 self.adders = []
477 for i in self.groups:
478 adder_i = MaskedFullAdder(self.output_width)
479 self.adders.append((i, adder_i))
480 # add both the sum and the masked-carry to the next level.
481 # 3 inputs have now been reduced to 2...
482 add_intermediate_term(adder_i.sum)
483 add_intermediate_term(adder_i.mcarry)
484 # handle the remaining inputs.
485 if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
486 add_intermediate_term(self._resized_inputs[-1])
487 elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
488             # Just pass the two remaining terms to the next layer: a half
489             # adder would not reduce the term count, so passing them through
490             # unchanged saves gates.
491 add_intermediate_term(self._resized_inputs[-2])
492 add_intermediate_term(self._resized_inputs[-1])
493 else:
494 assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
495
496 self.intermediate_terms = intermediate_terms
497 self._intermediate_terms = _intermediate_terms
498
499
500 class AddReduce(Elaboratable):
501 """Recursively Add list of numbers together.
502
503 :attribute inputs: input ``Signal``s to be summed. Modification not
504 supported, except for by ``Signal.eq``.
505 :attribute register_levels: List of nesting levels that should have
506 pipeline registers.
507 :attribute output: output sum.
508 :attribute partition_points: the input partition points. Modification not
509 supported, except for by ``Signal.eq``.
510 """
511
512 def __init__(self, inputs, output_width, register_levels, partition_points,
513 part_ops):
514 """Create an ``AddReduce``.
515
516 :param inputs: input ``Signal``s to be summed.
517 :param output_width: bit-width of ``output``.
518 :param register_levels: List of nesting levels that should have
519 pipeline registers.
520 :param partition_points: the input partition points.
521 """
522 self.inputs = inputs
523 self.part_ops = part_ops
524 self.out_part_ops = [Signal(2, name=f"out_part_ops_{i}")
525 for i in range(len(part_ops))]
526 self.output = Signal(output_width)
527 self.output_width = output_width
528 self.register_levels = register_levels
529 self.partition_points = partition_points
530
531 self.create_levels()
532
533 @staticmethod
534 def get_max_level(input_count):
535 return AddReduceSingle.get_max_level(input_count)
536
537 @staticmethod
538 def next_register_levels(register_levels):
539 """``Iterable`` of ``register_levels`` for next recursive level."""
540 for level in register_levels:
541 if level > 0:
542 yield level - 1
543
544 def create_levels(self):
545 """creates reduction levels"""
546
547 mods = []
548 next_levels = self.register_levels
549 partition_points = self.partition_points
550 inputs = self.inputs
551 part_ops = self.part_ops
552 n_parts = len(part_ops)
553 while True:
554 ilen = len(inputs)
555 next_level = AddReduceSingle(ilen, self.output_width, n_parts,
556 next_levels, partition_points)
557 mods.append(next_level)
558 next_levels = list(AddReduce.next_register_levels(next_levels))
559 partition_points = next_level._reg_partition_points
560 inputs = next_level.intermediate_terms
561 ilen = len(inputs)
562 part_ops = next_level.out_part_ops
563 groups = AddReduceSingle.full_adder_groups(len(inputs))
564 if len(groups) == 0:
565 break
566
567 if ilen != 0:
568 next_level = FinalAdd(ilen, self.output_width, n_parts,
569 next_levels, partition_points)
570 mods.append(next_level)
571
572 self.levels = mods
573
574 def elaborate(self, platform):
575 """Elaborate this module."""
576 m = Module()
577
578 for i, next_level in enumerate(self.levels):
579 setattr(m.submodules, "next_level%d" % i, next_level)
580
581 partition_points = self.partition_points
582 inputs = self.inputs
583 part_ops = self.part_ops
584 for i in range(len(self.levels)):
585 mcur = self.levels[i]
586 inassign = [mcur._resized_inputs[i].eq(inputs[i])
587 for i in range(len(inputs))]
588 copy_part_ops = [mcur.out_part_ops[i].eq(part_ops[i])
589 for i in range(len(part_ops))]
590 if 0 in mcur.register_levels:
591 m.d.sync += copy_part_ops
592 m.d.sync += inassign
593 m.d.sync += mcur._reg_partition_points.eq(partition_points)
594 else:
595 m.d.comb += copy_part_ops
596 m.d.comb += inassign
597 m.d.comb += mcur._reg_partition_points.eq(partition_points)
598 partition_points = mcur._reg_partition_points
599 inputs = mcur.intermediate_terms
600 part_ops = mcur.out_part_ops
601
602 # output comes from last module
603 m.d.comb += self.output.eq(next_level.output)
604 copy_part_ops = [self.out_part_ops[i].eq(next_level.out_part_ops[i])
605 for i in range(len(self.part_ops))]
606 m.d.comb += copy_part_ops
607
608 return m
609
610
611 OP_MUL_LOW = 0
612 OP_MUL_SIGNED_HIGH = 1
613 OP_MUL_SIGNED_UNSIGNED_HIGH = 2 # a is signed, b is unsigned
614 OP_MUL_UNSIGNED_HIGH = 3
615
616
617 def get_term(value, shift=0, enabled=None):
618 if enabled is not None:
619 value = Mux(enabled, value, 0)
620 if shift > 0:
621 value = Cat(Repl(C(0, 1), shift), value)
622 else:
623 assert shift == 0
624 return value
625
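# Editor's note (illustrative): get_term() gates a value and shifts it into
# place by prepending zero bits, so the overall width grows rather than
# truncating.  e.g. for a partial product ``prod`` of byte columns 1 and 2:
#
#     t = get_term(prod, shift=24, enabled=en)
#     # t is equivalent to Mux(en, prod, 0) with 24 zero bits below it,
#     # ready to be summed by the AddReduce tree.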
626
627 class ProductTerm(Elaboratable):
628 """ this class creates a single product term (a[..]*b[..]).
629         it has a design flaw in that it is the *output* that is selected,
630         while the multiplication(s) are combinatorially generated
631 all the time.
632 """
633
634 def __init__(self, width, twidth, pbwid, a_index, b_index):
635 self.a_index = a_index
636 self.b_index = b_index
637 shift = 8 * (self.a_index + self.b_index)
638 self.pwidth = width
639 self.twidth = twidth
640 self.width = width*2
641 self.shift = shift
642
643 self.ti = Signal(self.width, reset_less=True)
644 self.term = Signal(twidth, reset_less=True)
645 self.a = Signal(twidth//2, reset_less=True)
646 self.b = Signal(twidth//2, reset_less=True)
647 self.pb_en = Signal(pbwid, reset_less=True)
648
649 self.tl = tl = []
650 min_index = min(self.a_index, self.b_index)
651 max_index = max(self.a_index, self.b_index)
652 for i in range(min_index, max_index):
653 tl.append(self.pb_en[i])
654 name = "te_%d_%d" % (self.a_index, self.b_index)
655 if len(tl) > 0:
656 term_enabled = Signal(name=name, reset_less=True)
657 else:
658 term_enabled = None
659 self.enabled = term_enabled
660 self.term.name = "term_%d_%d" % (a_index, b_index) # rename
661
662 def elaborate(self, platform):
663
664 m = Module()
665 if self.enabled is not None:
666 m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
667
668 bsa = Signal(self.width, reset_less=True)
669 bsb = Signal(self.width, reset_less=True)
670 a_index, b_index = self.a_index, self.b_index
671 pwidth = self.pwidth
672 m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
673 m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
674 m.d.comb += self.ti.eq(bsa * bsb)
675 m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
676 """
677 #TODO: sort out width issues, get inputs a/b switched on/off.
678 #data going into Muxes is 1/2 the required width
679
680 pwidth = self.pwidth
681 width = self.width
682 bsa = Signal(self.twidth//2, reset_less=True)
683 bsb = Signal(self.twidth//2, reset_less=True)
684 asel = Signal(width, reset_less=True)
685 bsel = Signal(width, reset_less=True)
686 a_index, b_index = self.a_index, self.b_index
687 m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
688 m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
689 m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
690 m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
691 m.d.comb += self.ti.eq(bsa * bsb)
692 m.d.comb += self.term.eq(self.ti)
693 """
694
695 return m
696
697
698 class ProductTerms(Elaboratable):
699     """ creates a bank of product terms. also performs the actual bit-selection.
700 this class is to be wrapped with a for-loop on the "a" operand.
701 it creates a second-level for-loop on the "b" operand.
702 """
703 def __init__(self, width, twidth, pbwid, a_index, blen):
704 self.a_index = a_index
705 self.blen = blen
706 self.pwidth = width
707 self.twidth = twidth
708 self.pbwid = pbwid
709 self.a = Signal(twidth//2, reset_less=True)
710 self.b = Signal(twidth//2, reset_less=True)
711 self.pb_en = Signal(pbwid, reset_less=True)
712 self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
713 for i in range(blen)]
714
715 def elaborate(self, platform):
716
717 m = Module()
718
719 for b_index in range(self.blen):
720 t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
721 self.a_index, b_index)
722 setattr(m.submodules, "term_%d" % b_index, t)
723
724 m.d.comb += t.a.eq(self.a)
725 m.d.comb += t.b.eq(self.b)
726 m.d.comb += t.pb_en.eq(self.pb_en)
727
728 m.d.comb += self.terms[b_index].eq(t.term)
729
730 return m
731
732
733 class LSBNegTerm(Elaboratable):
734
735 def __init__(self, bit_width):
736 self.bit_width = bit_width
737 self.part = Signal(reset_less=True)
738 self.signed = Signal(reset_less=True)
739 self.op = Signal(bit_width, reset_less=True)
740 self.msb = Signal(reset_less=True)
741 self.nt = Signal(bit_width*2, reset_less=True)
742 self.nl = Signal(bit_width*2, reset_less=True)
743
744 def elaborate(self, platform):
745 m = Module()
746 comb = m.d.comb
747 bit_wid = self.bit_width
748 ext = Repl(0, bit_wid) # extend output to HI part
749
750 # determine sign of each incoming number *in this partition*
751 enabled = Signal(reset_less=True)
752 m.d.comb += enabled.eq(self.part & self.msb & self.signed)
753
754 # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
755 # negation operation is split into a bitwise not and a +1.
756 # likewise for 16, 32, and 64-bit values.
757
758 # width-extended 1s complement if a is signed, otherwise zero
759 comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
760
761 # add 1 if signed, otherwise add zero
762 comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
763
764 return m
765
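# Worked example (editor's note), 8-bit case: when the *other* operand is
# signed and has its MSB set, the raw unsigned partial products overstate
# the true result by op * 0x100, so a correction of -op * 0x100 has to be
# added.  LSBNegTerm supplies that correction as two separate terms for the
# AddReduce tree:
#
#     nt = (~op) << 8       # ones-complement, placed in the HI half
#     nl =    1  << 8       # the "+1" completing the twos-complement negate
#
# e.g. op == 0x03:  nt + nl == 0xFC00 + 0x0100 == 0xFD00
#                           == (-3 * 0x100) & 0xFFFF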
766
767 class Parts(Elaboratable):
768
769 def __init__(self, pbwid, epps, n_parts):
770 self.pbwid = pbwid
771 # inputs
772 self.epps = PartitionPoints.like(epps, name="epps") # expanded points
773 # outputs
774 self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
775
776 def elaborate(self, platform):
777 m = Module()
778
779 epps, parts = self.epps, self.parts
780 # collect part-bytes (double factor because the input is extended)
781 pbs = Signal(self.pbwid, reset_less=True)
782 tl = []
783 for i in range(self.pbwid):
784 pb = Signal(name="pb%d" % i, reset_less=True)
785 m.d.comb += pb.eq(epps.part_byte(i, mfactor=2)) # double
786 tl.append(pb)
787 m.d.comb += pbs.eq(Cat(*tl))
788
789 # negated-temporary copy of partition bits
790 npbs = Signal.like(pbs, reset_less=True)
791 m.d.comb += npbs.eq(~pbs)
792 byte_count = 8 // len(parts)
793 for i in range(len(parts)):
794 pbl = []
795 pbl.append(npbs[i * byte_count - 1])
796 for j in range(i * byte_count, (i + 1) * byte_count - 1):
797 pbl.append(pbs[j])
798 pbl.append(npbs[(i + 1) * byte_count - 1])
799 value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
800 m.d.comb += value.eq(Cat(*pbl))
801 m.d.comb += parts[i].eq(~(value).bool())
802
803 return m
804
805
806 class Part(Elaboratable):
807 """ a key class which, depending on the partitioning, will determine
808 what action to take when parts of the output are signed or unsigned.
809
810 this requires 2 pieces of data *per operand, per partition*:
811 whether the MSB is HI/LO (per partition!), and whether a signed
812 or unsigned operation has been *requested*.
813
814         once that is determined, signed multiplication is basically carried
815         out by splitting 2's complement into 1's complement plus one.
816 1's complement is just a bit-inversion.
817
818 the extra terms - as separate terms - are then thrown at the
819 AddReduce alongside the multiplication part-results.
820 """
821 def __init__(self, epps, width, n_parts, n_levels, pbwid):
822
823 self.pbwid = pbwid
824 self.epps = epps
825
826 # inputs
827 self.a = Signal(64)
828 self.b = Signal(64)
829 self.a_signed = [Signal(name=f"a_signed_{i}") for i in range(8)]
830 self.b_signed = [Signal(name=f"_b_signed_{i}") for i in range(8)]
831 self.pbs = Signal(pbwid, reset_less=True)
832
833 # outputs
834 self.parts = [Signal(name=f"part_{i}") for i in range(n_parts)]
835
836 self.not_a_term = Signal(width)
837 self.neg_lsb_a_term = Signal(width)
838 self.not_b_term = Signal(width)
839 self.neg_lsb_b_term = Signal(width)
840
841 def elaborate(self, platform):
842 m = Module()
843
844 pbs, parts = self.pbs, self.parts
845 epps = self.epps
846 m.submodules.p = p = Parts(self.pbwid, epps, len(parts))
847 m.d.comb += p.epps.eq(epps)
848 parts = p.parts
849
850 byte_count = 8 // len(parts)
851
852 not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
853 self.not_a_term, self.neg_lsb_a_term,
854 self.not_b_term, self.neg_lsb_b_term)
855
856 byte_width = 8 // len(parts) # byte width
857 bit_wid = 8 * byte_width # bit width
858 nat, nbt, nla, nlb = [], [], [], []
859 for i in range(len(parts)):
860 # work out bit-inverted and +1 term for a.
861 pa = LSBNegTerm(bit_wid)
862 setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
863 m.d.comb += pa.part.eq(parts[i])
864 m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
865 m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
866 m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
867 nat.append(pa.nt)
868 nla.append(pa.nl)
869
870 # work out bit-inverted and +1 term for b
871 pb = LSBNegTerm(bit_wid)
872 setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
873 m.d.comb += pb.part.eq(parts[i])
874 m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
875 m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
876 m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
877 nbt.append(pb.nt)
878 nlb.append(pb.nl)
879
880 # concatenate together and return all 4 results.
881 m.d.comb += [not_a_term.eq(Cat(*nat)),
882 not_b_term.eq(Cat(*nbt)),
883 neg_lsb_a_term.eq(Cat(*nla)),
884 neg_lsb_b_term.eq(Cat(*nlb)),
885 ]
886
887 return m
888
889
890 class IntermediateOut(Elaboratable):
891     """ selects the HI/LO part of the multiplication for a given bit-width.
892         the output is also reconstructed in its SIMD (partition) lanes.
893 """
894 def __init__(self, width, out_wid, n_parts):
895 self.width = width
896 self.n_parts = n_parts
897 self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
898 for i in range(8)]
899 self.intermed = Signal(out_wid, reset_less=True)
900 self.output = Signal(out_wid//2, reset_less=True)
901
902 def elaborate(self, platform):
903 m = Module()
904
905 ol = []
906 w = self.width
907 sel = w // 8
908 for i in range(self.n_parts):
909 op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
910 m.d.comb += op.eq(
911 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
912 self.intermed.part(i * w*2, w),
913 self.intermed.part(i * w*2 + w, w)))
914 ol.append(op)
915 m.d.comb += self.output.eq(Cat(*ol))
916
917 return m
918
919
920 class FinalOut(Elaboratable):
921 """ selects the final output based on the partitioning.
922
923 each byte is selectable independently, i.e. it is possible
924 that some partitions requested 8-bit computation whilst others
925 requested 16 or 32 bit.
926 """
927 def __init__(self, out_wid):
928 # inputs
929 self.d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
930 self.d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
931 self.d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
932
933 self.i8 = Signal(out_wid, reset_less=True)
934 self.i16 = Signal(out_wid, reset_less=True)
935 self.i32 = Signal(out_wid, reset_less=True)
936 self.i64 = Signal(out_wid, reset_less=True)
937
938 # output
939 self.out = Signal(out_wid, reset_less=True)
940
941 def elaborate(self, platform):
942 m = Module()
943 ol = []
944 for i in range(8):
945 # select one of the outputs: d8 selects i8, d16 selects i16
946 # d32 selects i32, and the default is i64.
947 # d8 and d16 are ORed together in the first Mux
948 # then the 2nd selects either i8 or i16.
949 # if neither d8 nor d16 are set, d32 selects either i32 or i64.
950 op = Signal(8, reset_less=True, name="op_%d" % i)
951 m.d.comb += op.eq(
952 Mux(self.d8[i] | self.d16[i // 2],
953 Mux(self.d8[i], self.i8.part(i * 8, 8),
954 self.i16.part(i * 8, 8)),
955 Mux(self.d32[i // 4], self.i32.part(i * 8, 8),
956 self.i64.part(i * 8, 8))))
957 ol.append(op)
958 m.d.comb += self.out.eq(Cat(*ol))
959 return m
960
961
962 class OrMod(Elaboratable):
963 """ ORs four values together in a hierarchical tree
964 """
965 def __init__(self, wid):
966 self.wid = wid
967 self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
968 for i in range(4)]
969 self.orout = Signal(wid, reset_less=True)
970
971 def elaborate(self, platform):
972 m = Module()
973 or1 = Signal(self.wid, reset_less=True)
974 or2 = Signal(self.wid, reset_less=True)
975 m.d.comb += or1.eq(self.orin[0] | self.orin[1])
976 m.d.comb += or2.eq(self.orin[2] | self.orin[3])
977 m.d.comb += self.orout.eq(or1 | or2)
978
979 return m
980
981
982 class Signs(Elaboratable):
983     """ determines whether a and b are to be treated as signed numbers
984 based on the required operation type (OP_MUL_*)
985 """
986
987 def __init__(self):
988 self.part_ops = Signal(2, reset_less=True)
989 self.a_signed = Signal(reset_less=True)
990 self.b_signed = Signal(reset_less=True)
991
992 def elaborate(self, platform):
993
994 m = Module()
995
996 asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
997 bsig = (self.part_ops == OP_MUL_LOW) \
998 | (self.part_ops == OP_MUL_SIGNED_HIGH)
999 m.d.comb += self.a_signed.eq(asig)
1000 m.d.comb += self.b_signed.eq(bsig)
1001
1002 return m
1003
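# Truth table (editor's note) for the mapping implemented above:
#
#     part_ops                      a_signed  b_signed
#     OP_MUL_LOW                        1         1
#     OP_MUL_SIGNED_HIGH                1         1
#     OP_MUL_SIGNED_UNSIGNED_HIGH       1         0
#     OP_MUL_UNSIGNED_HIGH              0         0
#
# (for the LOW half of the product the result bits are the same whether the
# operands are treated as signed or unsigned, so "signed" is chosen there.)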
1004
1005 class Mul8_16_32_64(Elaboratable):
1006 """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1007
1008 Supports partitioning into any combination of 8, 16, 32, and 64-bit
1009 partitions on naturally-aligned boundaries. Supports the operation being
1010 set for each partition independently.
1011
1012 :attribute part_pts: the input partition points. Has a partition point at
1013 multiples of 8 in 0 < i < 64. Each partition point's associated
1014 ``Value`` is a ``Signal``. Modification not supported, except for by
1015 ``Signal.eq``.
1016 :attribute part_ops: the operation for each byte. The operation for a
1017 particular partition is selected by assigning the selected operation
1018 code to each byte in the partition. The allowed operation codes are:
1019
1020 :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1021 RISC-V's `mul` instruction.
1022 :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1023 ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1024 instruction.
1025 :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1026 where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1027 `mulhsu` instruction.
1028 :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1029 ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1030 instruction.
1031 """
1032
1033 def __init__(self, register_levels=()):
1034 """ register_levels: specifies the points in the cascade at which
1035 flip-flops are to be inserted.
1036 """
1037
1038 # parameter(s)
1039 self.register_levels = list(register_levels)
1040
1041 # inputs
1042 self.part_pts = PartitionPoints()
1043 for i in range(8, 64, 8):
1044 self.part_pts[i] = Signal(name=f"part_pts_{i}")
1045 self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1046 self.a = Signal(64)
1047 self.b = Signal(64)
1048
1049 # intermediates (needed for unit tests)
1050 self._intermediate_output = Signal(128)
1051
1052 # output
1053 self.output = Signal(64)
1054
1055 def elaborate(self, platform):
1056 m = Module()
1057
1058 # collect part-bytes
1059 pbs = Signal(8, reset_less=True)
1060 tl = []
1061 for i in range(8):
1062 pb = Signal(name="pb%d" % i, reset_less=True)
1063 m.d.comb += pb.eq(self.part_pts.part_byte(i))
1064 tl.append(pb)
1065 m.d.comb += pbs.eq(Cat(*tl))
1066
1067 # create (doubled) PartitionPoints (output is double input width)
1068 expanded_part_pts = eps = PartitionPoints()
1069 for i, v in self.part_pts.items():
1070 ep = Signal(name=f"expanded_part_pts_{i*2}", reset_less=True)
1071 expanded_part_pts[i * 2] = ep
1072 m.d.comb += ep.eq(v)
1073
1074 # local variables
1075 signs = []
1076 for i in range(8):
1077 s = Signs()
1078 signs.append(s)
1079 setattr(m.submodules, "signs%d" % i, s)
1080 m.d.comb += s.part_ops.eq(self.part_ops[i])
1081
1082 n_levels = len(self.register_levels)+1
1083 m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1084 m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1085 m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1086 m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1087 nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1088 for mod in [part_8, part_16, part_32, part_64]:
1089 m.d.comb += mod.a.eq(self.a)
1090 m.d.comb += mod.b.eq(self.b)
1091 for i in range(len(signs)):
1092 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1093 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1094 m.d.comb += mod.pbs.eq(pbs)
1095 nat_l.append(mod.not_a_term)
1096 nbt_l.append(mod.not_b_term)
1097 nla_l.append(mod.neg_lsb_a_term)
1098 nlb_l.append(mod.neg_lsb_b_term)
1099
1100 terms = []
1101
1102 for a_index in range(8):
1103 t = ProductTerms(8, 128, 8, a_index, 8)
1104 setattr(m.submodules, "terms_%d" % a_index, t)
1105
1106 m.d.comb += t.a.eq(self.a)
1107 m.d.comb += t.b.eq(self.b)
1108 m.d.comb += t.pb_en.eq(pbs)
1109
1110 for term in t.terms:
1111 terms.append(term)
1112
1113 # it's fine to bitwise-or data together since they are never enabled
1114 # at the same time
1115 m.submodules.nat_or = nat_or = OrMod(128)
1116 m.submodules.nbt_or = nbt_or = OrMod(128)
1117 m.submodules.nla_or = nla_or = OrMod(128)
1118 m.submodules.nlb_or = nlb_or = OrMod(128)
1119 for l, mod in [(nat_l, nat_or),
1120 (nbt_l, nbt_or),
1121 (nla_l, nla_or),
1122 (nlb_l, nlb_or)]:
1123 for i in range(len(l)):
1124 m.d.comb += mod.orin[i].eq(l[i])
1125 terms.append(mod.orout)
1126
1127 add_reduce = AddReduce(terms,
1128 128,
1129 self.register_levels,
1130 expanded_part_pts,
1131 self.part_ops)
1132
1133 out_part_ops = add_reduce.levels[-1].out_part_ops
1134 out_part_pts = add_reduce.levels[-1]._reg_partition_points
1135
1136 m.submodules.add_reduce = add_reduce
1137 m.d.comb += self._intermediate_output.eq(add_reduce.output)
1138 # create _output_64
1139 m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1140 m.d.comb += io64.intermed.eq(self._intermediate_output)
1141 for i in range(8):
1142 m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1143
1144 # create _output_32
1145 m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1146 m.d.comb += io32.intermed.eq(self._intermediate_output)
1147 for i in range(8):
1148 m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1149
1150 # create _output_16
1151 m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1152 m.d.comb += io16.intermed.eq(self._intermediate_output)
1153 for i in range(8):
1154 m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1155
1156 # create _output_8
1157 m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1158 m.d.comb += io8.intermed.eq(self._intermediate_output)
1159 for i in range(8):
1160 m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1161
1162 m.submodules.p_8 = p_8 = Parts(8, eps, len(part_8.parts))
1163 m.submodules.p_16 = p_16 = Parts(8, eps, len(part_16.parts))
1164 m.submodules.p_32 = p_32 = Parts(8, eps, len(part_32.parts))
1165 m.submodules.p_64 = p_64 = Parts(8, eps, len(part_64.parts))
1166
1167 m.d.comb += p_8.epps.eq(out_part_pts)
1168 m.d.comb += p_16.epps.eq(out_part_pts)
1169 m.d.comb += p_32.epps.eq(out_part_pts)
1170 m.d.comb += p_64.epps.eq(out_part_pts)
1171
1172 # final output
1173 m.submodules.finalout = finalout = FinalOut(64)
1174 for i in range(len(part_8.parts)):
1175 m.d.comb += finalout.d8[i].eq(p_8.parts[i])
1176 for i in range(len(part_16.parts)):
1177 m.d.comb += finalout.d16[i].eq(p_16.parts[i])
1178 for i in range(len(part_32.parts)):
1179 m.d.comb += finalout.d32[i].eq(p_32.parts[i])
1180 m.d.comb += finalout.i8.eq(io8.output)
1181 m.d.comb += finalout.i16.eq(io16.output)
1182 m.d.comb += finalout.i32.eq(io32.output)
1183 m.d.comb += finalout.i64.eq(io64.output)
1184 m.d.comb += self.output.eq(finalout.out)
1185
1186 return m
1187
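# Usage sketch (editor's note): driving the multiplier as two 32-bit lanes
# from inside an nmigen simulation process.  The simulator setup itself is
# omitted and assumed to follow the unit tests; values are illustrative.
#
#     dut = Mul8_16_32_64()
#     # enable only the break at bit 32 -> two 32-bit partitions
#     yield dut.part_pts[32].eq(1)
#     for i in range(8):
#         yield dut.part_ops[i].eq(OP_MUL_LOW)
#     yield dut.a.eq(0x0000_0003_0000_0002)
#     yield dut.b.eq(0x0000_0005_0000_0007)
#     yield Delay(0.1e-6)
#     # dut.output now reads 0x0000_000F_0000_000E (3*5 in the high lane,
#     # 2*7 in the low lane)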
1188
1189 if __name__ == "__main__":
1190 m = Mul8_16_32_64()
1191 main(m, ports=[m.a,
1192 m.b,
1193 m._intermediate_output,
1194 m.output,
1195 *m.part_ops,
1196 *m.part_pts.values()])