src/ieee754/part_mul_add/multiply.py
1 # SPDX-License-Identifier: LGPL-2.1-or-later
2 # See Notices.txt for copyright information
3 """Integer Multiplication."""
4
5 from nmigen import Signal, Module, Value, Elaboratable, Cat, C, Mux, Repl
6 from nmigen.hdl.ast import Assign
7 from abc import ABCMeta, abstractmethod
8 from nmigen.cli import main
9 from functools import reduce
10 from operator import or_
11
12
13 class PartitionPoints(dict):
14 """Partition points and corresponding ``Value``s.
15
16     The points at which an ALU is partitioned, along with ``Value``s that
17     specify whether the corresponding partition points are enabled.
18
19 For example: ``{1: True, 5: True, 10: True}`` with
20 ``width == 16`` specifies that the ALU is split into 4 sections:
21 * bits 0 <= ``i`` < 1
22 * bits 1 <= ``i`` < 5
23 * bits 5 <= ``i`` < 10
24 * bits 10 <= ``i`` < 16
25
26 If the partition_points were instead ``{1: True, 5: a, 10: True}``
27 where ``a`` is a 1-bit ``Signal``:
28 * If ``a`` is asserted:
29 * bits 0 <= ``i`` < 1
30 * bits 1 <= ``i`` < 5
31 * bits 5 <= ``i`` < 10
32 * bits 10 <= ``i`` < 16
33 * Otherwise
34 * bits 0 <= ``i`` < 1
35 * bits 1 <= ``i`` < 10
36 * bits 10 <= ``i`` < 16
37 """
38
39 def __init__(self, partition_points=None):
40 """Create a new ``PartitionPoints``.
41
42 :param partition_points: the input partition points to values mapping.
43 """
44 super().__init__()
45 if partition_points is not None:
46 for point, enabled in partition_points.items():
47 if not isinstance(point, int):
48 raise TypeError("point must be a non-negative integer")
49 if point < 0:
50 raise ValueError("point must be a non-negative integer")
51 self[point] = Value.wrap(enabled)
52
53 def like(self, name=None, src_loc_at=0, mul=1):
54 """Create a new ``PartitionPoints`` with ``Signal``s for all values.
55
56 :param name: the base name for the new ``Signal``s.
57 :param mul: a multiplication factor on the indices
58 """
59 if name is None:
60 name = Signal(src_loc_at=1+src_loc_at).name # get variable name
61 retval = PartitionPoints()
62 for point, enabled in self.items():
63 point *= mul
64 retval[point] = Signal(enabled.shape(), name=f"{name}_{point}")
65 return retval
66
67 def eq(self, rhs):
68 """Assign ``PartitionPoints`` using ``Signal.eq``."""
69 if set(self.keys()) != set(rhs.keys()):
70 raise ValueError("incompatible point set")
71 for point, enabled in self.items():
72 yield enabled.eq(rhs[point])
73
74 def as_mask(self, width, mul=1):
75 """Create a bit-mask from `self`.
76
77 Each bit in the returned mask is clear only if the partition point at
78 the same bit-index is enabled.
79
80 :param width: the bit width of the resulting mask
81         :param mul: a "multiplier" which in-place expands the partition points
82             (typically set to 2 when used for multipliers, because the
83             result is twice the width of the operands)
84         """
85         bits = []
86         for i in range(width):
87             p = i / mul
88             if p.is_integer() and int(p) in self:
89                 bits.append(~self[int(p)])
90             else:
91                 bits.append(True)
92         return Cat(*bits)
92
93 def get_max_partition_count(self, width):
94 """Get the maximum number of partitions.
95
96 Gets the number of partitions when all partition points are enabled.
97 """
98 retval = 1
99 for point in self.keys():
100 if point < width:
101 retval += 1
102 return retval
103
104 def fits_in_width(self, width):
105 """Check if all partition points are smaller than `width`."""
106 for point in self.keys():
107 if point >= width:
108 return False
109 return True
110
111 def part_byte(self, index, mfactor=1): # mfactor used for "expanding"
112 if index == -1 or index == 7:
113 return C(True, 1)
114 assert index >= 0 and index < 8
115 return self[(index * 8 + 8)*mfactor]
116
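
# Illustrative sketch only: a plain-python model of what ``as_mask`` above
# computes, to make the LSB-first bit ordering concrete.  The helper name
# ``_demo_as_mask_model`` is hypothetical and is not used by the hardware.
def _demo_as_mask_model():
    def as_mask_model(points, width, mul=1):
        # mirrors PartitionPoints.as_mask: a bit is clear only where an
        # *enabled* partition point sits at that (scaled) bit-index
        bits = []
        for i in range(width):
            p = i / mul
            if p.is_integer() and int(p) in points:
                bits.append(0 if points[int(p)] else 1)
            else:
                bits.append(1)
        return bits

    # points {4: enabled, 8: disabled}, width 12: only bit 4 is cleared;
    # the disabled point at 8 leaves its mask bit set
    assert as_mask_model({4: 1, 8: 0}, 12) == [1, 1, 1, 1, 0, 1,
                                               1, 1, 1, 1, 1, 1]
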
117
118 class FullAdder(Elaboratable):
119 """Full Adder.
120
121 :attribute in0: the first input
122 :attribute in1: the second input
123 :attribute in2: the third input
124 :attribute sum: the sum output
125 :attribute carry: the carry output
126
127 Rather than do individual full adders (and have an array of them,
128 which would be very slow to simulate), this module can specify the
129 bit width of the inputs and outputs: in effect it performs multiple
130 Full 3-2 Add operations "in parallel".
131 """
132
133 def __init__(self, width):
134 """Create a ``FullAdder``.
135
136 :param width: the bit width of the input and output
137 """
138 self.in0 = Signal(width, reset_less=True)
139 self.in1 = Signal(width, reset_less=True)
140 self.in2 = Signal(width, reset_less=True)
141 self.sum = Signal(width, reset_less=True)
142 self.carry = Signal(width, reset_less=True)
143
144 def elaborate(self, platform):
145 """Elaborate this module."""
146 m = Module()
147 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
148 m.d.comb += self.carry.eq((self.in0 & self.in1)
149 | (self.in1 & self.in2)
150 | (self.in2 & self.in0))
151 return m
152
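
# Illustrative sketch only: the defining property of the 3-2 ("carry-save")
# adder above is that, per bit column, sum + 2*carry equals in0 + in1 + in2,
# which is what lets many terms be reduced without any carry propagation.
# ``_demo_full_adder_identity`` is a hypothetical helper for illustration.
def _demo_full_adder_identity():
    for in0 in range(2):
        for in1 in range(2):
            for in2 in range(2):
                s = in0 ^ in1 ^ in2
                c = (in0 & in1) | (in1 & in2) | (in2 & in0)
                assert s + 2 * c == in0 + in1 + in2
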
153
154 class MaskedFullAdder(Elaboratable):
155 """Masked Full Adder.
156
157 :attribute mask: the carry partition mask
158 :attribute in0: the first input
159 :attribute in1: the second input
160 :attribute in2: the third input
161 :attribute sum: the sum output
162 :attribute mcarry: the masked carry output
163
164 FullAdders are always used with a "mask" on the output. To keep
165 the graphviz "clean", this class performs the masking here rather
166 than inside a large for-loop.
167
168 See the following discussion as to why this is no longer derived
169 from FullAdder. Each carry is shifted here *before* being ANDed
170 with the mask, so that an AOI cell may be used (which is more
171 gate-efficient)
172 https://en.wikipedia.org/wiki/AND-OR-Invert
173 https://groups.google.com/d/msg/comp.arch/fcq-GLQqvas/vTxmcA0QAgAJ
174 """
175
176 def __init__(self, width):
177 """Create a ``MaskedFullAdder``.
178
179 :param width: the bit width of the input and output
180 """
181 self.width = width
182 self.mask = Signal(width, reset_less=True)
183 self.mcarry = Signal(width, reset_less=True)
184 self.in0 = Signal(width, reset_less=True)
185 self.in1 = Signal(width, reset_less=True)
186 self.in2 = Signal(width, reset_less=True)
187 self.sum = Signal(width, reset_less=True)
188
189 def elaborate(self, platform):
190 """Elaborate this module."""
191 m = Module()
192 s1 = Signal(self.width, reset_less=True)
193 s2 = Signal(self.width, reset_less=True)
194 s3 = Signal(self.width, reset_less=True)
195 c1 = Signal(self.width, reset_less=True)
196 c2 = Signal(self.width, reset_less=True)
197 c3 = Signal(self.width, reset_less=True)
198 m.d.comb += self.sum.eq(self.in0 ^ self.in1 ^ self.in2)
199 m.d.comb += s1.eq(Cat(0, self.in0))
200 m.d.comb += s2.eq(Cat(0, self.in1))
201 m.d.comb += s3.eq(Cat(0, self.in2))
202 m.d.comb += c1.eq(s1 & s2 & self.mask)
203 m.d.comb += c2.eq(s2 & s3 & self.mask)
204 m.d.comb += c3.eq(s3 & s1 & self.mask)
205 m.d.comb += self.mcarry.eq(c1 | c2 | c3)
206 return m
207
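
# Illustrative sketch only: because each carry is shifted left by one before
# being ANDed with the mask, a carry generated just below an enabled
# partition point is killed and can never disturb the lane above it.
# ``_demo_masked_carry`` is a hypothetical helper, not used by the hardware.
def _demo_masked_carry():
    in0, in1, in2 = 0b1000, 0b1000, 0b1000  # three terms in the low 4-bit lane
    raw_carry = (in0 & in1) | (in1 & in2) | (in2 & in0)  # 0b1000
    shifted = raw_carry << 1                # would land in bit 4 of the sum
    mask = 0b11101111                       # enabled partition point at bit 4
    assert shifted & mask == 0              # carry stopped at the boundary
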
208
209 class PartitionedAdder(Elaboratable):
210 """Partitioned Adder.
211
212 Performs the final add. The partition points are included in the
213 actual add (in one of the operands only), which causes a carry over
214 to the next bit. Then the final output *removes* the extra bits from
215 the result.
216
217 partition: .... P... P... P... P... (32 bits)
218 a : .... .... .... .... .... (32 bits)
219 b : .... .... .... .... .... (32 bits)
220 exp-a : ....P....P....P....P.... (32+4 bits, P=1 if no partition)
221 exp-b : ....0....0....0....0.... (32 bits plus 4 zeros)
222 exp-o : ....xN...xN...xN...xN... (32+4 bits - x to be discarded)
223 o : .... N... N... N... N... (32 bits - x ignored, N is carry-over)
224
225 :attribute width: the bit width of the input and output. Read-only.
226 :attribute a: the first input to the adder
227 :attribute b: the second input to the adder
228 :attribute output: the sum output
229 :attribute partition_points: the input partition points. Modification not
230 supported, except for by ``Signal.eq``.
231 """
232
233 def __init__(self, width, partition_points, partition_step=1):
234 """Create a ``PartitionedAdder``.
235
236 :param width: the bit width of the input and output
237 :param partition_points: the input partition points
238 :param partition_step: a multiplier (typically double) step
239 which in-place "expands" the partition points
240 """
241 self.width = width
242 self.pmul = partition_step
243 self.a = Signal(width, reset_less=True)
244 self.b = Signal(width, reset_less=True)
245 self.output = Signal(width, reset_less=True)
246 self.partition_points = PartitionPoints(partition_points)
247 if not self.partition_points.fits_in_width(width):
248 raise ValueError("partition_points doesn't fit in width")
249 expanded_width = 0
250 for i in range(self.width):
251 if i in self.partition_points:
252 expanded_width += 1
253 expanded_width += 1
254 self._expanded_width = expanded_width
255
256 def elaborate(self, platform):
257 """Elaborate this module."""
258 m = Module()
259 expanded_a = Signal(self._expanded_width, reset_less=True)
260 expanded_b = Signal(self._expanded_width, reset_less=True)
261 expanded_o = Signal(self._expanded_width, reset_less=True)
262
263 expanded_index = 0
264 # store bits in a list, use Cat later. graphviz is much cleaner
265 al, bl, ol, ea, eb, eo = [],[],[],[],[],[]
266
267 # partition points are "breaks" (extra zeros or 1s) in what would
268 # otherwise be a massive long add. when the "break" points are 0,
269 # whatever is in it (in the output) is discarded. however when
270 # there is a "1", it causes a roll-over carry to the *next* bit.
271 # we still ignore the "break" bit in the [intermediate] output,
272 # however by that time we've got the effect that we wanted: the
273 # carry has been carried *over* the break point.
274
275 for i in range(self.width):
276             pi = i/self.pmul # double the range of the partition point test
277             if pi.is_integer() and int(pi) in self.partition_points:
278                 # add extra bit set to 0 + 0 for enabled partition points
279                 # and 1 + 0 for disabled partition points
280                 ea.append(expanded_a[expanded_index])
281                 al.append(~self.partition_points[int(pi)]) # extra bit in a
282 eb.append(expanded_b[expanded_index])
283 bl.append(C(0)) # yes, add a zero
284 expanded_index += 1 # skip the extra point. NOT in the output
285 ea.append(expanded_a[expanded_index])
286 eb.append(expanded_b[expanded_index])
287 eo.append(expanded_o[expanded_index])
288 al.append(self.a[i])
289 bl.append(self.b[i])
290 ol.append(self.output[i])
291 expanded_index += 1
292
293 # combine above using Cat
294 m.d.comb += Cat(*ea).eq(Cat(*al))
295 m.d.comb += Cat(*eb).eq(Cat(*bl))
296 m.d.comb += Cat(*ol).eq(Cat(*eo))
297
298 # use only one addition to take advantage of look-ahead carry and
299 # special hardware on FPGAs
300 m.d.comb += expanded_o.eq(expanded_a + expanded_b)
301 return m
302
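
# Illustrative sketch only: the "break bit" trick above, modelled with plain
# integers on two 4-bit lanes.  ``_demo_partitioned_adder`` is a hypothetical
# helper, not used by the hardware.
def _demo_partitioned_adder():
    def add4x2(a_lo, a_hi, b_lo, b_hi, partition_enabled):
        # expanded operands: {hi lane (4 bits)}{break bit}{lo lane (4 bits)}
        brk = 0 if partition_enabled else 1   # ~partition_point goes into "a"
        exp_a = a_lo | (brk << 4) | (a_hi << 5)
        exp_b = b_lo | (0 << 4) | (b_hi << 5)  # "b" always gets 0 at the break
        exp_o = exp_a + exp_b
        return exp_o & 0xF, (exp_o >> 5) & 0xF  # break bit (bit 4) discarded

    # partition enabled: the lanes add independently (the carry is swallowed)
    assert add4x2(0xF, 0x1, 0x1, 0x1, True) == (0x0, 0x2)
    # partition disabled: the carry from the low lane ripples into the high lane
    assert add4x2(0xF, 0x1, 0x1, 0x1, False) == (0x0, 0x3)
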
303
304 FULL_ADDER_INPUT_COUNT = 3
305
306 class AddReduceData:
307
308 def __init__(self, part_pts, n_inputs, output_width, n_parts):
309 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
310 for i in range(n_parts)]
311 self.terms = [Signal(output_width, name=f"inputs_{i}",
312 reset_less=True)
313 for i in range(n_inputs)]
314 self.part_pts = part_pts.like()
315
316 def eq_from(self, part_pts, inputs, part_ops):
317 return [self.part_pts.eq(part_pts)] + \
318 [self.terms[i].eq(inputs[i])
319 for i in range(len(self.terms))] + \
320 [self.part_ops[i].eq(part_ops[i])
321 for i in range(len(self.part_ops))]
322
323 def eq(self, rhs):
324 return self.eq_from(rhs.part_pts, rhs.terms, rhs.part_ops)
325
326
327 class FinalReduceData:
328
329 def __init__(self, part_pts, output_width, n_parts):
330 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
331 for i in range(n_parts)]
332 self.output = Signal(output_width, reset_less=True)
333 self.part_pts = part_pts.like()
334
335 def eq_from(self, part_pts, output, part_ops):
336 return [self.part_pts.eq(part_pts)] + \
337 [self.output.eq(output)] + \
338 [self.part_ops[i].eq(part_ops[i])
339 for i in range(len(self.part_ops))]
340
341 def eq(self, rhs):
342 return self.eq_from(rhs.part_pts, rhs.output, rhs.part_ops)
343
344
345 class FinalAdd(Elaboratable):
346 """ Final stage of add reduce
347 """
348
349 def __init__(self, n_inputs, output_width, n_parts, register_levels,
350 partition_points):
351 self.i = AddReduceData(partition_points, n_inputs,
352 output_width, n_parts)
353 self.o = FinalReduceData(partition_points, output_width, n_parts)
354 self.output_width = output_width
355 self.n_inputs = n_inputs
356 self.n_parts = n_parts
357 self.register_levels = list(register_levels)
358 self.partition_points = PartitionPoints(partition_points)
359 if not self.partition_points.fits_in_width(output_width):
360 raise ValueError("partition_points doesn't fit in output_width")
361
362 def elaborate(self, platform):
363 """Elaborate this module."""
364 m = Module()
365
366 output_width = self.output_width
367 output = Signal(output_width, reset_less=True)
368 if self.n_inputs == 0:
369 # use 0 as the default output value
370 m.d.comb += output.eq(0)
371 elif self.n_inputs == 1:
372 # handle single input
373 m.d.comb += output.eq(self.i.terms[0])
374 else:
375 # base case for adding 2 inputs
376 assert self.n_inputs == 2
377 adder = PartitionedAdder(output_width,
378 self.i.part_pts, 2)
379 m.submodules.final_adder = adder
380 m.d.comb += adder.a.eq(self.i.terms[0])
381 m.d.comb += adder.b.eq(self.i.terms[1])
382 m.d.comb += output.eq(adder.output)
383
384 # create output
385 m.d.comb += self.o.eq_from(self.i.part_pts, output,
386 self.i.part_ops)
387
388 return m
389
390
391 class AddReduceSingle(Elaboratable):
392     """Perform a single level of the add-reduction tree (3:2 compression).
393 
394     :attribute i: the ``AddReduceData`` input (terms, partition points and
395         part ops). Modification not supported, except for by ``Signal.eq``.
396     :attribute register_levels: List of nesting levels that should have
397         pipeline registers.
398     :attribute o: the ``AddReduceData`` output holding the reduced terms.
399     :attribute partition_points: the input partition points. Modification not
400         supported, except for by ``Signal.eq``.
401 """
402
403 def __init__(self, n_inputs, output_width, n_parts, register_levels,
404 partition_points):
405         """Create an ``AddReduceSingle``.
406 
407         :param n_inputs: number of input terms to be reduced.
408         :param output_width: bit-width of ``output``.
409         :param register_levels: List of nesting levels that should have
410             pipeline registers.
411         :param partition_points: the input partition points.
412 """
413 self.n_inputs = n_inputs
414 self.n_parts = n_parts
415 self.output_width = output_width
416 self.i = AddReduceData(partition_points, n_inputs,
417 output_width, n_parts)
418 self.register_levels = list(register_levels)
419 self.partition_points = PartitionPoints(partition_points)
420 if not self.partition_points.fits_in_width(output_width):
421 raise ValueError("partition_points doesn't fit in output_width")
422
423 max_level = AddReduceSingle.get_max_level(n_inputs)
424 for level in self.register_levels:
425 if level > max_level:
426 raise ValueError(
427 "not enough adder levels for specified register levels")
428
429 self.groups = AddReduceSingle.full_adder_groups(n_inputs)
430 n_terms = AddReduceSingle.calc_n_inputs(n_inputs, self.groups)
431 self.o = AddReduceData(partition_points, n_terms, output_width, n_parts)
432
433 @staticmethod
434 def calc_n_inputs(n_inputs, groups):
435 retval = len(groups)*2
436 if n_inputs % FULL_ADDER_INPUT_COUNT == 1:
437 retval += 1
438 elif n_inputs % FULL_ADDER_INPUT_COUNT == 2:
439 retval += 2
440 else:
441 assert n_inputs % FULL_ADDER_INPUT_COUNT == 0
442 return retval
443
444 @staticmethod
445 def get_max_level(input_count):
446 """Get the maximum level.
447
448 All ``register_levels`` must be less than or equal to the maximum
449 level.
450 """
451 retval = 0
452 while True:
453 groups = AddReduceSingle.full_adder_groups(input_count)
454 if len(groups) == 0:
455 return retval
456 input_count %= FULL_ADDER_INPUT_COUNT
457 input_count += 2 * len(groups)
458 retval += 1
459
460 @staticmethod
461 def full_adder_groups(input_count):
462 """Get ``inputs`` indices for which a full adder should be built."""
463 return range(0,
464 input_count - FULL_ADDER_INPUT_COUNT + 1,
465 FULL_ADDER_INPUT_COUNT)
466
467 def create_next_terms(self):
468 """ create next intermediate terms, for linking up in elaborate, below
469 """
470 terms = []
471 adders = []
472
473 # create full adders for this recursive level.
474 # this shrinks N terms to 2 * (N // 3) plus the remainder
475 for i in self.groups:
476 adder_i = MaskedFullAdder(self.output_width)
477 adders.append((i, adder_i))
478 # add both the sum and the masked-carry to the next level.
479 # 3 inputs have now been reduced to 2...
480 terms.append(adder_i.sum)
481 terms.append(adder_i.mcarry)
482 # handle the remaining inputs.
483 if self.n_inputs % FULL_ADDER_INPUT_COUNT == 1:
484 terms.append(self.i.terms[-1])
485 elif self.n_inputs % FULL_ADDER_INPUT_COUNT == 2:
486 # Just pass the terms to the next layer, since we wouldn't gain
487 # anything by using a half adder since there would still be 2 terms
488 # and just passing the terms to the next layer saves gates.
489 terms.append(self.i.terms[-2])
490 terms.append(self.i.terms[-1])
491 else:
492 assert self.n_inputs % FULL_ADDER_INPUT_COUNT == 0
493
494 return terms, adders
495
496 def elaborate(self, platform):
497 """Elaborate this module."""
498 m = Module()
499
500 terms, adders = self.create_next_terms()
501
502 # copy the intermediate terms to the output
503 for i, value in enumerate(terms):
504 m.d.comb += self.o.terms[i].eq(value)
505
506 # copy reg part points and part ops to output
507 m.d.comb += self.o.part_pts.eq(self.i.part_pts)
508 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
509 for i in range(len(self.i.part_ops))]
510
511 # set up the partition mask (for the adders)
512 part_mask = Signal(self.output_width, reset_less=True)
513
514 # get partition points as a mask
515 mask = self.i.part_pts.as_mask(self.output_width, mul=2)
516 m.d.comb += part_mask.eq(mask)
517
518 # add and link the intermediate term modules
519 for i, (iidx, adder_i) in enumerate(adders):
520 setattr(m.submodules, f"adder_{i}", adder_i)
521
522 m.d.comb += adder_i.in0.eq(self.i.terms[iidx])
523 m.d.comb += adder_i.in1.eq(self.i.terms[iidx + 1])
524 m.d.comb += adder_i.in2.eq(self.i.terms[iidx + 2])
525 m.d.comb += adder_i.mask.eq(part_mask)
526
527 return m
528
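
# Illustrative sketch only: each AddReduceSingle level turns N terms into
# 2*(N//3) + (N % 3) terms, and get_max_level() simply counts how many such
# levels are needed before only two terms remain (which FinalAdd then adds).
# ``_demo_reduction_levels`` is a hypothetical helper for illustration.
def _demo_reduction_levels():
    def levels(n):
        count = 0
        while n > 2:
            n = 2 * (n // 3) + (n % 3)
            count += 1
        return count

    # e.g. the 64+4 terms produced for the 64-bit multiplier take ten
    # levels of 3:2 compression to get down to the final two terms
    assert levels(68) == AddReduceSingle.get_max_level(68)
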
529
530 class AddReduce(Elaboratable):
531 """Recursively Add list of numbers together.
532
533 :attribute inputs: input ``Signal``s to be summed. Modification not
534 supported, except for by ``Signal.eq``.
535 :attribute register_levels: List of nesting levels that should have
536 pipeline registers.
537     :attribute o: the ``FinalReduceData`` output holding the final sum.
538 :attribute partition_points: the input partition points. Modification not
539 supported, except for by ``Signal.eq``.
540 """
541
542 def __init__(self, inputs, output_width, register_levels, partition_points,
543 part_ops):
544 """Create an ``AddReduce``.
545
546 :param inputs: input ``Signal``s to be summed.
547 :param output_width: bit-width of ``output``.
548 :param register_levels: List of nesting levels that should have
549 pipeline registers.
550 :param partition_points: the input partition points.
551 """
552 self.inputs = inputs
553 self.part_ops = part_ops
554 n_parts = len(part_ops)
555 self.o = FinalReduceData(partition_points, output_width, n_parts)
556 self.output_width = output_width
557 self.register_levels = register_levels
558 self.partition_points = partition_points
559
560 self.create_levels()
561
562 @staticmethod
563 def get_max_level(input_count):
564 return AddReduceSingle.get_max_level(input_count)
565
566 @staticmethod
567 def next_register_levels(register_levels):
568 """``Iterable`` of ``register_levels`` for next recursive level."""
569 for level in register_levels:
570 if level > 0:
571 yield level - 1
572
573 def create_levels(self):
574 """creates reduction levels"""
575
576 mods = []
577 next_levels = self.register_levels
578 partition_points = self.partition_points
579 part_ops = self.part_ops
580 n_parts = len(part_ops)
581 inputs = self.inputs
582 ilen = len(inputs)
583 while True:
584 groups = AddReduceSingle.full_adder_groups(len(inputs))
585 if len(groups) == 0:
586 break
587 next_level = AddReduceSingle(ilen, self.output_width, n_parts,
588 next_levels, partition_points)
589 mods.append(next_level)
590 next_levels = list(AddReduce.next_register_levels(next_levels))
591 partition_points = next_level.i.part_pts
592 inputs = next_level.o.terms
593 ilen = len(inputs)
594 part_ops = next_level.i.part_ops
595
596 next_level = FinalAdd(ilen, self.output_width, n_parts,
597 next_levels, partition_points)
598 mods.append(next_level)
599
600 self.levels = mods
601
602 def elaborate(self, platform):
603 """Elaborate this module."""
604 m = Module()
605
606 for i, next_level in enumerate(self.levels):
607 setattr(m.submodules, "next_level%d" % i, next_level)
608
609 partition_points = self.partition_points
610 inputs = self.inputs
611 part_ops = self.part_ops
612 n_parts = len(part_ops)
613 n_inputs = len(inputs)
614 output_width = self.output_width
615 i = AddReduceData(partition_points, n_inputs, output_width, n_parts)
616 m.d.comb += i.eq_from(partition_points, inputs, part_ops)
617 for idx in range(len(self.levels)):
618 mcur = self.levels[idx]
619 if 0 in mcur.register_levels:
620 m.d.sync += mcur.i.eq(i)
621 else:
622 m.d.comb += mcur.i.eq(i)
623 i = mcur.o # for next loop
624
625 # output comes from last module
626 m.d.comb += self.o.eq(i)
627
628 return m
629
630
631 OP_MUL_LOW = 0
632 OP_MUL_SIGNED_HIGH = 1
633 OP_MUL_SIGNED_UNSIGNED_HIGH = 2 # a is signed, b is unsigned
634 OP_MUL_UNSIGNED_HIGH = 3
635
636
637 def get_term(value, shift=0, enabled=None):
638 if enabled is not None:
639 value = Mux(enabled, value, 0)
640 if shift > 0:
641 value = Cat(Repl(C(0, 1), shift), value)
642 else:
643 assert shift == 0
644 return value
645
646
647 class ProductTerm(Elaboratable):
648     """ this class creates a single product term (a[..]*b[..]).
649         it has a design flaw in that it is the *output* that is selected,
650         while the multiplication(s) are generated combinatorially
651         all the time.
652     """
653
654 def __init__(self, width, twidth, pbwid, a_index, b_index):
655 self.a_index = a_index
656 self.b_index = b_index
657 shift = 8 * (self.a_index + self.b_index)
658 self.pwidth = width
659 self.twidth = twidth
660 self.width = width*2
661 self.shift = shift
662
663 self.ti = Signal(self.width, reset_less=True)
664 self.term = Signal(twidth, reset_less=True)
665 self.a = Signal(twidth//2, reset_less=True)
666 self.b = Signal(twidth//2, reset_less=True)
667 self.pb_en = Signal(pbwid, reset_less=True)
668
669 self.tl = tl = []
670 min_index = min(self.a_index, self.b_index)
671 max_index = max(self.a_index, self.b_index)
672 for i in range(min_index, max_index):
673 tl.append(self.pb_en[i])
674 name = "te_%d_%d" % (self.a_index, self.b_index)
675 if len(tl) > 0:
676 term_enabled = Signal(name=name, reset_less=True)
677 else:
678 term_enabled = None
679 self.enabled = term_enabled
680 self.term.name = "term_%d_%d" % (a_index, b_index) # rename
681
682 def elaborate(self, platform):
683
684 m = Module()
685 if self.enabled is not None:
686 m.d.comb += self.enabled.eq(~(Cat(*self.tl).bool()))
687
688 bsa = Signal(self.width, reset_less=True)
689 bsb = Signal(self.width, reset_less=True)
690 a_index, b_index = self.a_index, self.b_index
691 pwidth = self.pwidth
692 m.d.comb += bsa.eq(self.a.part(a_index * pwidth, pwidth))
693 m.d.comb += bsb.eq(self.b.part(b_index * pwidth, pwidth))
694 m.d.comb += self.ti.eq(bsa * bsb)
695 m.d.comb += self.term.eq(get_term(self.ti, self.shift, self.enabled))
696 """
697 #TODO: sort out width issues, get inputs a/b switched on/off.
698 #data going into Muxes is 1/2 the required width
699
700 pwidth = self.pwidth
701 width = self.width
702 bsa = Signal(self.twidth//2, reset_less=True)
703 bsb = Signal(self.twidth//2, reset_less=True)
704 asel = Signal(width, reset_less=True)
705 bsel = Signal(width, reset_less=True)
706 a_index, b_index = self.a_index, self.b_index
707 m.d.comb += asel.eq(self.a.part(a_index * pwidth, pwidth))
708 m.d.comb += bsel.eq(self.b.part(b_index * pwidth, pwidth))
709 m.d.comb += bsa.eq(get_term(asel, self.shift, self.enabled))
710 m.d.comb += bsb.eq(get_term(bsel, self.shift, self.enabled))
711 m.d.comb += self.ti.eq(bsa * bsb)
712 m.d.comb += self.term.eq(self.ti)
713 """
714
715 return m
716
717
718 class ProductTerms(Elaboratable):
719     """ creates a bank of product terms, and performs the actual bit-selection.
720         this class is to be wrapped with a for-loop on the "a" operand;
721         it creates a second-level for-loop on the "b" operand.
722     """
723 def __init__(self, width, twidth, pbwid, a_index, blen):
724 self.a_index = a_index
725 self.blen = blen
726 self.pwidth = width
727 self.twidth = twidth
728 self.pbwid = pbwid
729 self.a = Signal(twidth//2, reset_less=True)
730 self.b = Signal(twidth//2, reset_less=True)
731 self.pb_en = Signal(pbwid, reset_less=True)
732 self.terms = [Signal(twidth, name="term%d"%i, reset_less=True) \
733 for i in range(blen)]
734
735 def elaborate(self, platform):
736
737 m = Module()
738
739 for b_index in range(self.blen):
740 t = ProductTerm(self.pwidth, self.twidth, self.pbwid,
741 self.a_index, b_index)
742 setattr(m.submodules, "term_%d" % b_index, t)
743
744 m.d.comb += t.a.eq(self.a)
745 m.d.comb += t.b.eq(self.b)
746 m.d.comb += t.pb_en.eq(self.pb_en)
747
748 m.d.comb += self.terms[b_index].eq(t.term)
749
750 return m
751
752
753 class LSBNegTerm(Elaboratable):
754
755 def __init__(self, bit_width):
756 self.bit_width = bit_width
757 self.part = Signal(reset_less=True)
758 self.signed = Signal(reset_less=True)
759 self.op = Signal(bit_width, reset_less=True)
760 self.msb = Signal(reset_less=True)
761 self.nt = Signal(bit_width*2, reset_less=True)
762 self.nl = Signal(bit_width*2, reset_less=True)
763
764 def elaborate(self, platform):
765 m = Module()
766 comb = m.d.comb
767 bit_wid = self.bit_width
768 ext = Repl(0, bit_wid) # extend output to HI part
769
770 # determine sign of each incoming number *in this partition*
771 enabled = Signal(reset_less=True)
772 m.d.comb += enabled.eq(self.part & self.msb & self.signed)
773
774 # for 8-bit values: form a * 0xFF00 by using -a * 0x100, the
775 # negation operation is split into a bitwise not and a +1.
776 # likewise for 16, 32, and 64-bit values.
777
778 # width-extended 1s complement if a is signed, otherwise zero
779 comb += self.nt.eq(Mux(enabled, Cat(ext, ~self.op), 0))
780
781 # add 1 if signed, otherwise add zero
782 comb += self.nl.eq(Cat(ext, enabled, Repl(0, bit_wid-1)))
783
784 return m
785
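
# Illustrative sketch only: the correction applied above, shown with plain
# 8-bit integers.  When the "other" operand is signed and its MSB is set,
# adding ((~op) << 8) plus (1 << 8) -- i.e. -op * 0x100, modulo 2**16 --
# converts the unsigned product op*other into op * signed(other).  The
# hardware keeps the "~" term and the "+1" term separate so that both can
# simply be thrown into the add-reduce tree.  ``_demo_lsb_neg_term`` is a
# hypothetical helper, not used by the hardware.
def _demo_lsb_neg_term():
    def to_signed8(x):
        return x - 0x100 if x & 0x80 else x

    for a in (0, 1, 0x7F, 0x80, 0xAB, 0xFF):
        for b in (0, 1, 0x7F, 0x80, 0xCD, 0xFF):
            correction = 0
            if b & 0x80:                      # b is signed and its MSB is set
                not_term = (~a & 0xFF) << 8   # the "nt" (1's complement) term
                plus_one = 1 << 8             # the "nl" (+1 in the LSB) term
                correction = not_term + plus_one
            assert (a * b + correction) & 0xFFFF == \
                   (a * to_signed8(b)) & 0xFFFF
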
786
787 class Parts(Elaboratable):
788
789 def __init__(self, pbwid, part_pts, n_parts):
790 self.pbwid = pbwid
791 # inputs
792 self.part_pts = PartitionPoints.like(part_pts)
793 # outputs
794 self.parts = [Signal(name=f"part_{i}", reset_less=True)
795 for i in range(n_parts)]
796
797 def elaborate(self, platform):
798 m = Module()
799
800 part_pts, parts = self.part_pts, self.parts
801 # collect part-bytes (double factor because the input is extended)
802 pbs = Signal(self.pbwid, reset_less=True)
803 tl = []
804 for i in range(self.pbwid):
805 pb = Signal(name="pb%d" % i, reset_less=True)
806 m.d.comb += pb.eq(part_pts.part_byte(i))
807 tl.append(pb)
808 m.d.comb += pbs.eq(Cat(*tl))
809
810 # negated-temporary copy of partition bits
811 npbs = Signal.like(pbs, reset_less=True)
812 m.d.comb += npbs.eq(~pbs)
813 byte_count = 8 // len(parts)
814 for i in range(len(parts)):
815 pbl = []
816 pbl.append(npbs[i * byte_count - 1])
817 for j in range(i * byte_count, (i + 1) * byte_count - 1):
818 pbl.append(pbs[j])
819 pbl.append(npbs[(i + 1) * byte_count - 1])
820 value = Signal(len(pbl), name="value_%d" % i, reset_less=True)
821 m.d.comb += value.eq(Cat(*pbl))
822 m.d.comb += parts[i].eq(~(value).bool())
823
824 return m
825
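
# Illustrative sketch only: what the Parts module computes, shown here for
# the 16-bit (n_parts=4) case with a plain-python model.  A lane's "part"
# bit is set exactly when its lower and upper byte boundaries are partition
# points and its interior byte boundary is not.  ``_demo_parts16`` is a
# hypothetical helper, not used by the hardware.
def _demo_parts16():
    def parts16(boundaries):
        # boundaries: byte indices k (1..7) with a break at bit 8*k; byte 8
        # (bit 64) is an implicit, always-present boundary, which is also
        # what the wrap-around pbs[-1] index above relies on
        pbs = [1 if (i + 1) in boundaries else 0 for i in range(7)] + [1]
        out = []
        for lane in range(4):
            lower = pbs[2 * lane - 1]
            inner = pbs[2 * lane]
            upper = pbs[2 * lane + 1]
            out.append(1 if (lower and not inner and upper) else 0)
        return out

    assert parts16({2, 4, 6}) == [1, 1, 1, 1]               # 4 x 16-bit
    assert parts16({1, 2, 3, 4, 5, 6, 7}) == [0, 0, 0, 0]   # 8 x 8-bit
    assert parts16({4}) == [0, 0, 0, 0]                     # 2 x 32-bit
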
826
827 class Part(Elaboratable):
828 """ a key class which, depending on the partitioning, will determine
829 what action to take when parts of the output are signed or unsigned.
830
831 this requires 2 pieces of data *per operand, per partition*:
832 whether the MSB is HI/LO (per partition!), and whether a signed
833 or unsigned operation has been *requested*.
834
835 once that is determined, signed is basically carried out
836 by splitting 2's complement into 1's complement plus one.
837 1's complement is just a bit-inversion.
838
839 the extra terms - as separate terms - are then thrown at the
840 AddReduce alongside the multiplication part-results.
841 """
842 def __init__(self, part_pts, width, n_parts, n_levels, pbwid):
843
844 self.pbwid = pbwid
845 self.part_pts = part_pts
846
847 # inputs
848 self.a = Signal(64, reset_less=True)
849 self.b = Signal(64, reset_less=True)
850 self.a_signed = [Signal(name=f"a_signed_{i}", reset_less=True)
851 for i in range(8)]
852         self.b_signed = [Signal(name=f"b_signed_{i}", reset_less=True)
853 for i in range(8)]
854 self.pbs = Signal(pbwid, reset_less=True)
855
856 # outputs
857 self.parts = [Signal(name=f"part_{i}", reset_less=True)
858 for i in range(n_parts)]
859
860 self.not_a_term = Signal(width, reset_less=True)
861 self.neg_lsb_a_term = Signal(width, reset_less=True)
862 self.not_b_term = Signal(width, reset_less=True)
863 self.neg_lsb_b_term = Signal(width, reset_less=True)
864
865 def elaborate(self, platform):
866 m = Module()
867
868 pbs, parts = self.pbs, self.parts
869 part_pts = self.part_pts
870 m.submodules.p = p = Parts(self.pbwid, part_pts, len(parts))
871 m.d.comb += p.part_pts.eq(part_pts)
872 parts = p.parts
873
874 byte_count = 8 // len(parts)
875
876 not_a_term, neg_lsb_a_term, not_b_term, neg_lsb_b_term = (
877 self.not_a_term, self.neg_lsb_a_term,
878 self.not_b_term, self.neg_lsb_b_term)
879
880 byte_width = 8 // len(parts) # byte width
881 bit_wid = 8 * byte_width # bit width
882 nat, nbt, nla, nlb = [], [], [], []
883 for i in range(len(parts)):
884 # work out bit-inverted and +1 term for a.
885 pa = LSBNegTerm(bit_wid)
886 setattr(m.submodules, "lnt_%d_a_%d" % (bit_wid, i), pa)
887 m.d.comb += pa.part.eq(parts[i])
888 m.d.comb += pa.op.eq(self.a.part(bit_wid * i, bit_wid))
889 m.d.comb += pa.signed.eq(self.b_signed[i * byte_width]) # yes b
890 m.d.comb += pa.msb.eq(self.b[(i + 1) * bit_wid - 1]) # really, b
891 nat.append(pa.nt)
892 nla.append(pa.nl)
893
894 # work out bit-inverted and +1 term for b
895 pb = LSBNegTerm(bit_wid)
896 setattr(m.submodules, "lnt_%d_b_%d" % (bit_wid, i), pb)
897 m.d.comb += pb.part.eq(parts[i])
898 m.d.comb += pb.op.eq(self.b.part(bit_wid * i, bit_wid))
899 m.d.comb += pb.signed.eq(self.a_signed[i * byte_width]) # yes a
900 m.d.comb += pb.msb.eq(self.a[(i + 1) * bit_wid - 1]) # really, a
901 nbt.append(pb.nt)
902 nlb.append(pb.nl)
903
904 # concatenate together and return all 4 results.
905 m.d.comb += [not_a_term.eq(Cat(*nat)),
906 not_b_term.eq(Cat(*nbt)),
907 neg_lsb_a_term.eq(Cat(*nla)),
908 neg_lsb_b_term.eq(Cat(*nlb)),
909 ]
910
911 return m
912
913
914 class IntermediateOut(Elaboratable):
915 """ selects the HI/LO part of the multiplication, for a given bit-width
916 the output is also reconstructed in its SIMD (partition) lanes.
917 """
918 def __init__(self, width, out_wid, n_parts):
919 self.width = width
920 self.n_parts = n_parts
921 self.part_ops = [Signal(2, name="dpop%d" % i, reset_less=True)
922 for i in range(8)]
923 self.intermed = Signal(out_wid, reset_less=True)
924 self.output = Signal(out_wid//2, reset_less=True)
925
926 def elaborate(self, platform):
927 m = Module()
928
929 ol = []
930 w = self.width
931 sel = w // 8
932 for i in range(self.n_parts):
933 op = Signal(w, reset_less=True, name="op%d_%d" % (w, i))
934 m.d.comb += op.eq(
935 Mux(self.part_ops[sel * i] == OP_MUL_LOW,
936 self.intermed.part(i * w*2, w),
937 self.intermed.part(i * w*2 + w, w)))
938 ol.append(op)
939 m.d.comb += self.output.eq(Cat(*ol))
940
941 return m
942
943
944 class FinalOut(Elaboratable):
945 """ selects the final output based on the partitioning.
946
947 each byte is selectable independently, i.e. it is possible
948 that some partitions requested 8-bit computation whilst others
949 requested 16 or 32 bit.
950 """
951 def __init__(self, output_width, n_parts, part_pts):
952 self.part_pts = part_pts
953 self.i = IntermediateData(part_pts, output_width, n_parts)
954 self.out_wid = output_width//2
955 # output
956 self.out = Signal(self.out_wid, reset_less=True)
957 self.intermediate_output = Signal(output_width, reset_less=True)
958
959 def elaborate(self, platform):
960 m = Module()
961
962 part_pts = self.part_pts
963 m.submodules.p_8 = p_8 = Parts(8, part_pts, 8)
964 m.submodules.p_16 = p_16 = Parts(8, part_pts, 4)
965 m.submodules.p_32 = p_32 = Parts(8, part_pts, 2)
966 m.submodules.p_64 = p_64 = Parts(8, part_pts, 1)
967
968 out_part_pts = self.i.part_pts
969
970 # temporaries
971 d8 = [Signal(name=f"d8_{i}", reset_less=True) for i in range(8)]
972 d16 = [Signal(name=f"d16_{i}", reset_less=True) for i in range(4)]
973 d32 = [Signal(name=f"d32_{i}", reset_less=True) for i in range(2)]
974
975 i8 = Signal(self.out_wid, reset_less=True)
976 i16 = Signal(self.out_wid, reset_less=True)
977 i32 = Signal(self.out_wid, reset_less=True)
978 i64 = Signal(self.out_wid, reset_less=True)
979
980 m.d.comb += p_8.part_pts.eq(out_part_pts)
981 m.d.comb += p_16.part_pts.eq(out_part_pts)
982 m.d.comb += p_32.part_pts.eq(out_part_pts)
983 m.d.comb += p_64.part_pts.eq(out_part_pts)
984
985 for i in range(len(p_8.parts)):
986 m.d.comb += d8[i].eq(p_8.parts[i])
987 for i in range(len(p_16.parts)):
988 m.d.comb += d16[i].eq(p_16.parts[i])
989 for i in range(len(p_32.parts)):
990 m.d.comb += d32[i].eq(p_32.parts[i])
991 m.d.comb += i8.eq(self.i.outputs[0])
992 m.d.comb += i16.eq(self.i.outputs[1])
993 m.d.comb += i32.eq(self.i.outputs[2])
994 m.d.comb += i64.eq(self.i.outputs[3])
995
996 ol = []
997 for i in range(8):
998 # select one of the outputs: d8 selects i8, d16 selects i16
999 # d32 selects i32, and the default is i64.
1000 # d8 and d16 are ORed together in the first Mux
1001 # then the 2nd selects either i8 or i16.
1002 # if neither d8 nor d16 are set, d32 selects either i32 or i64.
1003 op = Signal(8, reset_less=True, name="op_%d" % i)
1004 m.d.comb += op.eq(
1005 Mux(d8[i] | d16[i // 2],
1006 Mux(d8[i], i8.part(i * 8, 8), i16.part(i * 8, 8)),
1007 Mux(d32[i // 4], i32.part(i * 8, 8), i64.part(i * 8, 8))))
1008 ol.append(op)
1009 m.d.comb += self.out.eq(Cat(*ol))
1010 m.d.comb += self.intermediate_output.eq(self.i.intermediate_output)
1011 return m
1012
1013
1014 class OrMod(Elaboratable):
1015 """ ORs four values together in a hierarchical tree
1016 """
1017 def __init__(self, wid):
1018 self.wid = wid
1019 self.orin = [Signal(wid, name="orin%d" % i, reset_less=True)
1020 for i in range(4)]
1021 self.orout = Signal(wid, reset_less=True)
1022
1023 def elaborate(self, platform):
1024 m = Module()
1025 or1 = Signal(self.wid, reset_less=True)
1026 or2 = Signal(self.wid, reset_less=True)
1027 m.d.comb += or1.eq(self.orin[0] | self.orin[1])
1028 m.d.comb += or2.eq(self.orin[2] | self.orin[3])
1029 m.d.comb += self.orout.eq(or1 | or2)
1030
1031 return m
1032
1033
1034 class Signs(Elaboratable):
1035 """ determines whether a or b are signed numbers
1036 based on the required operation type (OP_MUL_*)
1037 """
1038
1039 def __init__(self):
1040 self.part_ops = Signal(2, reset_less=True)
1041 self.a_signed = Signal(reset_less=True)
1042 self.b_signed = Signal(reset_less=True)
1043
1044 def elaborate(self, platform):
1045
1046 m = Module()
1047
1048 asig = self.part_ops != OP_MUL_UNSIGNED_HIGH
1049 bsig = (self.part_ops == OP_MUL_LOW) \
1050 | (self.part_ops == OP_MUL_SIGNED_HIGH)
1051 m.d.comb += self.a_signed.eq(asig)
1052 m.d.comb += self.b_signed.eq(bsig)
1053
1054 return m
1055
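
# Illustrative sketch only: the op-code to operand-signedness mapping that
# Signs implements, spelled out as a table.  ``_demo_signs_table`` is a
# hypothetical helper for illustration.
def _demo_signs_table():
    expected = {
        OP_MUL_LOW:                  (True,  True),   # mul:    both signed
        OP_MUL_SIGNED_HIGH:          (True,  True),   # mulh:   both signed
        OP_MUL_SIGNED_UNSIGNED_HIGH: (True,  False),  # mulhsu: only a signed
        OP_MUL_UNSIGNED_HIGH:        (False, False),  # mulhu:  both unsigned
    }
    for op, (a_signed, b_signed) in expected.items():
        assert (op != OP_MUL_UNSIGNED_HIGH) == a_signed
        assert (op == OP_MUL_LOW or op == OP_MUL_SIGNED_HIGH) == b_signed
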
1056
1057 class IntermediateData:
1058
1059 def __init__(self, part_pts, output_width, n_parts):
1060 self.part_ops = [Signal(2, name=f"part_ops_{i}", reset_less=True)
1061 for i in range(n_parts)]
1062 self.part_pts = part_pts.like()
1063 self.outputs = [Signal(output_width, name="io%d" % i, reset_less=True)
1064 for i in range(4)]
1065 # intermediates (needed for unit tests)
1066 self.intermediate_output = Signal(output_width)
1067
1068 def eq_from(self, part_pts, outputs, intermediate_output,
1069 part_ops):
1070 return [self.part_pts.eq(part_pts)] + \
1071 [self.intermediate_output.eq(intermediate_output)] + \
1072 [self.outputs[i].eq(outputs[i])
1073 for i in range(4)] + \
1074 [self.part_ops[i].eq(part_ops[i])
1075 for i in range(len(self.part_ops))]
1076
1077 def eq(self, rhs):
1078 return self.eq_from(rhs.part_pts, rhs.outputs,
1079 rhs.intermediate_output, rhs.part_ops)
1080
1081
1082 class AllTermsData:
1083
1084 def __init__(self, partition_points):
1085 self.a = Signal(64)
1086 self.b = Signal(64)
1087 self.part_pts = partition_points.like()
1088 self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1089
1090     def eq_from(self, part_pts, a, b, part_ops):
1091 return [self.part_pts.eq(part_pts)] + \
1092 [self.a.eq(a), self.b.eq(b)] + \
1093 [self.part_ops[i].eq(part_ops[i])
1094 for i in range(len(self.part_ops))]
1095
1096 def eq(self, rhs):
1097 return self.eq_from(rhs.part_pts, rhs.a, rhs.b, rhs.part_ops)
1098
1099
1100 class AllTerms(Elaboratable):
1101 """Set of terms to be added together
1102 """
1103
1104 def __init__(self, n_inputs, output_width, n_parts, register_levels,
1105 partition_points):
1106         """Create an ``AllTerms``.
1107 
1108         :param n_inputs: number of terms to be generated for the summation.
1109         :param output_width: bit-width of ``output``.
1110         :param register_levels: List of nesting levels that should have
1111             pipeline registers.
1112         :param partition_points: the input partition points.
1113 """
1114 self.i = AllTermsData(partition_points)
1115 self.register_levels = register_levels
1116 self.n_inputs = n_inputs
1117 self.n_parts = n_parts
1118 self.output_width = output_width
1119 self.o = AddReduceData(self.i.part_pts, n_inputs,
1120 output_width, n_parts)
1121
1122 def elaborate(self, platform):
1123 m = Module()
1124
1125 eps = self.i.part_pts
1126
1127 # collect part-bytes
1128 pbs = Signal(8, reset_less=True)
1129 tl = []
1130 for i in range(8):
1131 pb = Signal(name="pb%d" % i, reset_less=True)
1132 m.d.comb += pb.eq(eps.part_byte(i))
1133 tl.append(pb)
1134 m.d.comb += pbs.eq(Cat(*tl))
1135
1136 # local variables
1137 signs = []
1138 for i in range(8):
1139 s = Signs()
1140 signs.append(s)
1141 setattr(m.submodules, "signs%d" % i, s)
1142 m.d.comb += s.part_ops.eq(self.i.part_ops[i])
1143
1144 n_levels = len(self.register_levels)+1
1145 m.submodules.part_8 = part_8 = Part(eps, 128, 8, n_levels, 8)
1146 m.submodules.part_16 = part_16 = Part(eps, 128, 4, n_levels, 8)
1147 m.submodules.part_32 = part_32 = Part(eps, 128, 2, n_levels, 8)
1148 m.submodules.part_64 = part_64 = Part(eps, 128, 1, n_levels, 8)
1149 nat_l, nbt_l, nla_l, nlb_l = [], [], [], []
1150 for mod in [part_8, part_16, part_32, part_64]:
1151 m.d.comb += mod.a.eq(self.i.a)
1152 m.d.comb += mod.b.eq(self.i.b)
1153 for i in range(len(signs)):
1154 m.d.comb += mod.a_signed[i].eq(signs[i].a_signed)
1155 m.d.comb += mod.b_signed[i].eq(signs[i].b_signed)
1156 m.d.comb += mod.pbs.eq(pbs)
1157 nat_l.append(mod.not_a_term)
1158 nbt_l.append(mod.not_b_term)
1159 nla_l.append(mod.neg_lsb_a_term)
1160 nlb_l.append(mod.neg_lsb_b_term)
1161
1162 terms = []
1163
1164 for a_index in range(8):
1165 t = ProductTerms(8, 128, 8, a_index, 8)
1166 setattr(m.submodules, "terms_%d" % a_index, t)
1167
1168 m.d.comb += t.a.eq(self.i.a)
1169 m.d.comb += t.b.eq(self.i.b)
1170 m.d.comb += t.pb_en.eq(pbs)
1171
1172 for term in t.terms:
1173 terms.append(term)
1174
1175 # it's fine to bitwise-or data together since they are never enabled
1176 # at the same time
1177 m.submodules.nat_or = nat_or = OrMod(128)
1178 m.submodules.nbt_or = nbt_or = OrMod(128)
1179 m.submodules.nla_or = nla_or = OrMod(128)
1180 m.submodules.nlb_or = nlb_or = OrMod(128)
1181 for l, mod in [(nat_l, nat_or),
1182 (nbt_l, nbt_or),
1183 (nla_l, nla_or),
1184 (nlb_l, nlb_or)]:
1185 for i in range(len(l)):
1186 m.d.comb += mod.orin[i].eq(l[i])
1187 terms.append(mod.orout)
1188
1189 # copy the intermediate terms to the output
1190 for i, value in enumerate(terms):
1191 m.d.comb += self.o.terms[i].eq(value)
1192
1193 # copy reg part points and part ops to output
1194 m.d.comb += self.o.part_pts.eq(eps)
1195 m.d.comb += [self.o.part_ops[i].eq(self.i.part_ops[i])
1196 for i in range(len(self.i.part_ops))]
1197
1198 return m
1199
1200
1201 class Intermediates(Elaboratable):
1202 """ Intermediate output modules
1203 """
1204
1205 def __init__(self, output_width, n_parts, partition_points):
1206 self.i = FinalReduceData(partition_points, output_width, n_parts)
1207 self.o = IntermediateData(partition_points, output_width, n_parts)
1208
1209 def elaborate(self, platform):
1210 m = Module()
1211
1212 out_part_ops = self.i.part_ops
1213 out_part_pts = self.i.part_pts
1214
1215 # create _output_64
1216 m.submodules.io64 = io64 = IntermediateOut(64, 128, 1)
1217 m.d.comb += io64.intermed.eq(self.i.output)
1218 for i in range(8):
1219 m.d.comb += io64.part_ops[i].eq(out_part_ops[i])
1220 m.d.comb += self.o.outputs[3].eq(io64.output)
1221
1222 # create _output_32
1223 m.submodules.io32 = io32 = IntermediateOut(32, 128, 2)
1224 m.d.comb += io32.intermed.eq(self.i.output)
1225 for i in range(8):
1226 m.d.comb += io32.part_ops[i].eq(out_part_ops[i])
1227 m.d.comb += self.o.outputs[2].eq(io32.output)
1228
1229 # create _output_16
1230 m.submodules.io16 = io16 = IntermediateOut(16, 128, 4)
1231 m.d.comb += io16.intermed.eq(self.i.output)
1232 for i in range(8):
1233 m.d.comb += io16.part_ops[i].eq(out_part_ops[i])
1234 m.d.comb += self.o.outputs[1].eq(io16.output)
1235
1236 # create _output_8
1237 m.submodules.io8 = io8 = IntermediateOut(8, 128, 8)
1238 m.d.comb += io8.intermed.eq(self.i.output)
1239 for i in range(8):
1240 m.d.comb += io8.part_ops[i].eq(out_part_ops[i])
1241 m.d.comb += self.o.outputs[0].eq(io8.output)
1242
1243 for i in range(8):
1244 m.d.comb += self.o.part_ops[i].eq(out_part_ops[i])
1245 m.d.comb += self.o.part_pts.eq(out_part_pts)
1246 m.d.comb += self.o.intermediate_output.eq(self.i.output)
1247
1248 return m
1249
1250
1251 class Mul8_16_32_64(Elaboratable):
1252 """Signed/Unsigned 8/16/32/64-bit partitioned integer multiplier.
1253
1254 Supports partitioning into any combination of 8, 16, 32, and 64-bit
1255 partitions on naturally-aligned boundaries. Supports the operation being
1256 set for each partition independently.
1257
1258 :attribute part_pts: the input partition points. Has a partition point at
1259 multiples of 8 in 0 < i < 64. Each partition point's associated
1260 ``Value`` is a ``Signal``. Modification not supported, except for by
1261 ``Signal.eq``.
1262 :attribute part_ops: the operation for each byte. The operation for a
1263 particular partition is selected by assigning the selected operation
1264 code to each byte in the partition. The allowed operation codes are:
1265
1266 :attribute OP_MUL_LOW: the LSB half of the product. Equivalent to
1267 RISC-V's `mul` instruction.
1268 :attribute OP_MUL_SIGNED_HIGH: the MSB half of the product where both
1269 ``a`` and ``b`` are signed. Equivalent to RISC-V's `mulh`
1270 instruction.
1271 :attribute OP_MUL_SIGNED_UNSIGNED_HIGH: the MSB half of the product
1272 where ``a`` is signed and ``b`` is unsigned. Equivalent to RISC-V's
1273 `mulhsu` instruction.
1274 :attribute OP_MUL_UNSIGNED_HIGH: the MSB half of the product where both
1275 ``a`` and ``b`` are unsigned. Equivalent to RISC-V's `mulhu`
1276 instruction.
1277 """
1278
1279 def __init__(self, register_levels=()):
1280 """ register_levels: specifies the points in the cascade at which
1281 flip-flops are to be inserted.
1282 """
1283
1284 # parameter(s)
1285 self.register_levels = list(register_levels)
1286
1287 # inputs
1288 self.part_pts = PartitionPoints()
1289 for i in range(8, 64, 8):
1290 self.part_pts[i] = Signal(name=f"part_pts_{i}")
1291 self.part_ops = [Signal(2, name=f"part_ops_{i}") for i in range(8)]
1292 self.a = Signal(64)
1293 self.b = Signal(64)
1294
1295 # intermediates (needed for unit tests)
1296 self.intermediate_output = Signal(128)
1297
1298 # output
1299 self.output = Signal(64)
1300
1301 def elaborate(self, platform):
1302 m = Module()
1303
1304 part_pts = self.part_pts
1305
1306         n_inputs = 64 + 4  # 8x8 byte products plus the 4 OR-combined sign terms
1307         n_parts = 8  # one part_op per byte of the 64-bit operands
1308 t = AllTerms(n_inputs, 128, n_parts, self.register_levels, part_pts)
1309 m.submodules.allterms = t
1310 m.d.comb += t.i.a.eq(self.a)
1311 m.d.comb += t.i.b.eq(self.b)
1312 m.d.comb += t.i.part_pts.eq(part_pts)
1313 for i in range(8):
1314 m.d.comb += t.i.part_ops[i].eq(self.part_ops[i])
1315
1316 terms = t.o.terms
1317
1318 add_reduce = AddReduce(terms,
1319 128,
1320 self.register_levels,
1321 t.o.part_pts,
1322 t.o.part_ops)
1323
1324 out_part_ops = add_reduce.o.part_ops
1325 out_part_pts = add_reduce.o.part_pts
1326
1327 m.submodules.add_reduce = add_reduce
1328
1329 interm = Intermediates(128, 8, part_pts)
1330 m.submodules.intermediates = interm
1331 m.d.comb += interm.i.eq(add_reduce.o)
1332
1333 # final output
1334 m.submodules.finalout = finalout = FinalOut(128, 8, part_pts)
1335 m.d.comb += finalout.i.eq(interm.o)
1336 m.d.comb += self.output.eq(finalout.out)
1337 m.d.comb += self.intermediate_output.eq(finalout.intermediate_output)
1338
1339 return m
1340
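
# Illustrative sketch only: a pure-python reference model of what a single
# 16-bit partition of Mul8_16_32_64 is expected to produce for OP_MUL_LOW
# and OP_MUL_SIGNED_HIGH.  ``_demo_partition_reference_model`` is a
# hypothetical helper (conceptually what the unit tests check), not part of
# the hardware itself.
def _demo_partition_reference_model():
    def mul16(a, b, op):
        def s16(x):
            return x - 0x10000 if x & 0x8000 else x
        full = (s16(a) * s16(b)) & 0xFFFFFFFF
        if op == OP_MUL_LOW:
            return full & 0xFFFF           # LSB half, like RISC-V mul
        if op == OP_MUL_SIGNED_HIGH:
            return (full >> 16) & 0xFFFF   # MSB half, like RISC-V mulh
        raise NotImplementedError(op)

    assert mul16(0x0003, 0x0004, OP_MUL_LOW) == 0x000C
    assert mul16(0xFFFF, 0x0002, OP_MUL_LOW) == 0xFFFE          # -1 * 2 = -2
    assert mul16(0xFFFF, 0x0002, OP_MUL_SIGNED_HIGH) == 0xFFFF  # sign extension
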
1341
1342 if __name__ == "__main__":
1343 m = Mul8_16_32_64()
1344 main(m, ports=[m.a,
1345 m.b,
1346 m.intermediate_output,
1347 m.output,
1348 *m.part_ops,
1349 *m.part_pts.values()])